1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * linux/mm/madvise.c
4 *
5 * Copyright (C) 1999 Linus Torvalds
6 * Copyright (C) 2002 Christoph Hellwig
7 */
8
9 #include <linux/mman.h>
10 #include <linux/pagemap.h>
11 #include <linux/syscalls.h>
12 #include <linux/mempolicy.h>
13 #include <linux/page_size_compat.h>
14 #include <linux/page-isolation.h>
15 #include <linux/pgsize_migration.h>
16 #include <linux/page_idle.h>
17 #include <linux/userfaultfd_k.h>
18 #include <linux/hugetlb.h>
19 #include <linux/falloc.h>
20 #include <linux/fadvise.h>
21 #include <linux/sched.h>
22 #include <linux/sched/mm.h>
23 #include <linux/mm_inline.h>
24 #include <linux/string.h>
25 #include <linux/uio.h>
26 #include <linux/ksm.h>
27 #include <linux/fs.h>
28 #include <linux/file.h>
29 #include <linux/blkdev.h>
30 #include <linux/backing-dev.h>
31 #include <linux/pagewalk.h>
32 #include <linux/swap.h>
33 #include <linux/swapops.h>
34 #include <linux/shmem_fs.h>
35 #include <linux/mmu_notifier.h>
36 #include <trace/hooks/mm.h>
37 #include <trace/hooks/madvise.h>
38
39 #include <asm/tlb.h>
40
41 #include "internal.h"
42 #include "swap.h"
43
44 /*
45 * Maximum number of attempts we make to install guard pages before we give up
46 * and return -ERESTARTNOINTR to have userspace try again.
47 */
48 #define MAX_MADVISE_GUARD_RETRIES 3
49
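/*
 * Per-walk state for the MADV_COLD/MADV_PAGEOUT page table walks: @tlb
 * batches TLB flushes for the walk, @pageout selects reclaim
 * (MADV_PAGEOUT) rather than deactivation (MADV_COLD), and @private
 * carries opaque data for the Android vendor pageout hooks.
 */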
50 struct madvise_walk_private {
51 struct mmu_gather *tlb;
52 bool pageout;
53 void *private;
54 };
55
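/*
 * Which lock (if any) madvise_lock() takes for a given behavior: none,
 * mmap_lock for read or write, or a per-VMA read lock acquired lazily
 * in madvise_walk_vmas().
 */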
56 enum madvise_lock_mode {
57 MADVISE_NO_LOCK,
58 MADVISE_MMAP_READ_LOCK,
59 MADVISE_MMAP_WRITE_LOCK,
60 MADVISE_VMA_READ_LOCK,
61 };
62
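/*
 * State for the madvise operation in progress: the requested behavior,
 * optional TLB gather state, and the lock mode chosen by madvise_lock().
 */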
63 struct madvise_behavior {
64 int behavior;
65 struct mmu_gather *tlb;
66 enum madvise_lock_mode lock_mode;
67 };
68
69 #ifdef CONFIG_ANON_VMA_NAME
70 struct anon_vma_name *anon_vma_name_alloc(const char *name)
71 {
72 struct anon_vma_name *anon_name;
73 size_t count;
74
75 /* Add 1 for NUL terminator at the end of the anon_name->name */
76 count = strlen(name) + 1;
77 anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
78 if (anon_name) {
79 kref_init(&anon_name->kref);
80 memcpy(anon_name->name, name, count);
81 }
82
83 return anon_name;
84 }
85
86 void anon_vma_name_free(struct kref *kref)
87 {
88 struct anon_vma_name *anon_name =
89 container_of(kref, struct anon_vma_name, kref);
90 kfree(anon_name);
91 }
92
93 struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
94 {
95 if (!rwsem_is_locked(&vma->vm_mm->mmap_lock))
96 vma_assert_locked(vma);
97
98 return vma->anon_name;
99 }
100 EXPORT_SYMBOL_GPL(anon_vma_name);
101
102 /* mmap_lock should be write-locked */
103 static int replace_anon_vma_name(struct vm_area_struct *vma,
104 struct anon_vma_name *anon_name)
105 {
106 struct anon_vma_name *orig_name = anon_vma_name(vma);
107
108 if (!anon_name) {
109 vma->anon_name = NULL;
110 anon_vma_name_put(orig_name);
111 return 0;
112 }
113
114 if (anon_vma_name_eq(orig_name, anon_name))
115 return 0;
116
117 vma->anon_name = anon_vma_name_reuse(anon_name);
118 anon_vma_name_put(orig_name);
119
120 return 0;
121 }
122 #else /* CONFIG_ANON_VMA_NAME */
123 static int replace_anon_vma_name(struct vm_area_struct *vma,
124 struct anon_vma_name *anon_name)
125 {
126 if (anon_name)
127 return -EINVAL;
128
129 return 0;
130 }
131 #endif /* CONFIG_ANON_VMA_NAME */
132 /*
133 * Update the vm_flags on a region of a vma, splitting it or merging it as
134 * necessary. Must be called with mmap_lock held for writing.
135 * Caller should ensure anon_name stability by raising its refcount even when
136 * anon_name belongs to a valid vma because this function might free that vma.
137 */
138 static int madvise_update_vma(struct vm_area_struct *vma,
139 struct vm_area_struct **prev, unsigned long start,
140 unsigned long end, unsigned long new_flags,
141 struct anon_vma_name *anon_name)
142 {
143 struct mm_struct *mm = vma->vm_mm;
144 int error;
145 VMA_ITERATOR(vmi, mm, start);
146
147 if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
148 *prev = vma;
149 return 0;
150 }
151
152 vma = vma_modify_flags_name(&vmi, *prev, vma, start, end, new_flags,
153 anon_name);
154 if (IS_ERR(vma))
155 return PTR_ERR(vma);
156
157 *prev = vma;
158
159 /* vm_flags is protected by the mmap_lock held in write mode. */
160 vma_start_write(vma);
161 vm_flags_reset(vma, new_flags);
162 if (!vma->vm_file || vma_is_anon_shmem(vma)) {
163 error = replace_anon_vma_name(vma, anon_name);
164 if (error)
165 return error;
166 }
167
168 return 0;
169 }
170
171 #ifdef CONFIG_SWAP
172 static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
173 unsigned long end, struct mm_walk *walk)
174 {
175 struct vm_area_struct *vma = walk->private;
176 struct swap_iocb *splug = NULL;
177 pte_t *ptep = NULL;
178 spinlock_t *ptl;
179 unsigned long addr;
180
181 for (addr = start; addr < end; addr += PAGE_SIZE) {
182 pte_t pte;
183 swp_entry_t entry;
184 struct folio *folio;
185
186 if (!ptep++) {
187 ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
188 if (!ptep)
189 break;
190 }
191
192 pte = ptep_get(ptep);
193 if (!is_swap_pte(pte))
194 continue;
195 entry = pte_to_swp_entry(pte);
196 if (unlikely(non_swap_entry(entry)))
197 continue;
198
199 pte_unmap_unlock(ptep, ptl);
200 ptep = NULL;
201 trace_android_vh_madvise_swapin_walk_pmd_entry(entry);
202
203 folio = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
204 vma, addr, &splug);
205 if (folio)
206 folio_put(folio);
207 }
208
209 if (ptep)
210 pte_unmap_unlock(ptep, ptl);
211 swap_read_unplug(splug);
212 cond_resched();
213
214 return 0;
215 }
216
217 static const struct mm_walk_ops swapin_walk_ops = {
218 .pmd_entry = swapin_walk_pmd_entry,
219 .walk_lock = PGWALK_RDLOCK,
220 };
221
222 static void shmem_swapin_range(struct vm_area_struct *vma,
223 unsigned long start, unsigned long end,
224 struct address_space *mapping)
225 {
226 XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
227 pgoff_t end_index = linear_page_index(vma, end) - 1;
228 struct folio *folio;
229 struct swap_iocb *splug = NULL;
230
231 rcu_read_lock();
232 xas_for_each(&xas, folio, end_index) {
233 unsigned long addr;
234 swp_entry_t entry;
235
236 if (!xa_is_value(folio))
237 continue;
238 entry = radix_to_swp_entry(folio);
239 /* There might be swapin error entries in shmem mapping. */
240 if (non_swap_entry(entry))
241 continue;
242
243 addr = vma->vm_start +
244 ((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT);
245 xas_pause(&xas);
246 rcu_read_unlock();
247
248 folio = read_swap_cache_async(entry, mapping_gfp_mask(mapping),
249 vma, addr, &splug);
250 if (folio)
251 folio_put(folio);
252
253 rcu_read_lock();
254 }
255 rcu_read_unlock();
256 swap_read_unplug(splug);
257 }
258 #endif /* CONFIG_SWAP */
259
260 /*
261 * Schedule all required I/O operations. Do not wait for completion.
262 */
263 static long madvise_willneed(struct vm_area_struct *vma,
264 struct vm_area_struct **prev,
265 unsigned long start, unsigned long end)
266 {
267 struct mm_struct *mm = vma->vm_mm;
268 struct file *file = vma->vm_file;
269 loff_t offset;
270
271 *prev = vma;
272 #ifdef CONFIG_SWAP
273 if (!file) {
274 walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
275 lru_add_drain(); /* Push any new pages onto the LRU now */
276 return 0;
277 }
278
279 if (shmem_mapping(file->f_mapping)) {
280 shmem_swapin_range(vma, start, end, file->f_mapping);
281 lru_add_drain(); /* Push any new pages onto the LRU now */
282 return 0;
283 }
284 #else
285 if (!file)
286 return -EBADF;
287 #endif
288
289 if (IS_DAX(file_inode(file))) {
290 /* no bad return value, but ignore advice */
291 return 0;
292 }
293
294 /*
295 * Filesystem's fadvise may need to take various locks. We need to
296 * explicitly grab a reference because the vma (and hence the
297 * vma's reference to the file) can go away as soon as we drop
298 * mmap_lock.
299 */
300 *prev = NULL; /* tell sys_madvise we drop mmap_lock */
301 get_file(file);
302 offset = (loff_t)(start - vma->vm_start)
303 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
304 mmap_read_unlock(mm);
305 vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
306 fput(file);
307 mmap_read_lock(mm);
308 return 0;
309 }
310
311 static inline bool can_do_file_pageout(struct vm_area_struct *vma)
312 {
313 if (!vma->vm_file)
314 return false;
315 /*
316 * paging out pagecache only for non-anonymous mappings that correspond
317 * to the files the calling process could (if it tried) open for writing;
318 * otherwise we'd be including shared non-exclusive mappings, which
319 * opens a side channel.
320 */
321 return inode_owner_or_capable(&nop_mnt_idmap,
322 file_inode(vma->vm_file)) ||
323 file_permission(vma->vm_file, MAY_WRITE) == 0;
324 }
325
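/*
 * Determine how many PTEs, starting at @ptep/@addr and bounded by @end,
 * map consecutive pages of @folio, so that a fully mapped large folio can
 * be processed as one batch. Optionally reports whether any PTE in the
 * batch was young or dirty.
 */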
326 static inline int madvise_folio_pte_batch(unsigned long addr, unsigned long end,
327 struct folio *folio, pte_t *ptep,
328 pte_t pte, bool *any_young,
329 bool *any_dirty)
330 {
331 const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
332 int max_nr = (end - addr) / PAGE_SIZE;
333
334 return folio_pte_batch(folio, addr, ptep, pte, max_nr, fpb_flags, NULL,
335 any_young, any_dirty);
336 }
337
338 static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
339 unsigned long addr, unsigned long end,
340 struct mm_walk *walk)
341 {
342 struct madvise_walk_private *private = walk->private;
343 struct mmu_gather *tlb = private->tlb;
344 bool pageout = private->pageout;
345 struct mm_struct *mm = tlb->mm;
346 struct vm_area_struct *vma = walk->vma;
347 pte_t *start_pte, *pte, ptent;
348 spinlock_t *ptl;
349 struct folio *folio = NULL;
350 LIST_HEAD(folio_list);
351 bool pageout_anon_only_filter;
352 unsigned int batch_count = 0;
353 bool abort_madvise = false;
354 int nr;
355 int ret = 0;
356
357 trace_android_vh_madvise_cold_or_pageout_abort(vma, &abort_madvise);
358 if (fatal_signal_pending(current) || abort_madvise)
359 return -EINTR;
360
361 trace_android_vh_madvise_pageout_bypass(mm, pageout, &ret);
362 if (ret)
363 return ret;
364
365 pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) &&
366 !can_do_file_pageout(vma);
367
368 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
369 if (pmd_trans_huge(*pmd)) {
370 pmd_t orig_pmd;
371 unsigned long next = pmd_addr_end(addr, end);
372
373 tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
374 ptl = pmd_trans_huge_lock(pmd, vma);
375 if (!ptl)
376 return 0;
377
378 orig_pmd = *pmd;
379 if (is_huge_zero_pmd(orig_pmd))
380 goto huge_unlock;
381
382 if (unlikely(!pmd_present(orig_pmd))) {
383 VM_BUG_ON(thp_migration_supported() &&
384 !is_pmd_migration_entry(orig_pmd));
385 goto huge_unlock;
386 }
387
388 folio = pmd_folio(orig_pmd);
389
390 /* Do not interfere with other mappings of this folio */
391 if (folio_likely_mapped_shared(folio))
392 goto huge_unlock;
393
394 if (pageout_anon_only_filter && !folio_test_anon(folio))
395 goto huge_unlock;
396
397 if (next - addr != HPAGE_PMD_SIZE) {
398 int err;
399
400 folio_get(folio);
401 spin_unlock(ptl);
402 folio_lock(folio);
403 err = split_folio(folio);
404 folio_unlock(folio);
405 folio_put(folio);
406 if (!err)
407 goto regular_folio;
408 return 0;
409 }
410
411 if (!pageout && pmd_young(orig_pmd)) {
412 pmdp_invalidate(vma, addr, pmd);
413 orig_pmd = pmd_mkold(orig_pmd);
414
415 set_pmd_at(mm, addr, pmd, orig_pmd);
416 tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
417 }
418
419 folio_clear_referenced(folio);
420 folio_test_clear_young(folio);
421 if (folio_test_active(folio))
422 folio_set_workingset(folio);
423 if (pageout) {
424 if (folio_isolate_lru(folio)) {
425 if (folio_test_unevictable(folio))
426 folio_putback_lru(folio);
427 else
428 list_add(&folio->lru, &folio_list);
429 }
430 } else
431 folio_deactivate(folio);
432 huge_unlock:
433 spin_unlock(ptl);
434 if (pageout)
435 __reclaim_pages(&folio_list, private->private);
436 return 0;
437 }
438
439 regular_folio:
440 #endif
441 tlb_change_page_size(tlb, PAGE_SIZE);
442 restart:
443 start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
444 if (!start_pte)
445 return 0;
446 flush_tlb_batched_pending(mm);
447 arch_enter_lazy_mmu_mode();
448 for (; addr < end; pte += nr, addr += nr * PAGE_SIZE) {
449 bool need_skip = false;
450 nr = 1;
451 ptent = ptep_get(pte);
452
453 if (++batch_count == SWAP_CLUSTER_MAX) {
454 batch_count = 0;
455 if (need_resched()) {
456 arch_leave_lazy_mmu_mode();
457 pte_unmap_unlock(start_pte, ptl);
458 cond_resched();
459 goto restart;
460 }
461 }
462
463 if (pte_none(ptent))
464 continue;
465
466 if (!pte_present(ptent))
467 continue;
468
469 folio = vm_normal_folio(vma, addr, ptent);
470 if (!folio || folio_is_zone_device(folio))
471 continue;
472
473 trace_android_vh_madvise_cold_pageout_skip(vma, folio, pageout,
474 &need_skip);
475
476 if (need_skip)
477 continue;
478
479 /*
480 * If we encounter a large folio, only split it if it is not
481 * fully mapped within the range we are operating on. Otherwise
482 * leave it as is so that it can be swapped out whole. If we
483 * fail to split a folio, leave it in place and advance to the
484 * next pte in the range.
485 */
486 if (folio_test_large(folio)) {
487 bool any_young;
488
489 nr = madvise_folio_pte_batch(addr, end, folio, pte,
490 ptent, &any_young, NULL);
491
492 if (any_young)
493 ptent = pte_mkyoung(ptent);
494
495 if (nr < folio_nr_pages(folio)) {
496 int err;
497 bool bypass = false;
498
499 trace_android_vh_split_large_folio_bypass(&bypass);
500 if (bypass)
501 continue;
502 if (folio_likely_mapped_shared(folio))
503 continue;
504 if (pageout_anon_only_filter && !folio_test_anon(folio))
505 continue;
506 if (!folio_trylock(folio))
507 continue;
508 folio_get(folio);
509 arch_leave_lazy_mmu_mode();
510 pte_unmap_unlock(start_pte, ptl);
511 start_pte = NULL;
512 err = split_folio(folio);
513 folio_unlock(folio);
514 folio_put(folio);
515 start_pte = pte =
516 pte_offset_map_lock(mm, pmd, addr, &ptl);
517 if (!start_pte)
518 break;
519 flush_tlb_batched_pending(mm);
520 arch_enter_lazy_mmu_mode();
521 if (!err)
522 nr = 0;
523 continue;
524 }
525 }
526
527 /*
528 * Do not interfere with other mappings of this folio, and skip
529 * non-LRU folios. If we have a large folio at this point, we
530 * know it is fully mapped so if its mapcount is the same as its
531 * number of pages, it must be exclusive.
532 */
533 if (!folio_test_lru(folio) ||
534 folio_mapcount(folio) != folio_nr_pages(folio))
535 continue;
536
537 if (pageout_anon_only_filter && !folio_test_anon(folio))
538 continue;
539
540 if (!pageout && pte_young(ptent)) {
541 clear_young_dirty_ptes(vma, addr, pte, nr,
542 CYDP_CLEAR_YOUNG);
543 tlb_remove_tlb_entries(tlb, pte, nr, addr);
544 }
545
546 /*
547 * We are deactivating a folio to accelerate its reclaim.
548 * The VM cannot reclaim the folio unless we clear PG_young.
549 * As a side effect, this confuses idle-page tracking,
550 * which will miss the folio's recent reference history.
551 */
552 folio_clear_referenced(folio);
553 folio_test_clear_young(folio);
554 if (folio_test_active(folio))
555 folio_set_workingset(folio);
556 if (pageout) {
557 if (folio_isolate_lru(folio)) {
558 if (folio_test_unevictable(folio))
559 folio_putback_lru(folio);
560 else
561 list_add(&folio->lru, &folio_list);
562 }
563 } else
564 folio_deactivate(folio);
565 }
566
567 if (start_pte) {
568 arch_leave_lazy_mmu_mode();
569 pte_unmap_unlock(start_pte, ptl);
570 }
571 if (pageout)
572 __reclaim_pages(&folio_list, private->private);
573 cond_resched();
574
575 return 0;
576 }
577
578 static const struct mm_walk_ops cold_walk_ops = {
579 .pmd_entry = madvise_cold_or_pageout_pte_range,
580 .walk_lock = PGWALK_RDLOCK,
581 };
582
583 static void madvise_cold_page_range(struct mmu_gather *tlb,
584 struct vm_area_struct *vma,
585 unsigned long addr, unsigned long end)
586 {
587 struct madvise_walk_private walk_private = {
588 .pageout = false,
589 .tlb = tlb,
590 };
591
592 tlb_start_vma(tlb, vma);
593 walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
594 tlb_end_vma(tlb, vma);
595 }
596
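/*
 * mlock()ed, PFN-mapped and hugetlb VMAs cannot have their pages
 * deactivated or paged out via MADV_COLD/MADV_PAGEOUT.
 */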
597 static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
598 {
599 return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB));
600 }
601
602 static long madvise_cold(struct vm_area_struct *vma,
603 struct vm_area_struct **prev,
604 unsigned long start_addr, unsigned long end_addr)
605 {
606 struct mm_struct *mm = vma->vm_mm;
607 struct mmu_gather tlb;
608
609 *prev = vma;
610 if (!can_madv_lru_vma(vma))
611 return -EINVAL;
612
613 lru_add_drain();
614 tlb_gather_mmu(&tlb, mm);
615 madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
616 tlb_finish_mmu(&tlb);
617
618 return 0;
619 }
620
621 static int madvise_pageout_page_range(struct mmu_gather *tlb,
622 struct vm_area_struct *vma,
623 unsigned long addr, unsigned long end)
624 {
625 struct madvise_walk_private walk_private = {
626 .pageout = true,
627 .tlb = tlb,
628 };
629 int ret;
630 LIST_HEAD(folio_list);
631
632 trace_android_rvh_madvise_pageout_begin(&walk_private.private);
633
634 tlb_start_vma(tlb, vma);
635 ret = walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
636 tlb_end_vma(tlb, vma);
637
638 trace_android_rvh_madvise_pageout_end(walk_private.private, &folio_list);
639 if (!list_empty(&folio_list))
640 reclaim_pages(&folio_list);
641
642 return ret;
643 }
644
645 static long madvise_pageout(struct vm_area_struct *vma,
646 struct vm_area_struct **prev,
647 unsigned long start_addr, unsigned long end_addr)
648 {
649 struct mm_struct *mm = vma->vm_mm;
650 struct mmu_gather tlb;
651 int ret;
652 bool return_error = false;
653
654 *prev = vma;
655 if (!can_madv_lru_vma(vma))
656 return -EINVAL;
657
658 /*
659 * If the VMA belongs to a private file mapping, there can be private
660 * dirty pages which can be paged out even if this process is neither
661 * the owner of nor write-capable on the file. We additionally allow
662 * private file mappings to page out dirty anon pages.
663 */
664 if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) &&
665 (vma->vm_flags & VM_MAYSHARE)))
666 return 0;
667
668 lru_add_drain();
669 tlb_gather_mmu(&tlb, mm);
670 ret = madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
671 tlb_finish_mmu(&tlb);
672
673 trace_android_vh_madvise_pageout_return_error(ret, &return_error);
674 if (return_error)
675 return (long)ret;
676
677 return 0;
678 }
679
680 static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
681 unsigned long end, struct mm_walk *walk)
682
683 {
684 const cydp_t cydp_flags = CYDP_CLEAR_YOUNG | CYDP_CLEAR_DIRTY;
685 struct mmu_gather *tlb = walk->private;
686 struct mm_struct *mm = tlb->mm;
687 struct vm_area_struct *vma = walk->vma;
688 spinlock_t *ptl;
689 pte_t *start_pte, *pte, ptent;
690 struct folio *folio;
691 int nr_swap = 0;
692 unsigned long next;
693 int nr, max_nr;
694
695 next = pmd_addr_end(addr, end);
696 if (pmd_trans_huge(*pmd))
697 if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
698 return 0;
699
700 tlb_change_page_size(tlb, PAGE_SIZE);
701 start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
702 if (!start_pte)
703 return 0;
704 flush_tlb_batched_pending(mm);
705 arch_enter_lazy_mmu_mode();
706 for (; addr != end; pte += nr, addr += PAGE_SIZE * nr) {
707 nr = 1;
708 ptent = ptep_get(pte);
709
710 if (pte_none(ptent))
711 continue;
712 /*
713 * If the pte holds a swap entry, just clear the page table entry
714 * to prevent swap-in, which is more expensive than
715 * (page allocation + zeroing).
716 */
717 if (!pte_present(ptent)) {
718 swp_entry_t entry;
719
720 entry = pte_to_swp_entry(ptent);
721 if (!non_swap_entry(entry)) {
722 max_nr = (end - addr) / PAGE_SIZE;
723 nr = swap_pte_batch(pte, max_nr, ptent);
724 nr_swap -= nr;
725 free_swap_and_cache_nr(entry, nr);
726 clear_not_present_full_ptes(mm, addr, pte, nr, tlb->fullmm);
727 } else if (is_hwpoison_entry(entry) ||
728 is_poisoned_swp_entry(entry)) {
729 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
730 }
731 continue;
732 }
733
734 folio = vm_normal_folio(vma, addr, ptent);
735 if (!folio || folio_is_zone_device(folio))
736 continue;
737
738 /*
739 * If we encounter a large folio, only split it if it is not
740 * fully mapped within the range we are operating on. Otherwise
741 * leave it as is so that it can be marked as lazyfree. If we
742 * fail to split a folio, leave it in place and advance to the
743 * next pte in the range.
744 */
745 if (folio_test_large(folio)) {
746 bool any_young, any_dirty;
747
748 nr = madvise_folio_pte_batch(addr, end, folio, pte,
749 ptent, &any_young, &any_dirty);
750
751 if (nr < folio_nr_pages(folio)) {
752 int err;
753
754 if (folio_likely_mapped_shared(folio))
755 continue;
756 if (!folio_trylock(folio))
757 continue;
758 folio_get(folio);
759 arch_leave_lazy_mmu_mode();
760 pte_unmap_unlock(start_pte, ptl);
761 start_pte = NULL;
762 err = split_folio(folio);
763 folio_unlock(folio);
764 folio_put(folio);
765 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
766 start_pte = pte;
767 if (!start_pte)
768 break;
769 flush_tlb_batched_pending(mm);
770 arch_enter_lazy_mmu_mode();
771 if (!err)
772 nr = 0;
773 continue;
774 }
775
776 if (any_young)
777 ptent = pte_mkyoung(ptent);
778 if (any_dirty)
779 ptent = pte_mkdirty(ptent);
780 }
781
782 if (folio_test_swapcache(folio) || folio_test_dirty(folio)) {
783 if (!folio_trylock(folio))
784 continue;
785 /*
786 * If we have a large folio at this point, we know it is
787 * fully mapped so if its mapcount is the same as its
788 * number of pages, it must be exclusive.
789 */
790 if (folio_mapcount(folio) != folio_nr_pages(folio)) {
791 folio_unlock(folio);
792 continue;
793 }
794
795 if (folio_test_swapcache(folio) &&
796 !folio_free_swap(folio)) {
797 folio_unlock(folio);
798 continue;
799 }
800
801 folio_clear_dirty(folio);
802 folio_unlock(folio);
803 }
804
805 if (pte_young(ptent) || pte_dirty(ptent)) {
806 clear_young_dirty_ptes(vma, addr, pte, nr, cydp_flags);
807 tlb_remove_tlb_entries(tlb, pte, nr, addr);
808 }
809 folio_mark_lazyfree(folio);
810 }
811
812 if (nr_swap)
813 add_mm_counter(mm, MM_SWAPENTS, nr_swap);
814 if (start_pte) {
815 arch_leave_lazy_mmu_mode();
816 pte_unmap_unlock(start_pte, ptl);
817 }
818 cond_resched();
819
820 return 0;
821 }
822
823 static const struct mm_walk_ops madvise_free_walk_ops = {
824 .pmd_entry = madvise_free_pte_range,
825 .walk_lock = PGWALK_RDLOCK,
826 };
827
828 static int madvise_free_single_vma(struct vm_area_struct *vma,
829 unsigned long start_addr, unsigned long end_addr)
830 {
831 struct mm_struct *mm = vma->vm_mm;
832 struct mmu_notifier_range range;
833 struct mmu_gather tlb;
834
835 /* MADV_FREE works only for anonymous VMAs at the moment */
836 if (!vma_is_anonymous(vma))
837 return -EINVAL;
838
839 range.start = max(vma->vm_start, start_addr);
840 if (range.start >= vma->vm_end)
841 return -EINVAL;
842 range.end = min(vma->vm_end, end_addr);
843 if (range.end <= vma->vm_start)
844 return -EINVAL;
845 mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
846 range.start, range.end);
847
848 lru_add_drain();
849 tlb_gather_mmu(&tlb, mm);
850 update_hiwater_rss(mm);
851
852 mmu_notifier_invalidate_range_start(&range);
853 tlb_start_vma(&tlb, vma);
854 walk_page_range(vma->vm_mm, range.start, range.end,
855 &madvise_free_walk_ops, &tlb);
856 tlb_end_vma(&tlb, vma);
857 mmu_notifier_invalidate_range_end(&range);
858 tlb_finish_mmu(&tlb);
859
860 return 0;
861 }
862
863 /*
864 * Application no longer needs these pages. If the pages are dirty,
865 * it's OK to just throw them away. The app will be more careful about
866 * data it wants to keep. Be sure to free swap resources too. The
867 * zap_page_range_single call sets things up for shrink_active_list to actually
868 * free these pages later if no one else has touched them in the meantime,
869 * although we could add these pages to a global reuse list for
870 * shrink_active_list to pick up before reclaiming other pages.
871 *
872 * NB: This interface discards data rather than pushes it out to swap,
873 * as some implementations do. This has performance implications for
874 * applications like large transactional databases which want to discard
875 * pages in anonymous maps after committing to backing store the data
876 * that was kept in them. There is no reason to write this data out to
877 * the swap area if the application is discarding it.
878 *
879 * An interface that causes the system to free clean pages and flush
880 * dirty pages is already available as msync(MS_INVALIDATE).
881 */
882 static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
883 unsigned long start, unsigned long end)
884 {
885 madvise_vma_pad_pages(vma, start, end);
886
887 zap_page_range_single(vma, start, end - start, NULL);
888 return 0;
889 }
890
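/*
 * Check whether MADV_DONTNEED, MADV_DONTNEED_LOCKED or MADV_FREE may be
 * applied to this VMA. For hugetlb VMAs, *end is rounded down to a huge
 * page boundary to avoid discarding more than the caller asked for.
 */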
891 static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma,
892 unsigned long start,
893 unsigned long *end,
894 int behavior)
895 {
896 if (!is_vm_hugetlb_page(vma)) {
897 unsigned int forbidden = VM_PFNMAP;
898
899 if (behavior != MADV_DONTNEED_LOCKED)
900 forbidden |= VM_LOCKED;
901
902 return !(vma->vm_flags & forbidden);
903 }
904
905 if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED)
906 return false;
907 if (start & ~huge_page_mask(hstate_vma(vma)))
908 return false;
909
910 /*
911 * Madvise callers expect the length to be rounded up to PAGE_SIZE
912 * boundaries, and may be unaware that this VMA uses huge pages.
913 * Avoid unexpected data loss by rounding down the number of
914 * huge pages freed.
915 */
916 *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma)));
917
918 return true;
919 }
920
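/*
 * Handle MADV_DONTNEED, MADV_DONTNEED_LOCKED and MADV_FREE. If userfaultfd
 * must be notified, the mmap_lock is dropped and re-taken, in which case
 * *prev is cleared to tell the caller it has gone stale.
 */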
921 static long madvise_dontneed_free(struct vm_area_struct *vma,
922 struct vm_area_struct **prev,
923 unsigned long start, unsigned long end,
924 struct madvise_behavior *madv_behavior)
925 {
926 int behavior = madv_behavior->behavior;
927 struct mm_struct *mm = vma->vm_mm;
928
929 *prev = vma;
930 if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
931 return -EINVAL;
932
933 if (start == end)
934 return 0;
935
936 if (!userfaultfd_remove(vma, start, end)) {
937 *prev = NULL; /* mmap_lock has been dropped, prev is stale */
938
939 mmap_read_lock(mm);
940 vma = vma_lookup(mm, start);
941 if (!vma)
942 return -ENOMEM;
943 /*
944 * Potential end adjustment for hugetlb vma is OK as
945 * the check below keeps end within vma.
946 */
947 if (!madvise_dontneed_free_valid_vma(vma, start, &end,
948 behavior))
949 return -EINVAL;
950 if (end > vma->vm_end) {
951 /*
952 * Don't fail if end > vma->vm_end. If the old
953 * vma was split while the mmap_lock was
954 * released the effect of the concurrent
955 * operation may not cause madvise() to
956 * have an undefined result. There may be an
957 * adjacent next vma that we'll walk
958 * next. userfaultfd_remove() will generate an
959 * UFFD_EVENT_REMOVE repetition on the
960 * end-vma->vm_end range, but the manager can
961 * handle a repetition fine.
962 */
963 end = vma->vm_end;
964 }
965 /*
966 * If the memory region between start and end was
967 * originally backed by 4kB pages and then remapped to
968 * be backed by hugepages while mmap_lock was dropped,
969 * the adjustment for hugetlb vma above may have rounded
970 * end down to the start address.
971 */
972 if (start == end)
973 return 0;
974 VM_WARN_ON(start > end);
975 }
976
977 if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED)
978 return madvise_dontneed_single_vma(vma, start, end);
979 else if (behavior == MADV_FREE)
980 return madvise_free_single_vma(vma, start, end);
981 else
982 return -EINVAL;
983 }
984
985 static long madvise_populate(struct mm_struct *mm, unsigned long start,
986 unsigned long end, int behavior)
987 {
988 const bool write = behavior == MADV_POPULATE_WRITE;
989 int locked = 1;
990 long pages;
991
992 while (start < end) {
993 /* Populate (prefault) page tables readable/writable. */
994 pages = faultin_page_range(mm, start, end, write, &locked);
995 if (!locked) {
996 mmap_read_lock(mm);
997 locked = 1;
998 }
999 if (pages < 0) {
1000 switch (pages) {
1001 case -EINTR:
1002 return -EINTR;
1003 case -EINVAL: /* Incompatible mappings / permissions. */
1004 return -EINVAL;
1005 case -EHWPOISON:
1006 return -EHWPOISON;
1007 case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */
1008 return -EFAULT;
1009 default:
1010 pr_warn_once("%s: unhandled return value: %ld\n",
1011 __func__, pages);
1012 fallthrough;
1013 case -ENOMEM: /* No VMA or out of memory. */
1014 return -ENOMEM;
1015 }
1016 }
1017 start += pages * PAGE_SIZE;
1018 }
1019 return 0;
1020 }
1021
1022 /*
1023 * Application wants to free up the pages and associated backing store.
1024 * This is effectively punching a hole into the middle of a file.
1025 */
1026 static long madvise_remove(struct vm_area_struct *vma,
1027 struct vm_area_struct **prev,
1028 unsigned long start, unsigned long end)
1029 {
1030 loff_t offset;
1031 int error;
1032 struct file *f;
1033 struct mm_struct *mm = vma->vm_mm;
1034
1035 *prev = NULL; /* tell sys_madvise we drop mmap_lock */
1036
1037 if (vma->vm_flags & VM_LOCKED)
1038 return -EINVAL;
1039
1040 f = vma->vm_file;
1041
1042 if (!f || !f->f_mapping || !f->f_mapping->host) {
1043 return -EINVAL;
1044 }
1045
1046 if (!vma_is_shared_maywrite(vma))
1047 return -EACCES;
1048
1049 offset = (loff_t)(start - vma->vm_start)
1050 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
1051
1052 /*
1053 * Filesystem's fallocate may need to take i_rwsem. We need to
1054 * explicitly grab a reference because the vma (and hence the
1055 * vma's reference to the file) can go away as soon as we drop
1056 * mmap_lock.
1057 */
1058 get_file(f);
1059 if (userfaultfd_remove(vma, start, end)) {
1060 /* mmap_lock was not released by userfaultfd_remove() */
1061 mmap_read_unlock(mm);
1062 }
1063 error = vfs_fallocate(f,
1064 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
1065 offset, end - start);
1066 fput(f);
1067 mmap_read_lock(mm);
1068 return error;
1069 }
1070
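/*
 * Guard markers may not be installed in VM_SPECIAL or hugetlb VMAs.
 * mlock()ed VMAs are tolerated only when @allow_locked is set, i.e. for
 * the non-destructive removal path.
 */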
1071 static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked)
1072 {
1073 vm_flags_t disallowed = VM_SPECIAL | VM_HUGETLB;
1074
1075 /*
1076 * A user could lock after setting a guard range but that's fine, as
1077 * they'd not be able to fault in. The issue arises when we try to zap
1078 * existing locked VMAs. We don't want to do that.
1079 */
1080 if (!allow_locked)
1081 disallowed |= VM_LOCKED;
1082
1083 return !(vma->vm_flags & disallowed);
1084 }
1085
1086 static bool is_guard_pte_marker(pte_t ptent)
1087 {
1088 return is_pte_marker(ptent) &&
1089 is_guard_swp_entry(pte_to_swp_entry(ptent));
1090 }
1091
1092 static int guard_install_pud_entry(pud_t *pud, unsigned long addr,
1093 unsigned long next, struct mm_walk *walk)
1094 {
1095 pud_t pudval = pudp_get(pud);
1096
1097 /* If huge return >0 so we abort the operation + zap. */
1098 return pud_trans_huge(pudval) || pud_devmap(pudval);
1099 }
1100
1101 static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr,
1102 unsigned long next, struct mm_walk *walk)
1103 {
1104 pmd_t pmdval = pmdp_get(pmd);
1105
1106 /* If huge return >0 so we abort the operation + zap. */
1107 return pmd_trans_huge(pmdval) || pmd_devmap(pmdval);
1108 }
1109
1110 static int guard_install_pte_entry(pte_t *pte, unsigned long addr,
1111 unsigned long next, struct mm_walk *walk)
1112 {
1113 pte_t pteval = ptep_get(pte);
1114 unsigned long *nr_pages = (unsigned long *)walk->private;
1115
1116 /* If there is already a guard page marker, we have nothing to do. */
1117 if (is_guard_pte_marker(pteval)) {
1118 (*nr_pages)++;
1119
1120 return 0;
1121 }
1122
1123 /* If populated return >0 so we abort the operation + zap. */
1124 return 1;
1125 }
1126
1127 static int guard_install_set_pte(unsigned long addr, unsigned long next,
1128 pte_t *ptep, struct mm_walk *walk)
1129 {
1130 unsigned long *nr_pages = (unsigned long *)walk->private;
1131
1132 /* Simply install a PTE marker, this causes segfault on access. */
1133 *ptep = make_pte_marker(PTE_MARKER_GUARD);
1134 (*nr_pages)++;
1135
1136 return 0;
1137 }
1138
1139 static const struct mm_walk_ops guard_install_walk_ops = {
1140 .pud_entry = guard_install_pud_entry,
1141 .pmd_entry = guard_install_pmd_entry,
1142 .pte_entry = guard_install_pte_entry,
1143 .install_pte = guard_install_set_pte,
1144 .walk_lock = PGWALK_RDLOCK,
1145 };
1146
1147 static long madvise_guard_install(struct vm_area_struct *vma,
1148 struct vm_area_struct **prev,
1149 unsigned long start, unsigned long end)
1150 {
1151 long err;
1152 int i;
1153
1154 *prev = vma;
1155 if (!is_valid_guard_vma(vma, /* allow_locked = */false))
1156 return -EINVAL;
1157
1158 /*
1159 * If we install guard markers, then the range is no longer
1160 * empty from a page table perspective and therefore it's
1161 * appropriate to have an anon_vma.
1162 *
1163 * This ensures that on fork, we copy page tables correctly.
1164 */
1165 err = anon_vma_prepare(vma);
1166 if (err)
1167 return err;
1168
1169 /*
1170 * Optimistically try to install the guard marker pages first. If any
1171 * non-guard pages are encountered, give up and zap the range before
1172 * trying again.
1173 *
1174 * We try a few times before giving up and releasing back to userland to
1175 * loop around, releasing locks in the process to avoid contention. This
1176 * would only happen if there were a great many racing page faults.
1177 *
1178 * In most cases we should simply install the guard markers immediately
1179 * with no zap or looping.
1180 */
1181 for (i = 0; i < MAX_MADVISE_GUARD_RETRIES; i++) {
1182 unsigned long nr_pages = 0;
1183
1184 /* Returns < 0 on error, == 0 if success, > 0 if zap needed. */
1185 err = walk_page_range_mm(vma->vm_mm, start, end,
1186 &guard_install_walk_ops, &nr_pages);
1187 if (err < 0)
1188 return err;
1189
1190 if (err == 0) {
1191 unsigned long nr_expected_pages = PHYS_PFN(end - start);
1192
1193 VM_WARN_ON(nr_pages != nr_expected_pages);
1194 return 0;
1195 }
1196
1197 /*
1198 * OK, some of the range has non-guard pages mapped, so zap
1199 * them. This leaves existing guard pages in place.
1200 */
1201 zap_page_range_single(vma, start, end - start, NULL);
1202 }
1203
1204 /*
1205 * We were unable to install the guard pages due to being raced by page
1206 * faults. This should not happen ordinarily. We return to userspace and
1207 * immediately retry, relieving lock contention.
1208 */
1209 return restart_syscall();
1210 }
1211
1212 static int guard_remove_pud_entry(pud_t *pud, unsigned long addr,
1213 unsigned long next, struct mm_walk *walk)
1214 {
1215 pud_t pudval = pudp_get(pud);
1216
1217 /* If huge, cannot have guard pages present, so no-op - skip. */
1218 if (pud_trans_huge(pudval) || pud_devmap(pudval))
1219 walk->action = ACTION_CONTINUE;
1220
1221 return 0;
1222 }
1223
1224 static int guard_remove_pmd_entry(pmd_t *pmd, unsigned long addr,
1225 unsigned long next, struct mm_walk *walk)
1226 {
1227 pmd_t pmdval = pmdp_get(pmd);
1228
1229 /* If huge, cannot have guard pages present, so no-op - skip. */
1230 if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval))
1231 walk->action = ACTION_CONTINUE;
1232
1233 return 0;
1234 }
1235
1236 static int guard_remove_pte_entry(pte_t *pte, unsigned long addr,
1237 unsigned long next, struct mm_walk *walk)
1238 {
1239 pte_t ptent = ptep_get(pte);
1240
1241 if (is_guard_pte_marker(ptent)) {
1242 /* Simply clear the PTE marker. */
1243 pte_clear_not_present_full(walk->mm, addr, pte, false);
1244 update_mmu_cache(walk->vma, addr, pte);
1245 }
1246
1247 return 0;
1248 }
1249
1250 static const struct mm_walk_ops guard_remove_walk_ops = {
1251 .pud_entry = guard_remove_pud_entry,
1252 .pmd_entry = guard_remove_pmd_entry,
1253 .pte_entry = guard_remove_pte_entry,
1254 .walk_lock = PGWALK_RDLOCK,
1255 };
1256
1257 static long madvise_guard_remove(struct vm_area_struct *vma,
1258 struct vm_area_struct **prev,
1259 unsigned long start, unsigned long end)
1260 {
1261 *prev = vma;
1262 /*
1263 * We're ok with removing guards in mlock()'d ranges, as this is a
1264 * non-destructive action.
1265 */
1266 if (!is_valid_guard_vma(vma, /* allow_locked = */true))
1267 return -EINVAL;
1268
1269 return walk_page_range(vma->vm_mm, start, end,
1270 &guard_remove_walk_ops, NULL);
1271 }
1272
1273 /*
1274 * Apply a madvise behavior to a region of a vma. madvise_update_vma
1275 * will handle splitting a vm area into separate areas, each area with its own
1276 * behavior.
1277 */
1278 static int madvise_vma_behavior(struct vm_area_struct *vma,
1279 struct vm_area_struct **prev,
1280 unsigned long start, unsigned long end,
1281 void *behavior_arg)
1282 {
1283 struct madvise_behavior *arg = behavior_arg;
1284 int behavior = arg->behavior;
1285 int error;
1286 struct anon_vma_name *anon_name;
1287 unsigned long new_flags = vma->vm_flags;
1288
1289 if (unlikely(!can_modify_vma_madv(vma, behavior)))
1290 return -EPERM;
1291
1292 switch (behavior) {
1293 case MADV_REMOVE:
1294 return madvise_remove(vma, prev, start, end);
1295 case MADV_WILLNEED:
1296 return madvise_willneed(vma, prev, start, end);
1297 case MADV_COLD:
1298 return madvise_cold(vma, prev, start, end);
1299 case MADV_PAGEOUT:
1300 return madvise_pageout(vma, prev, start, end);
1301 case MADV_FREE:
1302 case MADV_DONTNEED:
1303 case MADV_DONTNEED_LOCKED:
1304 return madvise_dontneed_free(vma, prev, start, end, arg);
1305 case MADV_NORMAL:
1306 new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
1307 break;
1308 case MADV_SEQUENTIAL:
1309 new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
1310 break;
1311 case MADV_RANDOM:
1312 new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
1313 break;
1314 case MADV_DONTFORK:
1315 new_flags |= VM_DONTCOPY;
1316 break;
1317 case MADV_DOFORK:
1318 if (vma->vm_flags & VM_IO)
1319 return -EINVAL;
1320 new_flags &= ~VM_DONTCOPY;
1321 break;
1322 case MADV_WIPEONFORK:
1323 /* MADV_WIPEONFORK is only supported on anonymous memory. */
1324 if (vma->vm_file || vma->vm_flags & VM_SHARED)
1325 return -EINVAL;
1326 new_flags |= VM_WIPEONFORK;
1327 break;
1328 case MADV_KEEPONFORK:
1329 if (vma->vm_flags & VM_DROPPABLE)
1330 return -EINVAL;
1331 new_flags &= ~VM_WIPEONFORK;
1332 break;
1333 case MADV_DONTDUMP:
1334 new_flags |= VM_DONTDUMP;
1335 break;
1336 case MADV_DODUMP:
1337 if ((!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) ||
1338 (vma->vm_flags & VM_DROPPABLE))
1339 return -EINVAL;
1340 new_flags &= ~VM_DONTDUMP;
1341 break;
1342 case MADV_MERGEABLE:
1343 case MADV_UNMERGEABLE:
1344 error = ksm_madvise(vma, start, end, behavior, &new_flags);
1345 if (error)
1346 goto out;
1347 break;
1348 case MADV_HUGEPAGE:
1349 case MADV_NOHUGEPAGE:
1350 error = hugepage_madvise(vma, &new_flags, behavior);
1351 if (error)
1352 goto out;
1353 break;
1354 case MADV_COLLAPSE:
1355 return madvise_collapse(vma, prev, start, end);
1356 case MADV_GUARD_INSTALL:
1357 return madvise_guard_install(vma, prev, start, end);
1358 case MADV_GUARD_REMOVE:
1359 return madvise_guard_remove(vma, prev, start, end);
1360 }
1361
1362 anon_name = anon_vma_name(vma);
1363 anon_vma_name_get(anon_name);
1364 error = madvise_update_vma(vma, prev, start, end, new_flags,
1365 anon_name);
1366 anon_vma_name_put(anon_name);
1367
1368 out:
1369 /*
1370 * madvise() returns EAGAIN if kernel resources, such as
1371 * slab, are temporarily unavailable.
1372 */
1373 if (error == -ENOMEM)
1374 error = -EAGAIN;
1375 return error;
1376 }
1377
1378 #ifdef CONFIG_MEMORY_FAILURE
1379 /*
1380 * Error injection support for memory error handling.
1381 */
1382 static int madvise_inject_error(int behavior,
1383 unsigned long start, unsigned long end)
1384 {
1385 unsigned long size;
1386
1387 if (!capable(CAP_SYS_ADMIN))
1388 return -EPERM;
1389
1390
1391 for (; start < end; start += size) {
1392 unsigned long pfn;
1393 struct page *page;
1394 int ret;
1395
1396 ret = get_user_pages_fast(start, 1, 0, &page);
1397 if (ret != 1)
1398 return ret;
1399 pfn = page_to_pfn(page);
1400
1401 /*
1402 * When soft offlining hugepages, after migrating the page
1403 * we dissolve it, therefore in the second loop "page" will
1404 * no longer be a compound page.
1405 */
1406 size = page_size(compound_head(page));
1407
1408 if (behavior == MADV_SOFT_OFFLINE) {
1409 pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
1410 pfn, start);
1411 ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
1412 } else {
1413 pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
1414 pfn, start);
1415 ret = memory_failure(pfn, MF_ACTION_REQUIRED | MF_COUNT_INCREASED | MF_SW_SIMULATED);
1416 if (ret == -EOPNOTSUPP)
1417 ret = 0;
1418 }
1419
1420 if (ret)
1421 return ret;
1422 }
1423
1424 return 0;
1425 }
1426 #endif
1427
1428 static bool
1429 madvise_behavior_valid(int behavior)
1430 {
1431 switch (behavior) {
1432 case MADV_DOFORK:
1433 case MADV_DONTFORK:
1434 case MADV_NORMAL:
1435 case MADV_SEQUENTIAL:
1436 case MADV_RANDOM:
1437 case MADV_REMOVE:
1438 case MADV_WILLNEED:
1439 case MADV_DONTNEED:
1440 case MADV_DONTNEED_LOCKED:
1441 case MADV_FREE:
1442 case MADV_COLD:
1443 case MADV_PAGEOUT:
1444 case MADV_POPULATE_READ:
1445 case MADV_POPULATE_WRITE:
1446 #ifdef CONFIG_KSM
1447 case MADV_MERGEABLE:
1448 case MADV_UNMERGEABLE:
1449 #endif
1450 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1451 case MADV_HUGEPAGE:
1452 case MADV_NOHUGEPAGE:
1453 case MADV_COLLAPSE:
1454 #endif
1455 case MADV_DONTDUMP:
1456 case MADV_DODUMP:
1457 case MADV_WIPEONFORK:
1458 case MADV_KEEPONFORK:
1459 case MADV_GUARD_INSTALL:
1460 case MADV_GUARD_REMOVE:
1461 #ifdef CONFIG_MEMORY_FAILURE
1462 case MADV_SOFT_OFFLINE:
1463 case MADV_HWPOISON:
1464 #endif
1465 return true;
1466
1467 default:
1468 return false;
1469 }
1470 }
1471
1472 static bool process_madvise_behavior_valid(int behavior)
1473 {
1474 switch (behavior) {
1475 case MADV_COLD:
1476 case MADV_PAGEOUT:
1477 case MADV_WILLNEED:
1478 case MADV_COLLAPSE:
1479 return true;
1480 default:
1481 return false;
1482 }
1483 }
1484
1485 /*
1486 * Try to acquire a VMA read lock if possible.
1487 *
1488 * We only support this lock over a single VMA, which the input range must
1489 * span either partially or fully.
1490 *
1491 * This function always returns with an appropriate lock held. If a VMA read
1492 * lock could be acquired, we return the locked VMA.
1493 *
1494 * If a VMA read lock could not be acquired, we return NULL and expect caller to
1495 * fallback to mmap lock behaviour.
1496 */
1497 static struct vm_area_struct *try_vma_read_lock(struct mm_struct *mm,
1498 struct madvise_behavior *madv_behavior,
1499 unsigned long start, unsigned long end)
1500 {
1501 struct vm_area_struct *vma;
1502
1503 vma = lock_vma_under_rcu(mm, start);
1504 if (!vma)
1505 goto take_mmap_read_lock;
1506 /*
1507 * Must span only a single VMA; uffd and remote processes are
1508 * unsupported.
1509 */
1510 if (end > vma->vm_end || current->mm != mm ||
1511 userfaultfd_armed(vma)) {
1512 vma_end_read(vma);
1513 goto take_mmap_read_lock;
1514 }
1515 return vma;
1516
1517 take_mmap_read_lock:
1518 mmap_read_lock(mm);
1519 madv_behavior->lock_mode = MADVISE_MMAP_READ_LOCK;
1520 return NULL;
1521 }
1522
1523 /*
1524 * Walk the vmas in range [start,end), and call the visit function on each one.
1525 * The visit function will get start and end parameters that cover the overlap
1526 * between the current vma and the original range. Any unmapped regions in the
1527 * original range will result in this function returning -ENOMEM while still
1528 * calling the visit function on all of the existing vmas in the range.
1529 * Must be called with the mmap_lock held for reading or writing.
1530 */
1531 static
1532 int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
1533 unsigned long end, struct madvise_behavior *madv_behavior,
1534 void *arg,
1535 int (*visit)(struct vm_area_struct *vma,
1536 struct vm_area_struct **prev, unsigned long start,
1537 unsigned long end, void *arg))
1538 {
1539 struct vm_area_struct *vma;
1540 struct vm_area_struct *prev;
1541 unsigned long tmp;
1542 int unmapped_error = 0;
1543 int error;
1544
1545 /*
1546 * If VMA read lock is supported, apply madvise to a single VMA
1547 * tentatively, avoiding walking VMAs.
1548 */
1549 if (madv_behavior && madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK) {
1550 vma = try_vma_read_lock(mm, madv_behavior, start, end);
1551 if (vma) {
1552 prev = vma;
1553 error = visit(vma, &prev, start, end, arg);
1554 vma_end_read(vma);
1555 return error;
1556 }
1557 }
1558
1559 /*
1560 * If the interval [start,end) covers some unmapped address
1561 * ranges, just ignore them, but return -ENOMEM at the end.
1562 * This differs from the way such ranges are handled in mlock etc.
1563 */
1564 vma = find_vma_prev(mm, start, &prev);
1565 if (vma && start > vma->vm_start)
1566 prev = vma;
1567
1568 for (;;) {
1569 /* Still start < end. */
1570 if (!vma)
1571 return -ENOMEM;
1572
1573 /* Here start < (end|vma->vm_end). */
1574 if (start < vma->vm_start) {
1575 unmapped_error = -ENOMEM;
1576 start = vma->vm_start;
1577 if (start >= end)
1578 break;
1579 }
1580
1581 /* Here vma->vm_start <= start < (end|vma->vm_end) */
1582 tmp = vma->vm_end;
1583 if (end < tmp)
1584 tmp = end;
1585
1586 /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
1587 error = visit(vma, &prev, start, tmp, arg);
1588 if (error)
1589 return error;
1590 start = tmp;
1591 if (prev && start < prev->vm_end)
1592 start = prev->vm_end;
1593 if (start >= end)
1594 break;
1595 if (prev)
1596 vma = find_vma(mm, prev->vm_end);
1597 else /* madvise_remove dropped mmap_lock */
1598 vma = find_vma(mm, start);
1599 }
1600
1601 return unmapped_error;
1602 }
1603
1604 #ifdef CONFIG_ANON_VMA_NAME
1605 static int madvise_vma_anon_name(struct vm_area_struct *vma,
1606 struct vm_area_struct **prev,
1607 unsigned long start, unsigned long end,
1608 void *anon_name)
1609 {
1610 int error;
1611
1612 /* Only anonymous mappings can be named */
1613 if (vma->vm_file && !vma_is_anon_shmem(vma))
1614 return -EBADF;
1615
1616 error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
1617 anon_name);
1618
1619 /*
1620 * madvise() returns EAGAIN if kernel resources, such as
1621 * slab, are temporarily unavailable.
1622 */
1623 if (error == -ENOMEM)
1624 error = -EAGAIN;
1625 return error;
1626 }
1627
1628 int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
1629 unsigned long len_in, struct anon_vma_name *anon_name)
1630 {
1631 unsigned long end;
1632 unsigned long len;
1633
1634 if (start & ~__PAGE_MASK)
1635 return -EINVAL;
1636 len = (len_in + ~__PAGE_MASK) & __PAGE_MASK;
1637
1638 /* Check to see whether len was rounded up from small -ve to zero */
1639 if (len_in && !len)
1640 return -EINVAL;
1641
1642 end = start + len;
1643 if (end < start)
1644 return -EINVAL;
1645
1646 if (end == start)
1647 return 0;
1648
1649 return madvise_walk_vmas(mm, start, end, NULL, anon_name,
1650 madvise_vma_anon_name);
1651 }
1652 #endif /* CONFIG_ANON_VMA_NAME */
1653
1654 #ifdef CONFIG_MEMORY_FAILURE
1655 static bool is_memory_failure(int behavior)
1656 {
1657 switch (behavior) {
1658 case MADV_HWPOISON:
1659 case MADV_SOFT_OFFLINE:
1660 return true;
1661 default:
1662 return false;
1663 }
1664 }
1665 #else
1666 static bool is_memory_failure(int behavior)
1667 {
1668 return false;
1669 }
1670 #endif
1671
1672 /*
1673 * Any behaviour which results in changes to the vma->vm_flags needs to
1674 * take mmap_lock for writing. Others, which simply traverse vmas, need
1675 * to only take it for reading.
1676 */
1677 static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavior)
1678 {
1679 int behavior = madv_behavior->behavior;
1680
1681 if (is_memory_failure(behavior))
1682 return MADVISE_NO_LOCK;
1683
1684 switch (behavior) {
1685 case MADV_REMOVE:
1686 case MADV_WILLNEED:
1687 case MADV_COLD:
1688 case MADV_PAGEOUT:
1689 case MADV_FREE:
1690 case MADV_POPULATE_READ:
1691 case MADV_POPULATE_WRITE:
1692 case MADV_COLLAPSE:
1693 case MADV_GUARD_INSTALL:
1694 case MADV_GUARD_REMOVE:
1695 return MADVISE_MMAP_READ_LOCK;
1696 case MADV_DONTNEED:
1697 case MADV_DONTNEED_LOCKED:
1698 return MADVISE_VMA_READ_LOCK;
1699 default:
1700 return MADVISE_MMAP_WRITE_LOCK;
1701 }
1702 }
1703
1704 static int madvise_lock(struct mm_struct *mm,
1705 struct madvise_behavior *madv_behavior)
1706 {
1707 enum madvise_lock_mode lock_mode = get_lock_mode(madv_behavior);
1708
1709 switch (lock_mode) {
1710 case MADVISE_NO_LOCK:
1711 break;
1712 case MADVISE_MMAP_WRITE_LOCK:
1713 if (mmap_write_lock_killable(mm))
1714 return -EINTR;
1715 break;
1716 case MADVISE_MMAP_READ_LOCK:
1717 mmap_read_lock(mm);
1718 break;
1719 case MADVISE_VMA_READ_LOCK:
1720 /* We will acquire the lock per-VMA in madvise_walk_vmas(). */
1721 break;
1722 }
1723
1724 madv_behavior->lock_mode = lock_mode;
1725 return 0;
1726 }
1727
1728 static void madvise_unlock(struct mm_struct *mm,
1729 struct madvise_behavior *madv_behavior)
1730 {
1731 switch (madv_behavior->lock_mode) {
1732 case MADVISE_NO_LOCK:
1733 return;
1734 case MADVISE_MMAP_WRITE_LOCK:
1735 mmap_write_unlock(mm);
1736 break;
1737 case MADVISE_MMAP_READ_LOCK:
1738 mmap_read_unlock(mm);
1739 break;
1740 case MADVISE_VMA_READ_LOCK:
1741 /* We will drop the lock per-VMA in madvise_walk_vmas(). */
1742 break;
1743 }
1744
1745 madv_behavior->lock_mode = MADVISE_NO_LOCK;
1746 }
1747
1748 /*
1749 * untagged_addr_remote() assumes mmap_lock is already held. On
1750 * architectures like x86 and RISC-V, tagging is tricky because each
1751 * mm may have a different tagging mask. However, we might only hold
1752 * the per-VMA lock (currently only local processes are supported),
1753 * so untagged_addr is used to avoid the mmap_lock assertion for
1754 * local processes.
1755 */
1756 static inline unsigned long get_untagged_addr(struct mm_struct *mm,
1757 unsigned long start)
1758 {
1759 return current->mm == mm ? untagged_addr(start) :
1760 untagged_addr_remote(mm, start);
1761 }
1762
1763 /*
1764 * The madvise(2) system call.
1765 *
1766 * Applications can use madvise() to advise the kernel how it should
1767 * handle paging I/O in this VM area. The idea is to help the kernel
1768 * use appropriate read-ahead and caching techniques. The information
1769 * provided is advisory only, and can be safely disregarded by the
1770 * kernel without affecting the correct operation of the application.
1771 *
1772 * behavior values:
1773 * MADV_NORMAL - the default behavior is to read clusters. This
1774 * results in some read-ahead and read-behind.
1775 * MADV_RANDOM - the system should read the minimum amount of data
1776 * on any access, since it is unlikely that the appli-
1777 * cation will need more than what it asks for.
1778 * MADV_SEQUENTIAL - pages in the given range will probably be accessed
1779 * once, so they can be aggressively read ahead, and
1780 * can be freed soon after they are accessed.
1781 * MADV_WILLNEED - the application is notifying the system to read
1782 * some pages ahead.
1783 * MADV_DONTNEED - the application is finished with the given range,
1784 * so the kernel can free resources associated with it.
1785 * MADV_FREE - the application marks pages in the given range as lazy free,
1786 * where actual purges are postponed until memory pressure happens.
1787 * MADV_REMOVE - the application wants to free up the given range of
1788 * pages and associated backing store.
1789 * MADV_DONTFORK - omit this area from child's address space when forking:
1790 * typically, to avoid COWing pages pinned by get_user_pages().
1791 * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
1792 * MADV_WIPEONFORK - present the child process with zero-filled memory in this
1793 * range after a fork.
1794 * MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
1795 * MADV_HWPOISON - trigger memory error handler as if the given memory range
1796 * were corrupted by unrecoverable hardware memory failure.
1797 * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
1798 * MADV_MERGEABLE - the application recommends that KSM try to merge pages in
1799 * this area with pages of identical content from other such areas.
1800 * MADV_UNMERGEABLE - cancel MADV_MERGEABLE: no longer merge pages with others.
1801 * MADV_HUGEPAGE - the application wants to back the given range by transparent
1802 * huge pages in the future. Existing pages might be coalesced and
1803 * new pages might be allocated as THP.
1804 * MADV_NOHUGEPAGE - mark the given range as not worth being backed by
1805 * transparent huge pages so the existing pages will not be
1806 * coalesced into THP and new pages will not be allocated as THP.
1807 * MADV_COLLAPSE - synchronously coalesce pages into new THP.
1808 * MADV_DONTDUMP - the application wants to prevent pages in the given range
1809 * from being included in its core dump.
1810 * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
1811 * MADV_COLD - the application is not expected to use this memory soon,
1812 * deactivate pages in this range so that they can be reclaimed
1813 * easily if memory pressure happens.
1814 * MADV_PAGEOUT - the application is not expected to use this memory soon,
1815 * page out the pages in this range immediately.
1816 * MADV_POPULATE_READ - populate (prefault) page tables readable by
1817 * triggering read faults if required
1818 * MADV_POPULATE_WRITE - populate (prefault) page tables writable by
1819 * triggering write faults if required
1820 *
1821 * return values:
1822 * zero - success
1823 * -EINVAL - start + len < 0, start is not page-aligned,
1824 * "behavior" is not a valid value, or application
1825 * is attempting to release locked or shared pages,
1826 * or the specified address range includes file, Huge TLB,
1827 * MAP_SHARED or VM_PFNMAP ranges.
1828 * -ENOMEM - addresses in the specified range are not currently
1829 * mapped, or are outside the AS of the process.
1830 * -EIO - an I/O error occurred while paging in data.
1831 * -EBADF - map exists, but area maps something that isn't a file.
1832 * -EAGAIN - a kernel resource was temporarily unavailable.
1833 * -EPERM - memory is sealed.
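 *
 * Illustrative userspace usage (a minimal sketch, not part of this file;
 * assumes the mmap() call below succeeds):
 *
 *	size_t len = 16 * 4096;
 *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	madvise(buf, len, MADV_SEQUENTIAL);	// hint: sequential access
 *	...
 *	madvise(buf, len, MADV_DONTNEED);	// contents may now be discarded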
1834 */
1835 int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
1836 {
1837 unsigned long end;
1838 int error;
1839 size_t len;
1840 struct blk_plug plug;
1841 struct madvise_behavior madv_behavior = {.behavior = behavior};
1842 bool bypass = false;
1843
1844 if (!madvise_behavior_valid(behavior))
1845 return -EINVAL;
1846
1847 if (!__PAGE_ALIGNED(start))
1848 return -EINVAL;
1849 len = __PAGE_ALIGN(len_in);
1850
1851 /* Check to see whether len was rounded up from small -ve to zero */
1852 if (len_in && !len)
1853 return -EINVAL;
1854
1855 end = start + len;
1856 if (end < start)
1857 return -EINVAL;
1858
1859 if (end == start)
1860 return 0;
1861
1862 trace_android_vh_mm_do_madvise_bypass(mm, start, len, behavior,
1863 &error, &bypass);
1864 if (bypass)
1865 return error;
1866
1867 error = madvise_lock(mm, &madv_behavior);
1868 if (error)
1869 return error;
1870
1871 #ifdef CONFIG_MEMORY_FAILURE
1872 if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) {
1873 int ret = madvise_inject_error(behavior, start, start + len_in);
1874
1875 madvise_unlock(mm, &madv_behavior);
1876
1877 return ret;
1878 }
1879 #endif
1880
1881 start = get_untagged_addr(mm, start);
1882 end = start + len;
1883
1884 blk_start_plug(&plug);
1885 switch (behavior) {
1886 case MADV_POPULATE_READ:
1887 case MADV_POPULATE_WRITE:
1888 error = madvise_populate(mm, start, end, behavior);
1889 break;
1890 default:
1891 error = madvise_walk_vmas(mm, start, end, &madv_behavior,
1892 &madv_behavior, madvise_vma_behavior);
1893 break;
1894 }
1895 blk_finish_plug(&plug);
1896
1897 madvise_unlock(mm, &madv_behavior);
1898
1899 return error;
1900 }
1901
1902 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
1903 {
1904 return do_madvise(current->mm, start, len_in, behavior);
1905 }
1906
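/*
 * The process_madvise(2) system call.
 *
 * Applies the given advice to the address ranges described by @vec in the
 * address space of the process referred to by @pidfd. Only the
 * non-destructive hints accepted by process_madvise_behavior_valid() are
 * allowed, and acting on another process additionally requires
 * PTRACE_MODE_READ access and CAP_SYS_NICE. On success the number of bytes
 * advised is returned, which may be less than requested; @flags must
 * currently be zero.
 */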
1907 SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
1908 size_t, vlen, int, behavior, unsigned int, flags)
1909 {
1910 ssize_t ret;
1911 struct iovec iovstack[UIO_FASTIOV];
1912 struct iovec *iov = iovstack;
1913 struct iov_iter iter;
1914 struct task_struct *task;
1915 struct mm_struct *mm;
1916 size_t total_len;
1917 unsigned int f_flags;
1918 bool bypass = false;
1919 bool return_error = false;
1920
1921 trace_android_rvh_process_madvise_bypass(pidfd, vec,
1922 vlen, behavior, flags, &ret, &bypass);
1923 if (bypass)
1924 return ret;
1925
1926 if (flags != 0) {
1927 ret = -EINVAL;
1928 goto out;
1929 }
1930
1931 ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
1932 if (ret < 0)
1933 goto out;
1934
1935 task = pidfd_get_task(pidfd, &f_flags);
1936 if (IS_ERR(task)) {
1937 ret = PTR_ERR(task);
1938 goto free_iov;
1939 }
1940
1941 if (!process_madvise_behavior_valid(behavior)) {
1942 ret = -EINVAL;
1943 goto release_task;
1944 }
1945
1946 /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
1947 mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
1948 if (IS_ERR_OR_NULL(mm)) {
1949 ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
1950 goto release_task;
1951 }
1952
1953 /*
1954 * Require CAP_SYS_NICE for influencing process performance. Note that
1955 * only non-destructive hints are currently supported.
1956 */
1957 if (mm != current->mm && !capable(CAP_SYS_NICE)) {
1958 ret = -EPERM;
1959 goto release_mm;
1960 }
1961
1962 total_len = iov_iter_count(&iter);
1963 trace_android_vh_process_madvise_begin(task, behavior);
1964
1965 while (iov_iter_count(&iter)) {
1966 trace_android_vh_process_madvise_iter(task, behavior, &ret);
1967 if (ret < 0)
1968 break;
1969 ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter),
1970 iter_iov_len(&iter), behavior);
1971 /*
1972 * A madvise operation is attempting to restart the syscall,
1973 * but we cannot proceed as it would not be correct to repeat
1974 * the operation in aggregate, and would be surprising to the
1975 * user.
1976 *
1977 * As we have already dropped locks, it is safe to just loop and
1978 * try again. We check for fatal signals in case we need to exit
1979 * early anyway.
1980 */
1981 if (ret == -ERESTARTNOINTR) {
1982 if (fatal_signal_pending(current)) {
1983 ret = -EINTR;
1984 break;
1985 }
1986 continue;
1987 }
1988 if (ret < 0)
1989 break;
1990 iov_iter_advance(&iter, iter_iov_len(&iter));
1991 }
1992 trace_android_vh_process_madvise_return_error(behavior, ret, &return_error);
1993 if (return_error)
1994 goto release_mm;
1995
1996 ret = (total_len - iov_iter_count(&iter)) ? : ret;
1997
1998 release_mm:
1999 mmput(mm);
2000 release_task:
2001 put_task_struct(task);
2002 free_iov:
2003 kfree(iov);
2004 out:
2005 trace_android_vh_process_madvise(behavior, &ret, NULL);
2006 return ret;
2007 }
2008