// SPDX-License-Identifier: GPL-2.0
/*
 *	linux/mm/madvise.c
 *
 * Copyright (C) 1999  Linus Torvalds
 * Copyright (C) 2002  Christoph Hellwig
 */

#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
#include <linux/pgsize_migration.h>
#include <linux/page_idle.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mm_inline.h>
#include <linux/string.h>
#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/mmu_notifier.h>
#include <trace/hooks/mm.h>

#include <asm/tlb.h>

#include "internal.h"

struct madvise_walk_private {
	struct mmu_gather *tlb;
	bool pageout;
	bool can_pageout_file;
};

/*
 * Any behaviour which results in changes to the vma->vm_flags needs to
 * take mmap_lock for writing. Others, which simply traverse vmas, need
 * to only take it for reading.
 */
static int madvise_need_mmap_write(int behavior)
{
	switch (behavior) {
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_FREE:
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
		return 0;
	default:
		/* be safe, default to 1. list exceptions explicitly */
		return 1;
	}
}

#ifdef CONFIG_ANON_VMA_NAME
struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
	struct anon_vma_name *anon_name;
	size_t count;

	/* Add 1 for NUL terminator at the end of the anon_name->name */
	count = strlen(name) + 1;
	anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
	if (anon_name) {
		kref_init(&anon_name->kref);
		memcpy(anon_name->name, name, count);
	}

	return anon_name;
}

void anon_vma_name_free(struct kref *kref)
{
	struct anon_vma_name *anon_name =
			container_of(kref, struct anon_vma_name, kref);
	kfree(anon_name);
}

struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
	mmap_assert_locked(vma->vm_mm);

	if (vma->vm_file)
		return NULL;

	return vma->anon_name;
}

/* mmap_lock should be write-locked */
static int replace_anon_vma_name(struct vm_area_struct *vma,
				 struct anon_vma_name *anon_name)
{
	struct anon_vma_name *orig_name = anon_vma_name(vma);

	if (!anon_name) {
		vma->anon_name = NULL;
		anon_vma_name_put(orig_name);
		return 0;
	}

	if (anon_vma_name_eq(orig_name, anon_name))
		return 0;

	vma->anon_name = anon_vma_name_reuse(anon_name);
	anon_vma_name_put(orig_name);

	return 0;
}
#else /* CONFIG_ANON_VMA_NAME */
static int replace_anon_vma_name(struct vm_area_struct *vma,
				 struct anon_vma_name *anon_name)
{
	if (anon_name)
		return -EINVAL;

	return 0;
}
#endif /* CONFIG_ANON_VMA_NAME */
/*
 * Update the vm_flags on a region of a vma, splitting it or merging it as
 * necessary. Must be called with mmap_lock held for writing.
 * The caller should ensure anon_name stability by raising its refcount even
 * when anon_name belongs to a valid vma, because this function might free
 * that vma.
 */
static int madvise_update_vma(struct vm_area_struct *vma,
			      struct vm_area_struct **prev, unsigned long start,
			      unsigned long end, unsigned long new_flags,
			      struct anon_vma_name *anon_name)
{
	struct mm_struct *mm = vma->vm_mm;
	int error;
	pgoff_t pgoff;

	if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
		*prev = vma;
		return 0;
	}

	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
	*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
			  vma->vm_file, pgoff, vma_policy(vma),
			  vma->vm_userfaultfd_ctx, anon_name);
	if (*prev) {
		vma = *prev;
		goto success;
	}

	*prev = vma;

	if (start != vma->vm_start) {
		if (unlikely(mm->map_count >= sysctl_max_map_count))
			return -ENOMEM;
		error = __split_vma(mm, vma, start, 1);
		if (error)
			return error;
	}

	if (end != vma->vm_end) {
		if (unlikely(mm->map_count >= sysctl_max_map_count))
			return -ENOMEM;
		error = __split_vma(mm, vma, end, 0);
		if (error)
			return error;
	}

success:
	/*
	 * vm_flags is protected by the mmap_lock held in write mode.
	 */
	vma->vm_flags = new_flags;
	if (!vma->vm_file) {
		error = replace_anon_vma_name(vma, anon_name);
		if (error)
			return error;
	}

	return 0;
}

#ifdef CONFIG_SWAP
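/*
 * Walk the ptes under one pmd and start asynchronous swap-in (readahead)
 * for every swap entry found, so that a later fault on the range is more
 * likely to hit the swap cache. Used by MADV_WILLNEED on anonymous vmas.
 */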
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
				 unsigned long end, struct mm_walk *walk)
{
	pte_t *orig_pte;
	struct vm_area_struct *vma = walk->private;
	unsigned long index;

	if (pmd_none_or_trans_huge_or_clear_bad(pmd))
		return 0;

	for (index = start; index != end; index += PAGE_SIZE) {
		pte_t pte;
		swp_entry_t entry;
		struct page *page;
		spinlock_t *ptl;

		orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
		pte_unmap_unlock(orig_pte, ptl);

		if (pte_present(pte) || pte_none(pte))
			continue;
		entry = pte_to_swp_entry(pte);
		if (unlikely(non_swap_entry(entry)))
			continue;

		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
					     vma, index, false);
		if (page)
			put_page(page);
	}

	return 0;
}

static const struct mm_walk_ops swapin_walk_ops = {
	.pmd_entry = swapin_walk_pmd_entry,
};

static void force_shm_swapin_readahead(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct address_space *mapping)
{
	XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start));
	pgoff_t end_index = linear_page_index(vma, end + PAGE_SIZE - 1);
	struct page *page;

	rcu_read_lock();
	xas_for_each(&xas, page, end_index) {
		swp_entry_t swap;

		if (!xa_is_value(page))
			continue;
		xas_pause(&xas);
		rcu_read_unlock();

		swap = radix_to_swp_entry(page);
		page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
					     NULL, 0, false);
		if (page)
			put_page(page);

		rcu_read_lock();
	}
	rcu_read_unlock();

	lru_add_drain();	/* Push any new pages onto the LRU now */
}
#endif	/* CONFIG_SWAP */

/*
 * Schedule all required I/O operations.  Do not wait for completion.
 */
static long madvise_willneed(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end)
{
	struct mm_struct *mm = vma->vm_mm;
	struct file *file = vma->vm_file;
	loff_t offset;

	*prev = vma;
#ifdef CONFIG_SWAP
	if (!file) {
		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
		lru_add_drain(); /* Push any new pages onto the LRU now */
		return 0;
	}

	if (shmem_mapping(file->f_mapping)) {
		force_shm_swapin_readahead(vma, start, end,
					   file->f_mapping);
		return 0;
	}
#else
	if (!file)
		return -EBADF;
#endif

	if (IS_DAX(file_inode(file))) {
		/* no bad return value, but ignore advice */
		return 0;
	}

	/*
	 * Filesystem's fadvise may need to take various locks.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */
	get_file(file);
	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
	mmap_read_unlock(mm);
	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
	fput(file);
	mmap_read_lock(mm);
	return 0;
}

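/*
 * pmd_entry callback shared by MADV_COLD and MADV_PAGEOUT: age the mapped
 * pages (clear the young/referenced bits) and then either deactivate them
 * (cold) or isolate and reclaim them right away (pageout), honouring the
 * can_pageout_file restriction from madvise_walk_private.
 */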
static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
				unsigned long addr, unsigned long end,
				struct mm_walk *walk)
{
	struct madvise_walk_private *private = walk->private;
	struct mmu_gather *tlb = private->tlb;
	bool pageout = private->pageout;
	bool pageout_anon_only = pageout && !private->can_pageout_file;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	pte_t *orig_pte, *pte, ptent;
	spinlock_t *ptl;
	struct page *page = NULL;
	LIST_HEAD(page_list);

	if (fatal_signal_pending(current))
		return -EINTR;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	if (pmd_trans_huge(*pmd)) {
		pmd_t orig_pmd;
		unsigned long next = pmd_addr_end(addr, end);

		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
		ptl = pmd_trans_huge_lock(pmd, vma);
		if (!ptl)
			return 0;

		orig_pmd = *pmd;
		if (is_huge_zero_pmd(orig_pmd))
			goto huge_unlock;

		if (unlikely(!pmd_present(orig_pmd))) {
			VM_BUG_ON(thp_migration_supported() &&
					!is_pmd_migration_entry(orig_pmd));
			goto huge_unlock;
		}

		page = pmd_page(orig_pmd);

		/* Do not interfere with other mappings of this page */
		if (page_mapcount(page) != 1)
			goto huge_unlock;

		if (pageout_anon_only && !PageAnon(page))
			goto huge_unlock;

		if (next - addr != HPAGE_PMD_SIZE) {
			int err;

			get_page(page);
			spin_unlock(ptl);
			lock_page(page);
			err = split_huge_page(page);
			unlock_page(page);
			put_page(page);
			if (!err)
				goto regular_page;
			return 0;
		}

		if (pmd_young(orig_pmd)) {
			pmdp_invalidate(vma, addr, pmd);
			orig_pmd = pmd_mkold(orig_pmd);

			set_pmd_at(mm, addr, pmd, orig_pmd);
			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
		}

		ClearPageReferenced(page);
		test_and_clear_page_young(page);
		if (pageout) {
			if (!isolate_lru_page(page)) {
				if (PageUnevictable(page))
					putback_lru_page(page);
				else
					list_add(&page->lru, &page_list);
			}
		} else
			deactivate_page(page);
huge_unlock:
		spin_unlock(ptl);
		if (pageout)
			reclaim_pages(&page_list);
		return 0;
	}

regular_page:
	if (pmd_trans_unstable(pmd))
		return 0;
#endif
	tlb_change_page_size(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr < end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (pte_none(ptent))
			continue;

		if (!pte_present(ptent))
			continue;

		page = vm_normal_page(vma, addr, ptent);
		if (!page)
			continue;

		/*
		 * Creating a THP page is expensive, so split it only if we
		 * are sure it's worth it. Split it only if we are the sole
		 * owner of the page.
		 */
		if (PageTransCompound(page)) {
			if (page_mapcount(page) != 1)
				break;
			if (pageout_anon_only && !PageAnon(page))
				break;
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				break;
			}
			pte_unmap_unlock(orig_pte, ptl);
			if (split_huge_page(page)) {
				unlock_page(page);
				put_page(page);
				pte_offset_map_lock(mm, pmd, addr, &ptl);
				break;
			}
			unlock_page(page);
			put_page(page);
			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		/*
		 * Do not interfere with other mappings of this page, and
		 * skip non-LRU pages.
		 */
		if (!PageLRU(page) || page_mapcount(page) != 1)
			continue;

		if (pageout_anon_only && !PageAnon(page))
			continue;

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

		if (pte_young(ptent)) {
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			ptent = pte_mkold(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}

		/*
		 * We are deactivating a page to accelerate its reclaim.
		 * The VM can't reclaim the page unless we clear PG_young.
		 * As a side effect, this confuses idle-page tracking,
		 * which will miss the recent reference history.
		 */
		ClearPageReferenced(page);
		test_and_clear_page_young(page);
		if (pageout) {
			if (!isolate_lru_page(page)) {
				if (PageUnevictable(page))
					putback_lru_page(page);
				else
					list_add(&page->lru, &page_list);
			}
		} else
			deactivate_page(page);
	}

	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
	if (pageout)
		reclaim_pages(&page_list);
	cond_resched();

	return 0;
}

static const struct mm_walk_ops cold_walk_ops = {
	.pmd_entry = madvise_cold_or_pageout_pte_range,
};

static void madvise_cold_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = false,
		.tlb = tlb,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

static long madvise_cold(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb);

	return 0;
}

static void madvise_pageout_page_range(struct mmu_gather *tlb,
			     struct vm_area_struct *vma,
			     unsigned long addr, unsigned long end,
			     bool can_pageout_file)
{
	struct madvise_walk_private walk_private = {
		.pageout = true,
		.tlb = tlb,
		.can_pageout_file = can_pageout_file,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
	tlb_end_vma(tlb, vma);
}

static inline bool can_do_file_pageout(struct vm_area_struct *vma)
{
	if (!vma->vm_file)
		return false;
	/*
	 * paging out pagecache only for non-anonymous mappings that correspond
	 * to the files the calling process could (if tried) open for writing;
	 * otherwise we'd be including shared non-exclusive mappings, which
	 * opens a side channel.
	 */
	return inode_owner_or_capable(&init_user_ns,
				      file_inode(vma->vm_file)) ||
	       file_permission(vma->vm_file, MAY_WRITE) == 0;
}

static long madvise_pageout(struct vm_area_struct *vma,
			struct vm_area_struct **prev,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather tlb;
	bool can_pageout_file;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	/*
	 * If the VMA belongs to a private file mapping, there can be private
	 * dirty pages which can be paged out even if this process is neither
	 * the owner of nor write-capable on the file. Cache the file access
	 * check here and use it later during the page walk.
	 */
	can_pageout_file = can_do_file_pageout(vma);

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr, can_pageout_file);
	tlb_finish_mmu(&tlb);

	return 0;
}

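/*
 * pmd_entry callback for MADV_FREE: drop swap entries outright, clear the
 * dirty and young bits on present ptes, and mark the backing pages lazily
 * freeable so reclaim can discard them instead of swapping them out.
 */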
static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
				unsigned long end, struct mm_walk *walk)

{
	struct mmu_gather *tlb = walk->private;
	struct mm_struct *mm = tlb->mm;
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *orig_pte, *pte, ptent;
	struct page *page;
	int nr_swap = 0;
	unsigned long next;

	next = pmd_addr_end(addr, end);
	if (pmd_trans_huge(*pmd))
		if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next))
			goto next;

	if (pmd_trans_unstable(pmd))
		return 0;

	tlb_change_page_size(tlb, PAGE_SIZE);
	orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	flush_tlb_batched_pending(mm);
	arch_enter_lazy_mmu_mode();
	for (; addr != end; pte++, addr += PAGE_SIZE) {
		ptent = *pte;

		if (pte_none(ptent))
			continue;
		/*
		 * If the pte holds a swap entry, just clear the page table
		 * entry: swapping the page back in would be more expensive
		 * than a fresh page allocation plus zeroing.
		 */
		if (!pte_present(ptent)) {
			swp_entry_t entry;

			entry = pte_to_swp_entry(ptent);
			if (non_swap_entry(entry))
				continue;
			nr_swap--;
			free_swap_and_cache(entry);
			pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			continue;
		}

		page = vm_normal_page(vma, addr, ptent);
		if (!page)
			continue;

		/*
		 * If pmd isn't transhuge but the page is THP and
		 * is owned by only this process, split it and
		 * deactivate all pages.
		 */
		if (PageTransCompound(page)) {
			if (page_mapcount(page) != 1)
				goto out;
			get_page(page);
			if (!trylock_page(page)) {
				put_page(page);
				goto out;
			}
			pte_unmap_unlock(orig_pte, ptl);
			if (split_huge_page(page)) {
				unlock_page(page);
				put_page(page);
				pte_offset_map_lock(mm, pmd, addr, &ptl);
				goto out;
			}
			unlock_page(page);
			put_page(page);
			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
			pte--;
			addr -= PAGE_SIZE;
			continue;
		}

		VM_BUG_ON_PAGE(PageTransCompound(page), page);

		if (PageSwapCache(page) || PageDirty(page)) {
			if (!trylock_page(page))
				continue;
			/*
			 * If the page is shared with others, we can't clear
			 * its PG_dirty bit.
			 */
			if (page_mapcount(page) != 1) {
				unlock_page(page);
				continue;
			}

			if (PageSwapCache(page) && !try_to_free_swap(page)) {
				unlock_page(page);
				continue;
			}

			ClearPageDirty(page);
			unlock_page(page);
		}

		if (pte_young(ptent) || pte_dirty(ptent)) {
			/*
			 * Some architectures (e.g. PPC) don't update the TLB
			 * with set_pte_at() and tlb_remove_tlb_entry(), so for
			 * portability, remap the pte as old and clean after
			 * clearing it.
			 */
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);

			ptent = pte_mkold(ptent);
			ptent = pte_mkclean(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}
		mark_page_lazyfree(page);
	}
out:
	if (nr_swap) {
		if (current->mm == mm)
			sync_mm_rss(mm);

		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
	}
	arch_leave_lazy_mmu_mode();
	pte_unmap_unlock(orig_pte, ptl);
	cond_resched();
next:
	return 0;
}

static const struct mm_walk_ops madvise_free_walk_ops = {
	.pmd_entry = madvise_free_pte_range,
};

static int madvise_free_single_vma(struct vm_area_struct *vma,
			unsigned long start_addr, unsigned long end_addr)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_notifier_range range;
	struct mmu_gather tlb;

	/* MADV_FREE works only for anonymous vmas at the moment */
	if (!vma_is_anonymous(vma))
		return -EINVAL;

	range.start = max(vma->vm_start, start_addr);
	if (range.start >= vma->vm_end)
		return -EINVAL;
	range.end = min(vma->vm_end, end_addr);
	if (range.end <= vma->vm_start)
		return -EINVAL;
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
				range.start, range.end);

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	update_hiwater_rss(mm);

	mmu_notifier_invalidate_range_start(&range);
	tlb_start_vma(&tlb, vma);
	walk_page_range(vma->vm_mm, range.start, range.end,
			&madvise_free_walk_ops, &tlb);
	tlb_end_vma(&tlb, vma);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb);

	return 0;
}

/*
 * Application no longer needs these pages.  If the pages are dirty,
 * it's OK to just throw them away.  The app will be more careful about
 * data it wants to keep.  Be sure to free swap resources too.  The
 * zap_page_range call sets things up for shrink_active_list to actually free
 * these pages later if no one else has touched them in the meantime,
 * although we could add these pages to a global reuse list for
 * shrink_active_list to pick up before reclaiming other pages.
 *
 * NB: This interface discards data rather than pushes it out to swap,
 * as some implementations do. This has performance implications for
 * applications like large transactional databases which want to discard
 * pages in anonymous maps after committing to backing store the data
 * that was kept in them.  There is no reason to write this data out to
 * the swap area if the application is discarding it.
 *
 * An interface that causes the system to free clean pages and flush
 * dirty pages is already available as msync(MS_INVALIDATE).
 */
static long madvise_dontneed_single_vma(struct vm_area_struct *vma,
					unsigned long start, unsigned long end)
{
	madvise_vma_pad_pages(vma, start, end);

	zap_page_range(vma, start, end - start);
	return 0;
}

static long madvise_dontneed_free(struct vm_area_struct *vma,
				  struct vm_area_struct **prev,
				  unsigned long start, unsigned long end,
				  int behavior)
{
	struct mm_struct *mm = vma->vm_mm;

	*prev = vma;
	if (!can_madv_lru_vma(vma))
		return -EINVAL;

	if (!userfaultfd_remove(vma, start, end)) {
		*prev = NULL; /* mmap_lock has been dropped, prev is stale */

		mmap_read_lock(mm);
		vma = find_vma(mm, start);
		if (!vma)
			return -ENOMEM;
		if (start < vma->vm_start) {
			/*
			 * This "vma" under revalidation is the one
			 * with the lowest vma->vm_start where start
			 * is also < vma->vm_end. If start <
			 * vma->vm_start it means a hole materialized
			 * in the user address space within the
			 * virtual range passed to MADV_DONTNEED
			 * or MADV_FREE.
			 */
			return -ENOMEM;
		}
		if (!can_madv_lru_vma(vma))
			return -EINVAL;
		if (end > vma->vm_end) {
			/*
			 * Don't fail if end > vma->vm_end. If the old
			 * vma was split while the mmap_lock was
			 * released, the concurrent operation does not
			 * leave madvise() with an undefined result:
			 * there may be an adjacent next vma that we'll
			 * walk next. userfaultfd_remove() will generate
			 * an UFFD_EVENT_REMOVE repetition on the
			 * end-vma->vm_end range, but the manager can
			 * handle a repetition fine.
			 */
			end = vma->vm_end;
		}
		VM_WARN_ON(start >= end);
	}

	if (behavior == MADV_DONTNEED)
		return madvise_dontneed_single_vma(vma, start, end);
	else if (behavior == MADV_FREE)
		return madvise_free_single_vma(vma, start, end);
	else
		return -EINVAL;
}

static long madvise_populate(struct vm_area_struct *vma,
			     struct vm_area_struct **prev,
			     unsigned long start, unsigned long end,
			     int behavior)
{
	const bool write = behavior == MADV_POPULATE_WRITE;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long tmp_end;
	int locked = 1;
	long pages;

	*prev = vma;

	while (start < end) {
		/*
		 * We might have temporarily dropped the lock. For example,
		 * our VMA might have been split.
		 */
		if (!vma || start >= vma->vm_end) {
			vma = find_vma(mm, start);
			if (!vma || start < vma->vm_start)
				return -ENOMEM;
		}

		tmp_end = min_t(unsigned long, end, vma->vm_end);
		/* Populate (prefault) page tables readable/writable. */
		pages = faultin_vma_page_range(vma, start, tmp_end, write,
					       &locked);
		if (!locked) {
			mmap_read_lock(mm);
			locked = 1;
			*prev = NULL;
			vma = NULL;
		}
		if (pages < 0) {
			switch (pages) {
			case -EINTR:
				return -EINTR;
			case -EINVAL: /* Incompatible mappings / permissions. */
				return -EINVAL;
			case -EHWPOISON:
				return -EHWPOISON;
			case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */
				return -EFAULT;
			default:
				pr_warn_once("%s: unhandled return value: %ld\n",
					     __func__, pages);
				fallthrough;
			case -ENOMEM:
				return -ENOMEM;
			}
		}
		start += pages * PAGE_SIZE;
	}
	return 0;
}

/*
 * Application wants to free up the pages and associated backing store.
 * This is effectively punching a hole into the middle of a file.
 */
static long madvise_remove(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end)
{
	loff_t offset;
	int error;
	struct file *f;
	struct mm_struct *mm = vma->vm_mm;

	*prev = NULL;	/* tell sys_madvise we drop mmap_lock */

	if (vma->vm_flags & VM_LOCKED)
		return -EINVAL;

	f = vma->vm_file;

	if (!f || !f->f_mapping || !f->f_mapping->host) {
		return -EINVAL;
	}

	if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
		return -EACCES;

	offset = (loff_t)(start - vma->vm_start)
			+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);

	/*
	 * Filesystem's fallocate may need to take i_rwsem.  We need to
	 * explicitly grab a reference because the vma (and hence the
	 * vma's reference to the file) can go away as soon as we drop
	 * mmap_lock.
	 */
	get_file(f);
	if (userfaultfd_remove(vma, start, end)) {
		/* mmap_lock was not released by userfaultfd_remove() */
		mmap_read_unlock(mm);
	}
	error = vfs_fallocate(f,
				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				offset, end - start);
	fput(f);
	mmap_read_lock(mm);
	return error;
}

/*
 * Apply an madvise behavior to a region of a vma.  madvise_update_vma
 * will handle splitting a vm area into separate areas, each area with its own
 * behavior.
 */
static int madvise_vma_behavior(struct vm_area_struct *vma,
				struct vm_area_struct **prev,
				unsigned long start, unsigned long end,
				unsigned long behavior)
{
	int error;
	struct anon_vma_name *anon_name;
	unsigned long new_flags = vma->vm_flags;

	switch (behavior) {
	case MADV_REMOVE:
		return madvise_remove(vma, prev, start, end);
	case MADV_WILLNEED:
		return madvise_willneed(vma, prev, start, end);
	case MADV_COLD:
		return madvise_cold(vma, prev, start, end);
	case MADV_PAGEOUT:
		return madvise_pageout(vma, prev, start, end);
	case MADV_FREE:
	case MADV_DONTNEED:
		return madvise_dontneed_free(vma, prev, start, end, behavior);
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
		return madvise_populate(vma, prev, start, end, behavior);
	case MADV_NORMAL:
		new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
		break;
	case MADV_SEQUENTIAL:
		new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
		break;
	case MADV_RANDOM:
		new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
		break;
	case MADV_DONTFORK:
		new_flags |= VM_DONTCOPY;
		break;
	case MADV_DOFORK:
		if (vma->vm_flags & VM_IO)
			return -EINVAL;
		new_flags &= ~VM_DONTCOPY;
		break;
	case MADV_WIPEONFORK:
		/* MADV_WIPEONFORK is only supported on anonymous memory. */
		if (vma->vm_file || vma->vm_flags & VM_SHARED)
			return -EINVAL;
		new_flags |= VM_WIPEONFORK;
		break;
	case MADV_KEEPONFORK:
		new_flags &= ~VM_WIPEONFORK;
		break;
	case MADV_DONTDUMP:
		new_flags |= VM_DONTDUMP;
		break;
	case MADV_DODUMP:
		if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL)
			return -EINVAL;
		new_flags &= ~VM_DONTDUMP;
		break;
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error)
			goto out;
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error)
			goto out;
		break;
	}

	anon_name = anon_vma_name(vma);
	anon_vma_name_get(anon_name);
	error = madvise_update_vma(vma, prev, start, end, new_flags,
				   anon_name);
	anon_vma_name_put(anon_name);

out:
	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
	return error;
}

#ifdef CONFIG_MEMORY_FAILURE
/*
 * Error injection support for memory error handling.
 */
static int madvise_inject_error(int behavior,
		unsigned long start, unsigned long end)
{
	unsigned long size;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	for (; start < end; start += size) {
		unsigned long pfn;
		struct page *page;
		int ret;

		ret = get_user_pages_fast(start, 1, 0, &page);
		if (ret != 1)
			return ret;
		pfn = page_to_pfn(page);

		/*
		 * When soft offlining hugepages, after migrating the page
		 * we dissolve it, therefore in the second loop "page" will
		 * no longer be a compound page.
		 */
		size = page_size(compound_head(page));

		if (behavior == MADV_SOFT_OFFLINE) {
			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
				 pfn, start);
			ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
		} else {
			pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
				 pfn, start);
			ret = memory_failure(pfn, MF_COUNT_INCREASED);
			if (ret == -EOPNOTSUPP)
				ret = 0;
		}

		if (ret)
			return ret;
	}

	return 0;
}
#endif

static bool
madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_DOFORK:
	case MADV_DONTFORK:
	case MADV_NORMAL:
	case MADV_SEQUENTIAL:
	case MADV_RANDOM:
	case MADV_REMOVE:
	case MADV_WILLNEED:
	case MADV_DONTNEED:
	case MADV_FREE:
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_POPULATE_READ:
	case MADV_POPULATE_WRITE:
#ifdef CONFIG_KSM
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
#endif
	case MADV_DONTDUMP:
	case MADV_DODUMP:
	case MADV_WIPEONFORK:
	case MADV_KEEPONFORK:
#ifdef CONFIG_MEMORY_FAILURE
	case MADV_SOFT_OFFLINE:
	case MADV_HWPOISON:
#endif
		return true;

	default:
		return false;
	}
}

static bool
process_madvise_behavior_valid(int behavior)
{
	switch (behavior) {
	case MADV_COLD:
	case MADV_PAGEOUT:
	case MADV_WILLNEED:
		return true;
	default:
		return false;
	}
}

/*
 * Walk the vmas in range [start,end), and call the visit function on each one.
 * The visit function will get start and end parameters that cover the overlap
 * between the current vma and the original range.  Any unmapped regions in the
 * original range will result in this function returning -ENOMEM while still
 * calling the visit function on all of the existing vmas in the range.
 * Must be called with the mmap_lock held for reading or writing.
 */
static
int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
		      unsigned long end, unsigned long arg,
		      int (*visit)(struct vm_area_struct *vma,
				   struct vm_area_struct **prev, unsigned long start,
				   unsigned long end, unsigned long arg))
{
	struct vm_area_struct *vma;
	struct vm_area_struct *prev;
	unsigned long tmp;
	int unmapped_error = 0;

	/*
	 * If the interval [start,end) covers some unmapped address
	 * ranges, just ignore them, but return -ENOMEM at the end.
	 * - different from the way of handling in mlock etc.
	 */
	vma = find_vma_prev(mm, start, &prev);
	if (vma && start > vma->vm_start)
		prev = vma;

	for (;;) {
		int error;

		/* Still start < end. */
		if (!vma)
			return -ENOMEM;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				break;
		}

		/* Here vma->vm_start <= start < (end|vma->vm_end) */
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
		error = visit(vma, &prev, start, tmp, arg);
		if (error)
			return error;
		start = tmp;
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		if (start >= end)
			break;
		if (prev)
			vma = prev->vm_next;
		else	/* madvise_remove dropped mmap_lock */
			vma = find_vma(mm, start);
	}

	return unmapped_error;
}

#ifdef CONFIG_ANON_VMA_NAME
static int madvise_vma_anon_name(struct vm_area_struct *vma,
				 struct vm_area_struct **prev,
				 unsigned long start, unsigned long end,
				 unsigned long anon_name)
{
	int error;

	/* Only anonymous mappings can be named */
	if (vma->vm_file)
		return -EBADF;

	error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
				   (struct anon_vma_name *)anon_name);

	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
	 */
	if (error == -ENOMEM)
		error = -EAGAIN;
	return error;
}

int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
			  unsigned long len_in, struct anon_vma_name *anon_name)
{
	unsigned long end;
	unsigned long len;

	if (start & ~PAGE_MASK)
		return -EINVAL;
	len = (len_in + ~PAGE_MASK) & PAGE_MASK;

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

	return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
				 madvise_vma_anon_name);
}
#endif /* CONFIG_ANON_VMA_NAME */
/*
 * The madvise(2) system call.
 *
 * Applications can use madvise() to advise the kernel how it should
 * handle paging I/O in this VM area.  The idea is to help the kernel
 * use appropriate read-ahead and caching techniques.  The information
 * provided is advisory only, and can be safely disregarded by the
 * kernel without affecting the correct operation of the application.
 *
 * behavior values:
 *  MADV_NORMAL - the default behavior is to read clusters.  This
 *		results in some read-ahead and read-behind.
 *  MADV_RANDOM - the system should read the minimum amount of data
 *		on any access, since it is unlikely that the appli-
 *		cation will need more than what it asks for.
 *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
 *		once, so they can be aggressively read ahead, and
 *		can be freed soon after they are accessed.
 *  MADV_WILLNEED - the application is notifying the system to read
 *		some pages ahead.
 *  MADV_DONTNEED - the application is finished with the given range,
 *		so the kernel can free resources associated with it.
 *  MADV_FREE - the application marks pages in the given range as lazy free,
 *		where actual purges are postponed until memory pressure happens.
 *  MADV_REMOVE - the application wants to free up the given range of
 *		pages and associated backing store.
 *  MADV_DONTFORK - omit this area from child's address space when forking:
 *		typically, to avoid COWing pages pinned by get_user_pages().
 *  MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking.
 *  MADV_WIPEONFORK - present the child process with zero-filled memory in this
 *		range after a fork.
 *  MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK
 *  MADV_HWPOISON - trigger memory error handler as if the given memory range
 *		were corrupted by unrecoverable hardware memory failure.
 *  MADV_SOFT_OFFLINE - try to soft-offline the given range of memory.
 *  MADV_MERGEABLE - the application recommends that KSM try to merge pages in
 *		this area with pages of identical content from other such areas.
 *  MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others.
 *  MADV_HUGEPAGE - the application wants to back the given range by transparent
 *		huge pages in the future. Existing pages might be coalesced and
 *		new pages might be allocated as THP.
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *		transparent huge pages so the existing pages will not be
 *		coalesced into THP and new pages will not be allocated as THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *		from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
 *  MADV_COLD - the application is not expected to use this memory soon,
 *		deactivate pages in this range so that they can be reclaimed
 *		easily if memory pressure happens.
 *  MADV_PAGEOUT - the application is not expected to use this memory soon,
 *		page out the pages in this range immediately.
 *  MADV_POPULATE_READ - populate (prefault) page tables readable by
 *		triggering read faults if required
 *  MADV_POPULATE_WRITE - populate (prefault) page tables writable by
 *		triggering write faults if required
 *
 * return values:
 *  zero    - success
 *  -EINVAL - start + len < 0, start is not page-aligned,
 *		"behavior" is not a valid value, or application
 *		is attempting to release locked or shared pages,
 *		or the specified address range includes file, Huge TLB,
 *		MAP_SHARED or VMPFNMAP range.
 *  -ENOMEM - addresses in the specified range are not currently
 *		mapped, or are outside the AS of the process.
 *  -EIO    - an I/O error occurred while paging in data.
 *  -EBADF  - map exists, but area maps something that isn't a file.
 *  -EAGAIN - a kernel resource was temporarily unavailable.
 */
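/*
 * Illustrative userspace usage (not part of this file): a process that is
 * finished with a large anonymous buffer can hand the memory back with
 * MADV_DONTNEED, e.g.:
 *
 *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	... use buf ...
 *	if (madvise(buf, len, MADV_DONTNEED))
 *		perror("madvise");
 *
 * The pages are zapped immediately; a later access faults in fresh
 * zero-filled pages.
 */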
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
	unsigned long end;
	int error;
	int write;
	size_t len;
	struct blk_plug plug;
	bool do_plug = true;

	start = untagged_addr(start);

	if (!madvise_behavior_valid(behavior))
		return -EINVAL;

	if (!PAGE_ALIGNED(start))
		return -EINVAL;
	len = PAGE_ALIGN(len_in);

	/* Check to see whether len was rounded up from small -ve to zero */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

#ifdef CONFIG_MEMORY_FAILURE
	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
		return madvise_inject_error(behavior, start, start + len_in);
#endif

	write = madvise_need_mmap_write(behavior);
	if (write) {
		if (mmap_write_lock_killable(mm))
			return -EINTR;
	} else {
		mmap_read_lock(mm);
	}

	trace_android_vh_do_madvise_blk_plug(behavior, &do_plug);
	if (do_plug)
		blk_start_plug(&plug);
	error = madvise_walk_vmas(mm, start, end, behavior,
			madvise_vma_behavior);
	if (do_plug)
		blk_finish_plug(&plug);
	if (write)
		mmap_write_unlock(mm);
	else
		mmap_read_unlock(mm);

	return error;
}

SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	return do_madvise(current->mm, start, len_in, behavior);
}

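/*
 * The process_madvise(2) system call: apply one of the non-destructive
 * hints above (currently MADV_COLD, MADV_PAGEOUT or MADV_WILLNEED) to
 * another process's address space, identified by a pidfd, over an iovec
 * of address ranges. The caller needs PTRACE_MODE_READ access to the
 * target and CAP_SYS_NICE.
 *
 * Illustrative userspace usage (not part of this file), assuming the libc
 * in use exposes a process_madvise() wrapper:
 *
 *	struct iovec vec = { .iov_base = addr, .iov_len = len };
 *	if (process_madvise(pidfd, &vec, 1, MADV_COLD, 0) < 0)
 *		perror("process_madvise");
 */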
SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
		size_t, vlen, int, behavior, unsigned int, flags)
{
	ssize_t ret;
	struct iovec iovstack[UIO_FASTIOV], iovec;
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	struct pid *pid;
	struct task_struct *task;
	struct mm_struct *mm;
	size_t total_len;
	unsigned int f_flags;

	if (flags != 0) {
		ret = -EINVAL;
		goto out;
	}

	ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
	if (ret < 0)
		goto out;

	pid = pidfd_get_pid(pidfd, &f_flags);
	if (IS_ERR(pid)) {
		ret = PTR_ERR(pid);
		goto free_iov;
	}

	task = get_pid_task(pid, PIDTYPE_PID);
	if (!task) {
		ret = -ESRCH;
		goto put_pid;
	}

	if (!process_madvise_behavior_valid(behavior)) {
		ret = -EINVAL;
		goto release_task;
	}

	/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
	mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
	if (IS_ERR_OR_NULL(mm)) {
		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
		goto release_task;
	}

	/*
	 * Require CAP_SYS_NICE for influencing process performance. Note that
	 * only non-destructive hints are currently supported.
	 */
	if (!capable(CAP_SYS_NICE)) {
		ret = -EPERM;
		goto release_mm;
	}

	total_len = iov_iter_count(&iter);

	while (iov_iter_count(&iter)) {
		iovec = iov_iter_iovec(&iter);
		ret = do_madvise(mm, (unsigned long)iovec.iov_base,
					iovec.iov_len, behavior);
		if (ret < 0)
			break;
		iov_iter_advance(&iter, iovec.iov_len);
	}

	ret = (total_len - iov_iter_count(&iter)) ? : ret;

release_mm:
	mmput(mm);
release_task:
	put_task_struct(task);
put_pid:
	put_pid(pid);
free_iov:
	kfree(iov);
out:
	return ret;
}