• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * mm/rmap.c - physical to virtual reverse mappings
3  *
4  * Copyright 2001, Rik van Riel <riel@conectiva.com.br>
5  * Released under the General Public License (GPL).
6  *
7  * Simple, low overhead reverse mapping scheme.
8  * Please try to keep this thing as modular as possible.
9  *
10  * Provides methods for unmapping each kind of mapped page:
11  * the anon methods track anonymous pages, and
12  * the file methods track pages belonging to an inode.
13  *
14  * Original design by Rik van Riel <riel@conectiva.com.br> 2001
15  * File methods by Dave McCracken <dmccr@us.ibm.com> 2003, 2004
16  * Anonymous methods by Andrea Arcangeli <andrea@suse.de> 2004
17  * Contributions by Hugh Dickins 2003, 2004
18  */
19 
20 /*
21  * Lock ordering in mm:
22  *
23  * inode->i_mutex	(while writing or truncating, not reading or faulting)
24  *   mm->mmap_lock
25  *     page->flags PG_locked (lock_page)   * (see huegtlbfs below)
26  *       hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
27  *         mapping->i_mmap_rwsem
28  *           hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
29  *           anon_vma->rwsem
30  *             mm->page_table_lock or pte_lock
31  *               pgdat->lru_lock (in mark_page_accessed, isolate_lru_page)
32  *               swap_lock (in swap_duplicate, swap_info_get)
33  *                 mmlist_lock (in mmput, drain_mmlist and others)
34  *                 mapping->private_lock (in __set_page_dirty_buffers)
35  *                   mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
36  *                     i_pages lock (widely used)
37  *                 inode->i_lock (in set_page_dirty's __mark_inode_dirty)
38  *                 bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
39  *                   sb_lock (within inode_lock in fs/fs-writeback.c)
40  *                   i_pages lock (widely used, in set_page_dirty,
41  *                             in arch-dependent flush_dcache_mmap_lock,
42  *                             within bdi.wb->list_lock in __sync_single_inode)
43  *
44  * anon_vma->rwsem,mapping->i_mutex      (memory_failure, collect_procs_anon)
45  *   ->tasklist_lock
46  *     pte map lock
47  *
48  * * hugetlbfs PageHuge() pages take locks in this order:
49  *         mapping->i_mmap_rwsem
50  *           hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
51  *             page->flags PG_locked (lock_page)
52  */
53 
54 #include <linux/mm.h>
55 #include <linux/sched/mm.h>
56 #include <linux/sched/task.h>
57 #include <linux/pagemap.h>
58 #include <linux/swap.h>
59 #include <linux/swapops.h>
60 #include <linux/slab.h>
61 #include <linux/init.h>
62 #include <linux/ksm.h>
63 #include <linux/rmap.h>
64 #include <linux/rcupdate.h>
65 #include <linux/export.h>
66 #include <linux/memcontrol.h>
67 #include <linux/mmu_notifier.h>
68 #include <linux/migrate.h>
69 #include <linux/hugetlb.h>
70 #include <linux/huge_mm.h>
71 #include <linux/backing-dev.h>
72 #include <linux/page_idle.h>
73 #include <linux/memremap.h>
74 #include <linux/userfaultfd_k.h>
75 
76 #include <asm/tlbflush.h>
77 
78 #include <trace/events/tlb.h>
79 
80 #include <trace/hooks/mm.h>
81 
82 #include "internal.h"
83 
84 static struct kmem_cache *anon_vma_cachep;
85 static struct kmem_cache *anon_vma_chain_cachep;
86 
anon_vma_alloc(void)87 static inline struct anon_vma *anon_vma_alloc(void)
88 {
89 	struct anon_vma *anon_vma;
90 
91 	anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
92 	if (anon_vma) {
93 		atomic_set(&anon_vma->refcount, 1);
94 		anon_vma->num_children = 0;
95 		anon_vma->num_active_vmas = 0;
96 		anon_vma->parent = anon_vma;
97 		/*
98 		 * Initialise the anon_vma root to point to itself. If called
99 		 * from fork, the root will be reset to the parents anon_vma.
100 		 */
101 		anon_vma->root = anon_vma;
102 	}
103 
104 	return anon_vma;
105 }
106 
anon_vma_free(struct anon_vma * anon_vma)107 static inline void anon_vma_free(struct anon_vma *anon_vma)
108 {
109 	VM_BUG_ON(atomic_read(&anon_vma->refcount));
110 
111 	/*
112 	 * Synchronize against page_lock_anon_vma_read() such that
113 	 * we can safely hold the lock without the anon_vma getting
114 	 * freed.
115 	 *
116 	 * Relies on the full mb implied by the atomic_dec_and_test() from
117 	 * put_anon_vma() against the acquire barrier implied by
118 	 * down_read_trylock() from page_lock_anon_vma_read(). This orders:
119 	 *
120 	 * page_lock_anon_vma_read()	VS	put_anon_vma()
121 	 *   down_read_trylock()		  atomic_dec_and_test()
122 	 *   LOCK				  MB
123 	 *   atomic_read()			  rwsem_is_locked()
124 	 *
125 	 * LOCK should suffice since the actual taking of the lock must
126 	 * happen _before_ what follows.
127 	 */
128 	might_sleep();
129 	if (rwsem_is_locked(&anon_vma->root->rwsem)) {
130 		anon_vma_lock_write(anon_vma);
131 		anon_vma_unlock_write(anon_vma);
132 	}
133 
134 	kmem_cache_free(anon_vma_cachep, anon_vma);
135 }
136 
anon_vma_chain_alloc(gfp_t gfp)137 static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
138 {
139 	return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
140 }
141 
anon_vma_chain_free(struct anon_vma_chain * anon_vma_chain)142 static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
143 {
144 	kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
145 }
146 
anon_vma_chain_link(struct vm_area_struct * vma,struct anon_vma_chain * avc,struct anon_vma * anon_vma)147 static void anon_vma_chain_link(struct vm_area_struct *vma,
148 				struct anon_vma_chain *avc,
149 				struct anon_vma *anon_vma)
150 {
151 	avc->vma = vma;
152 	avc->anon_vma = anon_vma;
153 	list_add(&avc->same_vma, &vma->anon_vma_chain);
154 	anon_vma_interval_tree_insert(avc, &anon_vma->rb_root);
155 }
156 
157 /**
158  * __anon_vma_prepare - attach an anon_vma to a memory region
159  * @vma: the memory region in question
160  *
161  * This makes sure the memory mapping described by 'vma' has
162  * an 'anon_vma' attached to it, so that we can associate the
163  * anonymous pages mapped into it with that anon_vma.
164  *
165  * The common case will be that we already have one, which
166  * is handled inline by anon_vma_prepare(). But if
167  * not we either need to find an adjacent mapping that we
168  * can re-use the anon_vma from (very common when the only
169  * reason for splitting a vma has been mprotect()), or we
170  * allocate a new one.
171  *
172  * Anon-vma allocations are very subtle, because we may have
173  * optimistically looked up an anon_vma in page_lock_anon_vma_read()
174  * and that may actually touch the spinlock even in the newly
175  * allocated vma (it depends on RCU to make sure that the
176  * anon_vma isn't actually destroyed).
177  *
178  * As a result, we need to do proper anon_vma locking even
179  * for the new allocation. At the same time, we do not want
180  * to do any locking for the common case of already having
181  * an anon_vma.
182  *
183  * This must be called with the mmap_lock held for reading.
184  */
__anon_vma_prepare(struct vm_area_struct * vma)185 int __anon_vma_prepare(struct vm_area_struct *vma)
186 {
187 	struct mm_struct *mm = vma->vm_mm;
188 	struct anon_vma *anon_vma, *allocated;
189 	struct anon_vma_chain *avc;
190 
191 	might_sleep();
192 
193 	avc = anon_vma_chain_alloc(GFP_KERNEL);
194 	if (!avc)
195 		goto out_enomem;
196 
197 	anon_vma = find_mergeable_anon_vma(vma);
198 	allocated = NULL;
199 	if (!anon_vma) {
200 		anon_vma = anon_vma_alloc();
201 		if (unlikely(!anon_vma))
202 			goto out_enomem_free_avc;
203 		anon_vma->num_children++; /* self-parent link for new root */
204 		allocated = anon_vma;
205 	}
206 
207 	anon_vma_lock_write(anon_vma);
208 	/* page_table_lock to protect against threads */
209 	spin_lock(&mm->page_table_lock);
210 	if (likely(!vma->anon_vma)) {
211 		vma->anon_vma = anon_vma;
212 		anon_vma_chain_link(vma, avc, anon_vma);
213 		anon_vma->num_active_vmas++;
214 		allocated = NULL;
215 		avc = NULL;
216 	}
217 	spin_unlock(&mm->page_table_lock);
218 	anon_vma_unlock_write(anon_vma);
219 
220 	if (unlikely(allocated))
221 		put_anon_vma(allocated);
222 	if (unlikely(avc))
223 		anon_vma_chain_free(avc);
224 
225 	return 0;
226 
227  out_enomem_free_avc:
228 	anon_vma_chain_free(avc);
229  out_enomem:
230 	return -ENOMEM;
231 }
232 
233 /*
234  * This is a useful helper function for locking the anon_vma root as
235  * we traverse the vma->anon_vma_chain, looping over anon_vma's that
236  * have the same vma.
237  *
238  * Such anon_vma's should have the same root, so you'd expect to see
239  * just a single mutex_lock for the whole traversal.
240  */
lock_anon_vma_root(struct anon_vma * root,struct anon_vma * anon_vma)241 static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
242 {
243 	struct anon_vma *new_root = anon_vma->root;
244 	if (new_root != root) {
245 		if (WARN_ON_ONCE(root))
246 			up_write(&root->rwsem);
247 		root = new_root;
248 		down_write(&root->rwsem);
249 	}
250 	return root;
251 }
252 
unlock_anon_vma_root(struct anon_vma * root)253 static inline void unlock_anon_vma_root(struct anon_vma *root)
254 {
255 	if (root)
256 		up_write(&root->rwsem);
257 }
258 
259 /*
260  * Attach the anon_vmas from src to dst.
261  * Returns 0 on success, -ENOMEM on failure.
262  *
263  * anon_vma_clone() is called by __vma_split(), __split_vma(), copy_vma() and
264  * anon_vma_fork(). The first three want an exact copy of src, while the last
265  * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent
266  * endless growth of anon_vma. Since dst->anon_vma is set to NULL before call,
267  * we can identify this case by checking (!dst->anon_vma && src->anon_vma).
268  *
269  * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
270  * and reuse existing anon_vma which has no vmas and only one child anon_vma.
271  * This prevents degradation of anon_vma hierarchy to endless linear chain in
272  * case of constantly forking task. On the other hand, an anon_vma with more
273  * than one child isn't reused even if there was no alive vma, thus rmap
274  * walker has a good chance of avoiding scanning the whole hierarchy when it
275  * searches where page is mapped.
276  */
anon_vma_clone(struct vm_area_struct * dst,struct vm_area_struct * src)277 int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
278 {
279 	struct anon_vma_chain *avc, *pavc;
280 	struct anon_vma *root = NULL;
281 
282 	list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
283 		struct anon_vma *anon_vma;
284 
285 		avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
286 		if (unlikely(!avc)) {
287 			unlock_anon_vma_root(root);
288 			root = NULL;
289 			avc = anon_vma_chain_alloc(GFP_KERNEL);
290 			if (!avc)
291 				goto enomem_failure;
292 		}
293 		anon_vma = pavc->anon_vma;
294 		root = lock_anon_vma_root(root, anon_vma);
295 		anon_vma_chain_link(dst, avc, anon_vma);
296 
297 		/*
298 		 * Reuse existing anon_vma if it has no vma and only one
299 		 * anon_vma child.
300 		 *
301 		 * Root anon_vma is never reused:
302 		 * it has self-parent reference and at least one child.
303 		 */
304 		if (!dst->anon_vma && src->anon_vma &&
305 		    anon_vma->num_children < 2 &&
306 		    anon_vma->num_active_vmas == 0)
307 			dst->anon_vma = anon_vma;
308 	}
309 	if (dst->anon_vma)
310 		dst->anon_vma->num_active_vmas++;
311 	unlock_anon_vma_root(root);
312 	return 0;
313 
314  enomem_failure:
315 	/*
316 	 * dst->anon_vma is dropped here otherwise its degree can be incorrectly
317 	 * decremented in unlink_anon_vmas().
318 	 * We can safely do this because callers of anon_vma_clone() don't care
319 	 * about dst->anon_vma if anon_vma_clone() failed.
320 	 */
321 	dst->anon_vma = NULL;
322 	unlink_anon_vmas(dst);
323 	return -ENOMEM;
324 }
325 
326 /*
327  * Attach vma to its own anon_vma, as well as to the anon_vmas that
328  * the corresponding VMA in the parent process is attached to.
329  * Returns 0 on success, non-zero on failure.
330  */
anon_vma_fork(struct vm_area_struct * vma,struct vm_area_struct * pvma)331 int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
332 {
333 	struct anon_vma_chain *avc;
334 	struct anon_vma *anon_vma;
335 	int error;
336 
337 	/* Don't bother if the parent process has no anon_vma here. */
338 	if (!pvma->anon_vma)
339 		return 0;
340 
341 	/* Drop inherited anon_vma, we'll reuse existing or allocate new. */
342 	vma->anon_vma = NULL;
343 
344 	/*
345 	 * First, attach the new VMA to the parent VMA's anon_vmas,
346 	 * so rmap can find non-COWed pages in child processes.
347 	 */
348 	error = anon_vma_clone(vma, pvma);
349 	if (error)
350 		return error;
351 
352 	/* An existing anon_vma has been reused, all done then. */
353 	if (vma->anon_vma)
354 		return 0;
355 
356 	/* Then add our own anon_vma. */
357 	anon_vma = anon_vma_alloc();
358 	if (!anon_vma)
359 		goto out_error;
360 	anon_vma->num_active_vmas++;
361 	avc = anon_vma_chain_alloc(GFP_KERNEL);
362 	if (!avc)
363 		goto out_error_free_anon_vma;
364 
365 	/*
366 	 * The root anon_vma's spinlock is the lock actually used when we
367 	 * lock any of the anon_vmas in this anon_vma tree.
368 	 */
369 	anon_vma->root = pvma->anon_vma->root;
370 	anon_vma->parent = pvma->anon_vma;
371 	/*
372 	 * With refcounts, an anon_vma can stay around longer than the
373 	 * process it belongs to. The root anon_vma needs to be pinned until
374 	 * this anon_vma is freed, because the lock lives in the root.
375 	 */
376 	get_anon_vma(anon_vma->root);
377 	/* Mark this anon_vma as the one where our new (COWed) pages go. */
378 	vma->anon_vma = anon_vma;
379 	anon_vma_lock_write(anon_vma);
380 	anon_vma_chain_link(vma, avc, anon_vma);
381 	anon_vma->parent->num_children++;
382 	anon_vma_unlock_write(anon_vma);
383 
384 	return 0;
385 
386  out_error_free_anon_vma:
387 	put_anon_vma(anon_vma);
388  out_error:
389 	unlink_anon_vmas(vma);
390 	return -ENOMEM;
391 }
392 
unlink_anon_vmas(struct vm_area_struct * vma)393 void unlink_anon_vmas(struct vm_area_struct *vma)
394 {
395 	struct anon_vma_chain *avc, *next;
396 	struct anon_vma *root = NULL;
397 
398 	/*
399 	 * Unlink each anon_vma chained to the VMA.  This list is ordered
400 	 * from newest to oldest, ensuring the root anon_vma gets freed last.
401 	 */
402 	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
403 		struct anon_vma *anon_vma = avc->anon_vma;
404 
405 		root = lock_anon_vma_root(root, anon_vma);
406 		anon_vma_interval_tree_remove(avc, &anon_vma->rb_root);
407 
408 		/*
409 		 * Leave empty anon_vmas on the list - we'll need
410 		 * to free them outside the lock.
411 		 */
412 		if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
413 			anon_vma->parent->num_children--;
414 			continue;
415 		}
416 
417 		list_del(&avc->same_vma);
418 		anon_vma_chain_free(avc);
419 	}
420 	if (vma->anon_vma)
421 		vma->anon_vma->num_active_vmas--;
422 
423 	unlock_anon_vma_root(root);
424 
425 	/*
426 	 * Iterate the list once more, it now only contains empty and unlinked
427 	 * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
428 	 * needing to write-acquire the anon_vma->root->rwsem.
429 	 */
430 	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
431 		struct anon_vma *anon_vma = avc->anon_vma;
432 
433 		VM_WARN_ON(anon_vma->num_children);
434 		VM_WARN_ON(anon_vma->num_active_vmas);
435 		put_anon_vma(anon_vma);
436 
437 		list_del(&avc->same_vma);
438 		anon_vma_chain_free(avc);
439 	}
440 }
441 
anon_vma_ctor(void * data)442 static void anon_vma_ctor(void *data)
443 {
444 	struct anon_vma *anon_vma = data;
445 
446 	init_rwsem(&anon_vma->rwsem);
447 	atomic_set(&anon_vma->refcount, 0);
448 	anon_vma->rb_root = RB_ROOT_CACHED;
449 }
450 
anon_vma_init(void)451 void __init anon_vma_init(void)
452 {
453 	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
454 			0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
455 			anon_vma_ctor);
456 	anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
457 			SLAB_PANIC|SLAB_ACCOUNT);
458 }
459 
460 /*
461  * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
462  *
463  * Since there is no serialization what so ever against page_remove_rmap()
464  * the best this function can do is return a locked anon_vma that might
465  * have been relevant to this page.
466  *
467  * The page might have been remapped to a different anon_vma or the anon_vma
468  * returned may already be freed (and even reused).
469  *
470  * In case it was remapped to a different anon_vma, the new anon_vma will be a
471  * child of the old anon_vma, and the anon_vma lifetime rules will therefore
472  * ensure that any anon_vma obtained from the page will still be valid for as
473  * long as we observe page_mapped() [ hence all those page_mapped() tests ].
474  *
475  * All users of this function must be very careful when walking the anon_vma
476  * chain and verify that the page in question is indeed mapped in it
477  * [ something equivalent to page_mapped_in_vma() ].
478  *
479  * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
480  * page_remove_rmap() that the anon_vma pointer from page->mapping is valid
481  * if there is a mapcount, we can dereference the anon_vma after observing
482  * those.
483  */
page_get_anon_vma(struct page * page)484 struct anon_vma *page_get_anon_vma(struct page *page)
485 {
486 	struct anon_vma *anon_vma = NULL;
487 	unsigned long anon_mapping;
488 
489 	rcu_read_lock();
490 	anon_mapping = (unsigned long)READ_ONCE(page->mapping);
491 	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
492 		goto out;
493 	if (!page_mapped(page))
494 		goto out;
495 
496 	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
497 	if (!atomic_inc_not_zero(&anon_vma->refcount)) {
498 		anon_vma = NULL;
499 		goto out;
500 	}
501 
502 	/*
503 	 * If this page is still mapped, then its anon_vma cannot have been
504 	 * freed.  But if it has been unmapped, we have no security against the
505 	 * anon_vma structure being freed and reused (for another anon_vma:
506 	 * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero()
507 	 * above cannot corrupt).
508 	 */
509 	if (!page_mapped(page)) {
510 		rcu_read_unlock();
511 		put_anon_vma(anon_vma);
512 		return NULL;
513 	}
514 out:
515 	rcu_read_unlock();
516 
517 	return anon_vma;
518 }
519 
520 /*
521  * Similar to page_get_anon_vma() except it locks the anon_vma.
522  *
523  * Its a little more complex as it tries to keep the fast path to a single
524  * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
525  * reference like with page_get_anon_vma() and then block on the mutex
526  * on !rwc->try_lock case.
527  */
page_lock_anon_vma_read(struct page * page,struct rmap_walk_control * rwc)528 struct anon_vma *page_lock_anon_vma_read(struct page *page,
529 					 struct rmap_walk_control *rwc)
530 {
531 	struct anon_vma *anon_vma = NULL;
532 	struct anon_vma *root_anon_vma;
533 	unsigned long anon_mapping;
534 	bool success = false;
535 
536 	rcu_read_lock();
537 	anon_mapping = (unsigned long)READ_ONCE(page->mapping);
538 	if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
539 		goto out;
540 	if (!page_mapped(page))
541 		goto out;
542 
543 	anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
544 	root_anon_vma = READ_ONCE(anon_vma->root);
545 	if (down_read_trylock(&root_anon_vma->rwsem)) {
546 		/*
547 		 * If the page is still mapped, then this anon_vma is still
548 		 * its anon_vma, and holding the mutex ensures that it will
549 		 * not go away, see anon_vma_free().
550 		 */
551 		if (!page_mapped(page)) {
552 			up_read(&root_anon_vma->rwsem);
553 			anon_vma = NULL;
554 		}
555 		goto out;
556 	}
557 	trace_android_vh_do_page_trylock(page, NULL, NULL, &success);
558 	if (success) {
559 		anon_vma = NULL;
560 		goto out;
561 	}
562 
563 	if (rwc && rwc->try_lock) {
564 		anon_vma = NULL;
565 		rwc->contended = true;
566 		goto out;
567 	}
568 
569 	/* trylock failed, we got to sleep */
570 	if (!atomic_inc_not_zero(&anon_vma->refcount)) {
571 		anon_vma = NULL;
572 		goto out;
573 	}
574 
575 	if (!page_mapped(page)) {
576 		rcu_read_unlock();
577 		put_anon_vma(anon_vma);
578 		return NULL;
579 	}
580 
581 	/* we pinned the anon_vma, its safe to sleep */
582 	rcu_read_unlock();
583 	anon_vma_lock_read(anon_vma);
584 
585 	if (atomic_dec_and_test(&anon_vma->refcount)) {
586 		/*
587 		 * Oops, we held the last refcount, release the lock
588 		 * and bail -- can't simply use put_anon_vma() because
589 		 * we'll deadlock on the anon_vma_lock_write() recursion.
590 		 */
591 		anon_vma_unlock_read(anon_vma);
592 		__put_anon_vma(anon_vma);
593 		anon_vma = NULL;
594 	}
595 
596 	return anon_vma;
597 
598 out:
599 	rcu_read_unlock();
600 	return anon_vma;
601 }
602 
page_unlock_anon_vma_read(struct anon_vma * anon_vma)603 void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
604 {
605 	anon_vma_unlock_read(anon_vma);
606 }
607 
608 #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
609 /*
610  * Flush TLB entries for recently unmapped pages from remote CPUs. It is
611  * important if a PTE was dirty when it was unmapped that it's flushed
612  * before any IO is initiated on the page to prevent lost writes. Similarly,
613  * it must be flushed before freeing to prevent data leakage.
614  */
try_to_unmap_flush(void)615 void try_to_unmap_flush(void)
616 {
617 	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
618 
619 	if (!tlb_ubc->flush_required)
620 		return;
621 
622 	arch_tlbbatch_flush(&tlb_ubc->arch);
623 	tlb_ubc->flush_required = false;
624 	tlb_ubc->writable = false;
625 }
626 
627 /* Flush iff there are potentially writable TLB entries that can race with IO */
try_to_unmap_flush_dirty(void)628 void try_to_unmap_flush_dirty(void)
629 {
630 	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
631 
632 	if (tlb_ubc->writable)
633 		try_to_unmap_flush();
634 }
635 
set_tlb_ubc_flush_pending(struct mm_struct * mm,bool writable)636 static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
637 {
638 	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
639 
640 	arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
641 	tlb_ubc->flush_required = true;
642 
643 	/*
644 	 * Ensure compiler does not re-order the setting of tlb_flush_batched
645 	 * before the PTE is cleared.
646 	 */
647 	barrier();
648 	mm->tlb_flush_batched = true;
649 
650 	/*
651 	 * If the PTE was dirty then it's best to assume it's writable. The
652 	 * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
653 	 * before the page is queued for IO.
654 	 */
655 	if (writable)
656 		tlb_ubc->writable = true;
657 }
658 
659 /*
660  * Returns true if the TLB flush should be deferred to the end of a batch of
661  * unmap operations to reduce IPIs.
662  */
should_defer_flush(struct mm_struct * mm,enum ttu_flags flags)663 static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
664 {
665 	bool should_defer = false;
666 
667 	if (!(flags & TTU_BATCH_FLUSH))
668 		return false;
669 
670 	/* If remote CPUs need to be flushed then defer batch the flush */
671 	if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
672 		should_defer = true;
673 	put_cpu();
674 
675 	return should_defer;
676 }
677 
678 /*
679  * Reclaim unmaps pages under the PTL but do not flush the TLB prior to
680  * releasing the PTL if TLB flushes are batched. It's possible for a parallel
681  * operation such as mprotect or munmap to race between reclaim unmapping
682  * the page and flushing the page. If this race occurs, it potentially allows
683  * access to data via a stale TLB entry. Tracking all mm's that have TLB
684  * batching in flight would be expensive during reclaim so instead track
685  * whether TLB batching occurred in the past and if so then do a flush here
686  * if required. This will cost one additional flush per reclaim cycle paid
687  * by the first operation at risk such as mprotect and mumap.
688  *
689  * This must be called under the PTL so that an access to tlb_flush_batched
690  * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
691  * via the PTL.
692  */
flush_tlb_batched_pending(struct mm_struct * mm)693 void flush_tlb_batched_pending(struct mm_struct *mm)
694 {
695 	if (data_race(mm->tlb_flush_batched)) {
696 		flush_tlb_mm(mm);
697 
698 		/*
699 		 * Do not allow the compiler to re-order the clearing of
700 		 * tlb_flush_batched before the tlb is flushed.
701 		 */
702 		barrier();
703 		mm->tlb_flush_batched = false;
704 	}
705 }
706 #else
set_tlb_ubc_flush_pending(struct mm_struct * mm,bool writable)707 static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
708 {
709 }
710 
should_defer_flush(struct mm_struct * mm,enum ttu_flags flags)711 static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
712 {
713 	return false;
714 }
715 #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
716 
717 /*
718  * At what user virtual address is page expected in vma?
719  * Caller should check the page is actually part of the vma.
720  */
page_address_in_vma(struct page * page,struct vm_area_struct * vma)721 unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
722 {
723 	if (PageAnon(page)) {
724 		struct anon_vma *page__anon_vma = page_anon_vma(page);
725 		/*
726 		 * Note: swapoff's unuse_vma() is more efficient with this
727 		 * check, and needs it to match anon_vma when KSM is active.
728 		 */
729 		if (!vma->anon_vma || !page__anon_vma ||
730 		    vma->anon_vma->root != page__anon_vma->root)
731 			return -EFAULT;
732 	} else if (!vma->vm_file) {
733 		return -EFAULT;
734 	} else if (vma->vm_file->f_mapping != compound_head(page)->mapping) {
735 		return -EFAULT;
736 	}
737 
738 	return vma_address(page, vma);
739 }
740 
mm_find_pmd(struct mm_struct * mm,unsigned long address)741 pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
742 {
743 	pgd_t *pgd;
744 	p4d_t *p4d;
745 	pud_t *pud;
746 	pmd_t *pmd = NULL;
747 	pmd_t pmde;
748 
749 	pgd = pgd_offset(mm, address);
750 	if (!pgd_present(*pgd))
751 		goto out;
752 
753 	p4d = p4d_offset(pgd, address);
754 	if (!p4d_present(*p4d))
755 		goto out;
756 
757 	pud = pud_offset(p4d, address);
758 	if (!pud_present(*pud))
759 		goto out;
760 
761 	pmd = pmd_offset(pud, address);
762 	/*
763 	 * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
764 	 * without holding anon_vma lock for write.  So when looking for a
765 	 * genuine pmde (in which to find pte), test present and !THP together.
766 	 */
767 	pmde = *pmd;
768 	barrier();
769 	if (!pmd_present(pmde) || pmd_trans_huge(pmde))
770 		pmd = NULL;
771 out:
772 	return pmd;
773 }
774 
775 struct page_referenced_arg {
776 	int mapcount;
777 	int referenced;
778 	unsigned long vm_flags;
779 	struct mem_cgroup *memcg;
780 };
781 /*
782  * arg: page_referenced_arg will be passed
783  */
page_referenced_one(struct page * page,struct vm_area_struct * vma,unsigned long address,void * arg)784 static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
785 			unsigned long address, void *arg)
786 {
787 	struct page_referenced_arg *pra = arg;
788 	struct page_vma_mapped_walk pvmw = {
789 		.page = page,
790 		.vma = vma,
791 		.address = address,
792 	};
793 	int referenced = 0;
794 
795 	while (page_vma_mapped_walk(&pvmw)) {
796 		address = pvmw.address;
797 
798 		if (vma->vm_flags & VM_LOCKED) {
799 			page_vma_mapped_walk_done(&pvmw);
800 			pra->vm_flags |= VM_LOCKED;
801 			return false; /* To break the loop */
802 		}
803 
804 		if (pvmw.pte) {
805 			trace_android_vh_look_around(&pvmw, page, vma, &referenced);
806 			if (ptep_clear_flush_young_notify(vma, address,
807 						pvmw.pte)) {
808 				/*
809 				 * Don't treat a reference through
810 				 * a sequentially read mapping as such.
811 				 * If the page has been used in another mapping,
812 				 * we will catch it; if this other mapping is
813 				 * already gone, the unmap path will have set
814 				 * PG_referenced or activated the page.
815 				 */
816 				if (likely(!(vma->vm_flags & VM_SEQ_READ)))
817 					referenced++;
818 			}
819 		} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
820 			if (pmdp_clear_flush_young_notify(vma, address,
821 						pvmw.pmd))
822 				referenced++;
823 		} else {
824 			/* unexpected pmd-mapped page? */
825 			WARN_ON_ONCE(1);
826 		}
827 
828 		pra->mapcount--;
829 	}
830 
831 	if (referenced)
832 		clear_page_idle(page);
833 	if (test_and_clear_page_young(page))
834 		referenced++;
835 
836 	if (referenced) {
837 		pra->referenced++;
838 		pra->vm_flags |= vma->vm_flags;
839 	}
840 
841 	trace_android_vh_page_referenced_one_end(vma, page, referenced);
842 	if (!pra->mapcount)
843 		return false; /* To break the loop */
844 
845 	return true;
846 }
847 
invalid_page_referenced_vma(struct vm_area_struct * vma,void * arg)848 static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg)
849 {
850 	struct page_referenced_arg *pra = arg;
851 	struct mem_cgroup *memcg = pra->memcg;
852 
853 	if (!mm_match_cgroup(vma->vm_mm, memcg))
854 		return true;
855 
856 	return false;
857 }
858 
859 /**
860  * page_referenced - test if the page was referenced
861  * @page: the page to test
862  * @is_locked: caller holds lock on the page
863  * @memcg: target memory cgroup
864  * @vm_flags: collect encountered vma->vm_flags who actually referenced the page
865  *
866  * Quick test_and_clear_referenced for all mappings of a page,
867  *
868  * Return: The number of mappings which referenced the page. Return -1 if
869  * the function bailed out due to rmap lock contention.
870  */
page_referenced(struct page * page,int is_locked,struct mem_cgroup * memcg,unsigned long * vm_flags)871 int page_referenced(struct page *page,
872 		    int is_locked,
873 		    struct mem_cgroup *memcg,
874 		    unsigned long *vm_flags)
875 {
876 	int we_locked = 0;
877 	struct page_referenced_arg pra = {
878 		.mapcount = total_mapcount(page),
879 		.memcg = memcg,
880 	};
881 	struct rmap_walk_control rwc = {
882 		.rmap_one = page_referenced_one,
883 		.arg = (void *)&pra,
884 		.anon_lock = page_lock_anon_vma_read,
885 		.try_lock = true,
886 	};
887 
888 	*vm_flags = 0;
889 	if (!pra.mapcount)
890 		return 0;
891 
892 	if (!page_rmapping(page))
893 		return 0;
894 
895 	if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
896 		we_locked = trylock_page(page);
897 		if (!we_locked)
898 			return 1;
899 	}
900 
901 	/*
902 	 * If we are reclaiming on behalf of a cgroup, skip
903 	 * counting on behalf of references from different
904 	 * cgroups
905 	 */
906 	if (memcg) {
907 		rwc.invalid_vma = invalid_page_referenced_vma;
908 	}
909 
910 	rmap_walk(page, &rwc);
911 	*vm_flags = pra.vm_flags;
912 
913 	if (we_locked)
914 		unlock_page(page);
915 
916 	return rwc.contended ? -1 : pra.referenced;
917 }
918 EXPORT_SYMBOL_GPL(page_referenced);
919 
page_mkclean_one(struct page * page,struct vm_area_struct * vma,unsigned long address,void * arg)920 static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
921 			    unsigned long address, void *arg)
922 {
923 	struct page_vma_mapped_walk pvmw = {
924 		.page = page,
925 		.vma = vma,
926 		.address = address,
927 		.flags = PVMW_SYNC,
928 	};
929 	struct mmu_notifier_range range;
930 	int *cleaned = arg;
931 
932 	/*
933 	 * We have to assume the worse case ie pmd for invalidation. Note that
934 	 * the page can not be free from this function.
935 	 */
936 	mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
937 				0, vma, vma->vm_mm, address,
938 				vma_address_end(page, vma));
939 	mmu_notifier_invalidate_range_start(&range);
940 
941 	while (page_vma_mapped_walk(&pvmw)) {
942 		int ret = 0;
943 
944 		address = pvmw.address;
945 		if (pvmw.pte) {
946 			pte_t entry;
947 			pte_t *pte = pvmw.pte;
948 
949 			if (!pte_dirty(*pte) && !pte_write(*pte))
950 				continue;
951 
952 			flush_cache_page(vma, address, pte_pfn(*pte));
953 			entry = ptep_clear_flush(vma, address, pte);
954 			entry = pte_wrprotect(entry);
955 			entry = pte_mkclean(entry);
956 			set_pte_at(vma->vm_mm, address, pte, entry);
957 			ret = 1;
958 		} else {
959 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
960 			pmd_t *pmd = pvmw.pmd;
961 			pmd_t entry;
962 
963 			if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
964 				continue;
965 
966 			flush_cache_page(vma, address, page_to_pfn(page));
967 			entry = pmdp_invalidate(vma, address, pmd);
968 			entry = pmd_wrprotect(entry);
969 			entry = pmd_mkclean(entry);
970 			set_pmd_at(vma->vm_mm, address, pmd, entry);
971 			ret = 1;
972 #else
973 			/* unexpected pmd-mapped page? */
974 			WARN_ON_ONCE(1);
975 #endif
976 		}
977 
978 		/*
979 		 * No need to call mmu_notifier_invalidate_range() as we are
980 		 * downgrading page table protection not changing it to point
981 		 * to a new page.
982 		 *
983 		 * See Documentation/vm/mmu_notifier.rst
984 		 */
985 		if (ret)
986 			(*cleaned)++;
987 	}
988 
989 	mmu_notifier_invalidate_range_end(&range);
990 
991 	return true;
992 }
993 
invalid_mkclean_vma(struct vm_area_struct * vma,void * arg)994 static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
995 {
996 	if (vma->vm_flags & VM_SHARED)
997 		return false;
998 
999 	return true;
1000 }
1001 
page_mkclean(struct page * page)1002 int page_mkclean(struct page *page)
1003 {
1004 	int cleaned = 0;
1005 	struct address_space *mapping;
1006 	struct rmap_walk_control rwc = {
1007 		.arg = (void *)&cleaned,
1008 		.rmap_one = page_mkclean_one,
1009 		.invalid_vma = invalid_mkclean_vma,
1010 	};
1011 
1012 	BUG_ON(!PageLocked(page));
1013 
1014 	if (!page_mapped(page))
1015 		return 0;
1016 
1017 	mapping = page_mapping(page);
1018 	if (!mapping)
1019 		return 0;
1020 
1021 	rmap_walk(page, &rwc);
1022 
1023 	return cleaned;
1024 }
1025 EXPORT_SYMBOL_GPL(page_mkclean);
1026 
1027 /**
1028  * page_move_anon_rmap - move a page to our anon_vma
1029  * @page:	the page to move to our anon_vma
1030  * @vma:	the vma the page belongs to
1031  *
1032  * When a page belongs exclusively to one process after a COW event,
1033  * that page can be moved into the anon_vma that belongs to just that
1034  * process, so the rmap code will not search the parent or sibling
1035  * processes.
1036  */
page_move_anon_rmap(struct page * page,struct vm_area_struct * vma)1037 void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
1038 {
1039 	struct anon_vma *anon_vma = vma->anon_vma;
1040 
1041 	page = compound_head(page);
1042 
1043 	VM_BUG_ON_PAGE(!PageLocked(page), page);
1044 	VM_BUG_ON_VMA(!anon_vma, vma);
1045 
1046 	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
1047 	/*
1048 	 * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
1049 	 * simultaneously, so a concurrent reader (eg page_referenced()'s
1050 	 * PageAnon()) will not see one without the other.
1051 	 */
1052 	WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
1053 }
1054 
1055 /**
1056  * __page_set_anon_rmap - set up new anonymous rmap
1057  * @page:	Page or Hugepage to add to rmap
1058  * @vma:	VM area to add page to.
1059  * @address:	User virtual address of the mapping
1060  * @exclusive:	the page is exclusively owned by the current process
1061  */
__page_set_anon_rmap(struct page * page,struct vm_area_struct * vma,unsigned long address,int exclusive)1062 static void __page_set_anon_rmap(struct page *page,
1063 	struct vm_area_struct *vma, unsigned long address, int exclusive)
1064 {
1065 	struct anon_vma *anon_vma = vma->anon_vma;
1066 
1067 	BUG_ON(!anon_vma);
1068 
1069 	if (PageAnon(page))
1070 		return;
1071 
1072 	/*
1073 	 * If the page isn't exclusively mapped into this vma,
1074 	 * we must use the _oldest_ possible anon_vma for the
1075 	 * page mapping!
1076 	 */
1077 	if (!exclusive)
1078 		anon_vma = anon_vma->root;
1079 
1080 	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
1081 	page->mapping = (struct address_space *) anon_vma;
1082 	page->index = linear_page_index(vma, address);
1083 }
1084 
1085 /**
1086  * __page_check_anon_rmap - sanity check anonymous rmap addition
1087  * @page:	the page to add the mapping to
1088  * @vma:	the vm area in which the mapping is added
1089  * @address:	the user virtual address mapped
1090  */
__page_check_anon_rmap(struct page * page,struct vm_area_struct * vma,unsigned long address)1091 static void __page_check_anon_rmap(struct page *page,
1092 	struct vm_area_struct *vma, unsigned long address)
1093 {
1094 	/*
1095 	 * The page's anon-rmap details (mapping and index) are guaranteed to
1096 	 * be set up correctly at this point.
1097 	 *
1098 	 * We have exclusion against page_add_anon_rmap because the caller
1099 	 * always holds the page locked, except if called from page_dup_rmap,
1100 	 * in which case the page is already known to be setup.
1101 	 *
1102 	 * We have exclusion against page_add_new_anon_rmap because those pages
1103 	 * are initially only visible via the pagetables, and the pte is locked
1104 	 * over the call to page_add_new_anon_rmap.
1105 	 */
1106 	VM_BUG_ON_PAGE(page_anon_vma(page)->root != vma->anon_vma->root, page);
1107 	VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address),
1108 		       page);
1109 }
1110 
1111 /**
1112  * page_add_anon_rmap - add pte mapping to an anonymous page
1113  * @page:	the page to add the mapping to
1114  * @vma:	the vm area in which the mapping is added
1115  * @address:	the user virtual address mapped
1116  * @compound:	charge the page as compound or small page
1117  *
1118  * The caller needs to hold the pte lock, and the page must be locked in
1119  * the anon_vma case: to serialize mapping,index checking after setting,
1120  * and to ensure that PageAnon is not being upgraded racily to PageKsm
1121  * (but PageKsm is never downgraded to PageAnon).
1122  */
page_add_anon_rmap(struct page * page,struct vm_area_struct * vma,unsigned long address,bool compound)1123 void page_add_anon_rmap(struct page *page,
1124 	struct vm_area_struct *vma, unsigned long address, bool compound)
1125 {
1126 	do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0);
1127 }
1128 
1129 /*
1130  * Special version of the above for do_swap_page, which often runs
1131  * into pages that are exclusively owned by the current process.
1132  * Everybody else should continue to use page_add_anon_rmap above.
1133  */
do_page_add_anon_rmap(struct page * page,struct vm_area_struct * vma,unsigned long address,int flags)1134 void do_page_add_anon_rmap(struct page *page,
1135 	struct vm_area_struct *vma, unsigned long address, int flags)
1136 {
1137 	bool compound = flags & RMAP_COMPOUND;
1138 	bool first;
1139 	bool success = false;
1140 
1141 	if (unlikely(PageKsm(page)))
1142 		lock_page_memcg(page);
1143 	else
1144 		VM_BUG_ON_PAGE(!PageLocked(page), page);
1145 
1146 	if (compound) {
1147 		atomic_t *mapcount;
1148 		VM_BUG_ON_PAGE(!PageLocked(page), page);
1149 		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
1150 		mapcount = compound_mapcount_ptr(page);
1151 		first = atomic_inc_and_test(mapcount);
1152 	} else {
1153 		trace_android_vh_update_page_mapcount(page, true, compound,
1154 							&first, &success);
1155 		if (!success)
1156 			first = atomic_inc_and_test(&page->_mapcount);
1157 	}
1158 
1159 	if (first) {
1160 		int nr = compound ? thp_nr_pages(page) : 1;
1161 		/*
1162 		 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
1163 		 * these counters are not modified in interrupt context, and
1164 		 * pte lock(a spinlock) is held, which implies preemption
1165 		 * disabled.
1166 		 */
1167 		if (compound)
1168 			__inc_lruvec_page_state(page, NR_ANON_THPS);
1169 		__mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
1170 	}
1171 
1172 	if (unlikely(PageKsm(page))) {
1173 		unlock_page_memcg(page);
1174 		return;
1175 	}
1176 
1177 	/* address might be in next vma when migration races vma_adjust */
1178 	if (first)
1179 		__page_set_anon_rmap(page, vma, address,
1180 				flags & RMAP_EXCLUSIVE);
1181 	else
1182 		__page_check_anon_rmap(page, vma, address);
1183 }
1184 
1185 /**
1186  * __page_add_new_anon_rmap - add pte mapping to a new anonymous page
1187  * @page:	the page to add the mapping to
1188  * @vma:	the vm area in which the mapping is added
1189  * @address:	the user virtual address mapped
1190  * @compound:	charge the page as compound or small page
1191  *
1192  * Same as page_add_anon_rmap but must only be called on *new* pages.
1193  * This means the inc-and-test can be bypassed.
1194  * Page does not have to be locked.
1195  */
__page_add_new_anon_rmap(struct page * page,struct vm_area_struct * vma,unsigned long address,bool compound)1196 void __page_add_new_anon_rmap(struct page *page,
1197 	struct vm_area_struct *vma, unsigned long address, bool compound)
1198 {
1199 	int nr = compound ? thp_nr_pages(page) : 1;
1200 
1201 	__SetPageSwapBacked(page);
1202 	if (compound) {
1203 		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
1204 		/* increment count (starts at -1) */
1205 		atomic_set(compound_mapcount_ptr(page), 0);
1206 		if (hpage_pincount_available(page))
1207 			atomic_set(compound_pincount_ptr(page), 0);
1208 
1209 		__inc_lruvec_page_state(page, NR_ANON_THPS);
1210 	} else {
1211 		/* Anon THP always mapped first with PMD */
1212 		VM_BUG_ON_PAGE(PageTransCompound(page), page);
1213 		/* increment count (starts at -1) */
1214 		atomic_set(&page->_mapcount, 0);
1215 	}
1216 	__mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
1217 	__page_set_anon_rmap(page, vma, address, 1);
1218 }
1219 
1220 /**
1221  * page_add_file_rmap - add pte mapping to a file page
1222  * @page: the page to add the mapping to
1223  * @compound: charge the page as compound or small page
1224  *
1225  * The caller needs to hold the pte lock.
1226  */
page_add_file_rmap(struct page * page,bool compound)1227 void page_add_file_rmap(struct page *page, bool compound)
1228 {
1229 	int i, nr = 1;
1230 	bool first_mapping;
1231 	bool success = false;
1232 
1233 	VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
1234 	lock_page_memcg(page);
1235 	if (compound && PageTransHuge(page)) {
1236 		for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
1237 			trace_android_vh_update_page_mapcount(&page[i], true,
1238 					compound, &first_mapping, &success);
1239 			if ((success)) {
1240 				if (first_mapping)
1241 					nr++;
1242 			} else {
1243 				if (atomic_inc_and_test(&page[i]._mapcount))
1244 					nr++;
1245 			}
1246 		}
1247 		if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
1248 			goto out;
1249 		if (PageSwapBacked(page))
1250 			__inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
1251 		else
1252 			__inc_node_page_state(page, NR_FILE_PMDMAPPED);
1253 	} else {
1254 		if (PageTransCompound(page) && page_mapping(page)) {
1255 			VM_WARN_ON_ONCE(!PageLocked(page));
1256 
1257 			SetPageDoubleMap(compound_head(page));
1258 			if (PageMlocked(page))
1259 				clear_page_mlock(compound_head(page));
1260 		}
1261 		trace_android_vh_update_page_mapcount(page, true,
1262 					compound, &first_mapping, &success);
1263 		if (success) {
1264 			if (!first_mapping)
1265 				goto out;
1266 		} else {
1267 			if (!atomic_inc_and_test(&page->_mapcount))
1268 				goto out;
1269 		}
1270 	}
1271 	__mod_lruvec_page_state(page, NR_FILE_MAPPED, nr);
1272 out:
1273 	unlock_page_memcg(page);
1274 }
1275 
page_remove_file_rmap(struct page * page,bool compound)1276 static void page_remove_file_rmap(struct page *page, bool compound)
1277 {
1278 	int i, nr = 1;
1279 	bool first_mapping;
1280 	bool success = false;
1281 
1282 	VM_BUG_ON_PAGE(compound && !PageHead(page), page);
1283 
1284 	/* Hugepages are not counted in NR_FILE_MAPPED for now. */
1285 	if (unlikely(PageHuge(page))) {
1286 		/* hugetlb pages are always mapped with pmds */
1287 		atomic_dec(compound_mapcount_ptr(page));
1288 		return;
1289 	}
1290 
1291 	/* page still mapped by someone else? */
1292 	if (compound && PageTransHuge(page)) {
1293 		for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
1294 			trace_android_vh_update_page_mapcount(&page[i], false,
1295 						compound, &first_mapping, &success);
1296 			if (success) {
1297 				if (first_mapping)
1298 					nr++;
1299 			} else {
1300 				if (atomic_add_negative(-1, &page[i]._mapcount))
1301 					nr++;
1302 			}
1303 		}
1304 		if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
1305 			return;
1306 		if (PageSwapBacked(page))
1307 			__dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
1308 		else
1309 			__dec_node_page_state(page, NR_FILE_PMDMAPPED);
1310 	} else {
1311 		trace_android_vh_update_page_mapcount(page, false,
1312 					compound, &first_mapping, &success);
1313 		if (success) {
1314 			if (!first_mapping)
1315 				return;
1316 		} else {
1317 			if (!atomic_add_negative(-1, &page->_mapcount))
1318 				return;
1319 		}
1320 	}
1321 
1322 	/*
1323 	 * We use the irq-unsafe __{inc|mod}_lruvec_page_state because
1324 	 * these counters are not modified in interrupt context, and
1325 	 * pte lock(a spinlock) is held, which implies preemption disabled.
1326 	 */
1327 	__mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr);
1328 
1329 	if (unlikely(PageMlocked(page)))
1330 		clear_page_mlock(page);
1331 }
1332 
page_remove_anon_compound_rmap(struct page * page)1333 static void page_remove_anon_compound_rmap(struct page *page)
1334 {
1335 	int i, nr;
1336 	bool first_mapping;
1337 	bool success = false;
1338 
1339 	if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
1340 		return;
1341 
1342 	/* Hugepages are not counted in NR_ANON_PAGES for now. */
1343 	if (unlikely(PageHuge(page)))
1344 		return;
1345 
1346 	if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
1347 		return;
1348 
1349 	__dec_lruvec_page_state(page, NR_ANON_THPS);
1350 
1351 	if (TestClearPageDoubleMap(page)) {
1352 		/*
1353 		 * Subpages can be mapped with PTEs too. Check how many of
1354 		 * them are still mapped.
1355 		 */
1356 		for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
1357 			trace_android_vh_update_page_mapcount(&page[i], false,
1358 					false, &first_mapping, &success);
1359 			if (success) {
1360 				if (first_mapping)
1361 					nr++;
1362 			} else {
1363 				if (atomic_add_negative(-1, &page[i]._mapcount))
1364 					nr++;
1365 			}
1366 		}
1367 
1368 		/*
1369 		 * Queue the page for deferred split if at least one small
1370 		 * page of the compound page is unmapped, but at least one
1371 		 * small page is still mapped.
1372 		 */
1373 		if (nr && nr < thp_nr_pages(page))
1374 			deferred_split_huge_page(page);
1375 	} else {
1376 		nr = thp_nr_pages(page);
1377 	}
1378 
1379 	if (unlikely(PageMlocked(page)))
1380 		clear_page_mlock(page);
1381 
1382 	if (nr)
1383 		__mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr);
1384 }
1385 
1386 /**
1387  * page_remove_rmap - take down pte mapping from a page
1388  * @page:	page to remove mapping from
1389  * @compound:	uncharge the page as compound or small page
1390  *
1391  * The caller needs to hold the pte lock.
1392  */
page_remove_rmap(struct page * page,bool compound)1393 void page_remove_rmap(struct page *page, bool compound)
1394 {
1395 	bool first_mapping;
1396 	bool success = false;
1397 	lock_page_memcg(page);
1398 
1399 	if (!PageAnon(page)) {
1400 		page_remove_file_rmap(page, compound);
1401 		goto out;
1402 	}
1403 
1404 	if (compound) {
1405 		page_remove_anon_compound_rmap(page);
1406 		goto out;
1407 	}
1408 
1409 	trace_android_vh_update_page_mapcount(page, false,
1410 					compound, &first_mapping, &success);
1411 	if (success) {
1412 		if (!first_mapping)
1413 			goto out;
1414 	} else {
1415 		/* page still mapped by someone else? */
1416 		if (!atomic_add_negative(-1, &page->_mapcount))
1417 			goto out;
1418 	}
1419 	/*
1420 	 * We use the irq-unsafe __{inc|mod}_zone_page_stat because
1421 	 * these counters are not modified in interrupt context, and
1422 	 * pte lock(a spinlock) is held, which implies preemption disabled.
1423 	 */
1424 	__dec_lruvec_page_state(page, NR_ANON_MAPPED);
1425 
1426 	if (unlikely(PageMlocked(page)))
1427 		clear_page_mlock(page);
1428 
1429 	if (PageTransCompound(page))
1430 		deferred_split_huge_page(compound_head(page));
1431 
1432 	/*
1433 	 * It would be tidy to reset the PageAnon mapping here,
1434 	 * but that might overwrite a racing page_add_anon_rmap
1435 	 * which increments mapcount after us but sets mapping
1436 	 * before us: so leave the reset to free_unref_page,
1437 	 * and remember that it's only reliable while mapped.
1438 	 * Leaving it set also helps swapoff to reinstate ptes
1439 	 * faster for those pages still in swapcache.
1440 	 */
1441 out:
1442 	unlock_page_memcg(page);
1443 }
1444 
1445 /*
1446  * @arg: enum ttu_flags will be passed to this argument
1447  */
try_to_unmap_one(struct page * page,struct vm_area_struct * vma,unsigned long address,void * arg)1448 static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1449 		     unsigned long address, void *arg)
1450 {
1451 	struct mm_struct *mm = vma->vm_mm;
1452 	struct page_vma_mapped_walk pvmw = {
1453 		.page = page,
1454 		.vma = vma,
1455 		.address = address,
1456 	};
1457 	pte_t pteval;
1458 	struct page *subpage;
1459 	bool ret = true;
1460 	struct mmu_notifier_range range;
1461 	enum ttu_flags flags = (enum ttu_flags)(long)arg;
1462 
1463 	/*
1464 	 * When racing against e.g. zap_pte_range() on another cpu,
1465 	 * in between its ptep_get_and_clear_full() and page_remove_rmap(),
1466 	 * try_to_unmap() may return false when it is about to become true,
1467 	 * if page table locking is skipped: use TTU_SYNC to wait for that.
1468 	 */
1469 	if (flags & TTU_SYNC)
1470 		pvmw.flags = PVMW_SYNC;
1471 
1472 	/* munlock has nothing to gain from examining un-locked vmas */
1473 	if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
1474 		return true;
1475 
1476 	if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
1477 	    is_zone_device_page(page) && !is_device_private_page(page))
1478 		return true;
1479 
1480 	if (flags & TTU_SPLIT_HUGE_PMD) {
1481 		split_huge_pmd_address(vma, address,
1482 				flags & TTU_SPLIT_FREEZE, page);
1483 	}
1484 
1485 	/*
1486 	 * For THP, we have to assume the worse case ie pmd for invalidation.
1487 	 * For hugetlb, it could be much worse if we need to do pud
1488 	 * invalidation in the case of pmd sharing.
1489 	 *
1490 	 * Note that the page can not be free in this function as call of
1491 	 * try_to_unmap() must hold a reference on the page.
1492 	 */
1493 	range.end = PageKsm(page) ?
1494 			address + PAGE_SIZE : vma_address_end(page, vma);
1495 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
1496 				address, range.end);
1497 	if (PageHuge(page)) {
1498 		/*
1499 		 * If sharing is possible, start and end will be adjusted
1500 		 * accordingly.
1501 		 */
1502 		adjust_range_if_pmd_sharing_possible(vma, &range.start,
1503 						     &range.end);
1504 	}
1505 	mmu_notifier_invalidate_range_start(&range);
1506 
1507 	while (page_vma_mapped_walk(&pvmw)) {
1508 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
1509 		/* PMD-mapped THP migration entry */
1510 		if (!pvmw.pte && (flags & TTU_MIGRATION)) {
1511 			VM_BUG_ON_PAGE(PageHuge(page) || !PageTransCompound(page), page);
1512 
1513 			set_pmd_migration_entry(&pvmw, page);
1514 			continue;
1515 		}
1516 #endif
1517 
1518 		/*
1519 		 * If the page is mlock()d, we cannot swap it out.
1520 		 * If it's recently referenced (perhaps page_referenced
1521 		 * skipped over this mm) then we should reactivate it.
1522 		 */
1523 		if (!(flags & TTU_IGNORE_MLOCK)) {
1524 			if (vma->vm_flags & VM_LOCKED) {
1525 				/* PTE-mapped THP are never mlocked */
1526 				if (!PageTransCompound(page)) {
1527 					/*
1528 					 * Holding pte lock, we do *not* need
1529 					 * mmap_lock here
1530 					 */
1531 					mlock_vma_page(page);
1532 				}
1533 				ret = false;
1534 				page_vma_mapped_walk_done(&pvmw);
1535 				break;
1536 			}
1537 			if (flags & TTU_MUNLOCK)
1538 				continue;
1539 		}
1540 
1541 		/* Unexpected PMD-mapped THP? */
1542 		VM_BUG_ON_PAGE(!pvmw.pte, page);
1543 
1544 		subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
1545 		address = pvmw.address;
1546 
1547 		if (PageHuge(page) && !PageAnon(page)) {
1548 			/*
1549 			 * To call huge_pmd_unshare, i_mmap_rwsem must be
1550 			 * held in write mode.  Caller needs to explicitly
1551 			 * do this outside rmap routines.
1552 			 */
1553 			VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
1554 			if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
1555 				/*
1556 				 * huge_pmd_unshare unmapped an entire PMD
1557 				 * page.  There is no way of knowing exactly
1558 				 * which PMDs may be cached for this mm, so
1559 				 * we must flush them all.  start/end were
1560 				 * already adjusted above to cover this range.
1561 				 */
1562 				flush_cache_range(vma, range.start, range.end);
1563 				flush_tlb_range(vma, range.start, range.end);
1564 				mmu_notifier_invalidate_range(mm, range.start,
1565 							      range.end);
1566 
1567 				/*
1568 				 * The ref count of the PMD page was dropped
1569 				 * which is part of the way map counting
1570 				 * is done for shared PMDs.  Return 'true'
1571 				 * here.  When there is no other sharing,
1572 				 * huge_pmd_unshare returns false and we will
1573 				 * unmap the actual page and drop map count
1574 				 * to zero.
1575 				 */
1576 				page_vma_mapped_walk_done(&pvmw);
1577 				break;
1578 			}
1579 		}
1580 
1581 		if (IS_ENABLED(CONFIG_MIGRATION) &&
1582 		    (flags & TTU_MIGRATION) &&
1583 		    is_zone_device_page(page)) {
1584 			swp_entry_t entry;
1585 			pte_t swp_pte;
1586 
1587 			pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte);
1588 
1589 			/*
1590 			 * Store the pfn of the page in a special migration
1591 			 * pte. do_swap_page() will wait until the migration
1592 			 * pte is removed and then restart fault handling.
1593 			 */
1594 			entry = make_migration_entry(page, 0);
1595 			swp_pte = swp_entry_to_pte(entry);
1596 
1597 			/*
1598 			 * pteval maps a zone device page and is therefore
1599 			 * a swap pte.
1600 			 */
1601 			if (pte_swp_soft_dirty(pteval))
1602 				swp_pte = pte_swp_mksoft_dirty(swp_pte);
1603 			if (pte_swp_uffd_wp(pteval))
1604 				swp_pte = pte_swp_mkuffd_wp(swp_pte);
1605 			set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
1606 			/*
1607 			 * No need to invalidate here it will synchronize on
1608 			 * against the special swap migration pte.
1609 			 *
1610 			 * The assignment to subpage above was computed from a
1611 			 * swap PTE which results in an invalid pointer.
1612 			 * Since only PAGE_SIZE pages can currently be
1613 			 * migrated, just set it to page. This will need to be
1614 			 * changed when hugepage migrations to device private
1615 			 * memory are supported.
1616 			 */
1617 			subpage = page;
1618 			goto discard;
1619 		}
1620 
1621 		/* Nuke the page table entry. */
1622 		flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
1623 		if (should_defer_flush(mm, flags)) {
1624 			/*
1625 			 * We clear the PTE but do not flush so potentially
1626 			 * a remote CPU could still be writing to the page.
1627 			 * If the entry was previously clean then the
1628 			 * architecture must guarantee that a clear->dirty
1629 			 * transition on a cached TLB entry is written through
1630 			 * and traps if the PTE is unmapped.
1631 			 */
1632 			pteval = ptep_get_and_clear(mm, address, pvmw.pte);
1633 
1634 			set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
1635 		} else {
1636 			pteval = ptep_clear_flush(vma, address, pvmw.pte);
1637 		}
1638 
1639 		/* Move the dirty bit to the page. Now the pte is gone. */
1640 		if (pte_dirty(pteval))
1641 			set_page_dirty(page);
1642 
1643 		/* Update high watermark before we lower rss */
1644 		update_hiwater_rss(mm);
1645 
1646 		if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
1647 			pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
1648 			if (PageHuge(page)) {
1649 				hugetlb_count_sub(compound_nr(page), mm);
1650 				set_huge_swap_pte_at(mm, address,
1651 						     pvmw.pte, pteval,
1652 						     vma_mmu_pagesize(vma));
1653 			} else {
1654 				dec_mm_counter(mm, mm_counter(page));
1655 				set_pte_at(mm, address, pvmw.pte, pteval);
1656 			}
1657 
1658 		} else if (pte_unused(pteval) && !userfaultfd_armed(vma)) {
1659 			/*
1660 			 * The guest indicated that the page content is of no
1661 			 * interest anymore. Simply discard the pte, vmscan
1662 			 * will take care of the rest.
1663 			 * A future reference will then fault in a new zero
1664 			 * page. When userfaultfd is active, we must not drop
1665 			 * this page though, as its main user (postcopy
1666 			 * migration) will not expect userfaults on already
1667 			 * copied pages.
1668 			 */
1669 			dec_mm_counter(mm, mm_counter(page));
1670 			/* We have to invalidate as we cleared the pte */
1671 			mmu_notifier_invalidate_range(mm, address,
1672 						      address + PAGE_SIZE);
1673 		} else if (IS_ENABLED(CONFIG_MIGRATION) &&
1674 				(flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) {
1675 			swp_entry_t entry;
1676 			pte_t swp_pte;
1677 
1678 			if (arch_unmap_one(mm, vma, address, pteval) < 0) {
1679 				set_pte_at(mm, address, pvmw.pte, pteval);
1680 				ret = false;
1681 				page_vma_mapped_walk_done(&pvmw);
1682 				break;
1683 			}
1684 
1685 			/*
1686 			 * Store the pfn of the page in a special migration
1687 			 * pte. do_swap_page() will wait until the migration
1688 			 * pte is removed and then restart fault handling.
1689 			 */
1690 			entry = make_migration_entry(subpage,
1691 					pte_write(pteval));
1692 			swp_pte = swp_entry_to_pte(entry);
1693 			if (pte_soft_dirty(pteval))
1694 				swp_pte = pte_swp_mksoft_dirty(swp_pte);
1695 			if (pte_uffd_wp(pteval))
1696 				swp_pte = pte_swp_mkuffd_wp(swp_pte);
1697 			set_pte_at(mm, address, pvmw.pte, swp_pte);
1698 			/*
1699 			 * No need to invalidate here it will synchronize on
1700 			 * against the special swap migration pte.
1701 			 */
1702 		} else if (PageAnon(page)) {
1703 			swp_entry_t entry = { .val = page_private(subpage) };
1704 			pte_t swp_pte;
1705 			/*
1706 			 * Store the swap location in the pte.
1707 			 * See handle_pte_fault() ...
1708 			 */
1709 			if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) {
1710 				WARN_ON_ONCE(1);
1711 				ret = false;
1712 				/* We have to invalidate as we cleared the pte */
1713 				mmu_notifier_invalidate_range(mm, address,
1714 							address + PAGE_SIZE);
1715 				page_vma_mapped_walk_done(&pvmw);
1716 				break;
1717 			}
1718 
1719 			/* MADV_FREE page check */
1720 			if (!PageSwapBacked(page)) {
1721 				int ref_count, map_count;
1722 
1723 				/*
1724 				 * Synchronize with gup_pte_range():
1725 				 * - clear PTE; barrier; read refcount
1726 				 * - inc refcount; barrier; read PTE
1727 				 */
1728 				smp_mb();
1729 
1730 				ref_count = page_ref_count(page);
1731 				map_count = page_mapcount(page);
1732 
1733 				/*
1734 				 * Order reads for page refcount and dirty flag
1735 				 * (see comments in __remove_mapping()).
1736 				 */
1737 				smp_rmb();
1738 
1739 				/*
1740 				 * The only page refs must be one from isolation
1741 				 * plus the rmap(s) (dropped by discard:).
1742 				 */
1743 				if (ref_count == 1 + map_count &&
1744 				    !PageDirty(page)) {
1745 					/* Invalidate as we cleared the pte */
1746 					mmu_notifier_invalidate_range(mm,
1747 						address, address + PAGE_SIZE);
1748 					dec_mm_counter(mm, MM_ANONPAGES);
1749 					goto discard;
1750 				}
1751 
1752 				/*
1753 				 * If the page was redirtied, it cannot be
1754 				 * discarded. Remap the page to page table.
1755 				 */
1756 				set_pte_at(mm, address, pvmw.pte, pteval);
1757 				SetPageSwapBacked(page);
1758 				ret = false;
1759 				page_vma_mapped_walk_done(&pvmw);
1760 				break;
1761 			}
1762 
1763 			if (swap_duplicate(entry) < 0) {
1764 				set_pte_at(mm, address, pvmw.pte, pteval);
1765 				ret = false;
1766 				page_vma_mapped_walk_done(&pvmw);
1767 				break;
1768 			}
1769 			if (arch_unmap_one(mm, vma, address, pteval) < 0) {
1770 				set_pte_at(mm, address, pvmw.pte, pteval);
1771 				ret = false;
1772 				page_vma_mapped_walk_done(&pvmw);
1773 				break;
1774 			}
1775 			if (list_empty(&mm->mmlist)) {
1776 				spin_lock(&mmlist_lock);
1777 				if (list_empty(&mm->mmlist))
1778 					list_add(&mm->mmlist, &init_mm.mmlist);
1779 				spin_unlock(&mmlist_lock);
1780 			}
1781 			dec_mm_counter(mm, MM_ANONPAGES);
1782 			inc_mm_counter(mm, MM_SWAPENTS);
1783 			swp_pte = swp_entry_to_pte(entry);
1784 			if (pte_soft_dirty(pteval))
1785 				swp_pte = pte_swp_mksoft_dirty(swp_pte);
1786 			if (pte_uffd_wp(pteval))
1787 				swp_pte = pte_swp_mkuffd_wp(swp_pte);
1788 			set_pte_at(mm, address, pvmw.pte, swp_pte);
1789 			/* Invalidate as we cleared the pte */
1790 			mmu_notifier_invalidate_range(mm, address,
1791 						      address + PAGE_SIZE);
1792 		} else {
1793 			/*
1794 			 * This is a locked file-backed page, thus it cannot
1795 			 * be removed from the page cache and replaced by a new
1796 			 * page before mmu_notifier_invalidate_range_end, so no
1797 			 * concurrent thread might update its page table to
1798 			 * point at new page while a device still is using this
1799 			 * page.
1800 			 *
1801 			 * See Documentation/vm/mmu_notifier.rst
1802 			 */
1803 			dec_mm_counter(mm, mm_counter_file(page));
1804 		}
1805 discard:
1806 		/*
1807 		 * No need to call mmu_notifier_invalidate_range() it has be
1808 		 * done above for all cases requiring it to happen under page
1809 		 * table lock before mmu_notifier_invalidate_range_end()
1810 		 *
1811 		 * See Documentation/vm/mmu_notifier.rst
1812 		 */
1813 		page_remove_rmap(subpage, PageHuge(page));
1814 		put_page(page);
1815 	}
1816 
1817 	mmu_notifier_invalidate_range_end(&range);
1818 	trace_android_vh_try_to_unmap_one(vma, page, address, ret);
1819 
1820 	return ret;
1821 }
1822 
invalid_migration_vma(struct vm_area_struct * vma,void * arg)1823 static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
1824 {
1825 	return vma_is_temporary_stack(vma);
1826 }
1827 
page_not_mapped(struct page * page)1828 static int page_not_mapped(struct page *page)
1829 {
1830 	return !page_mapped(page);
1831 }
1832 
1833 /**
1834  * try_to_unmap - try to remove all page table mappings to a page
1835  * @page: the page to get unmapped
1836  * @flags: action and flags
1837  *
1838  * Tries to remove all the page table entries which are mapping this
1839  * page, used in the pageout path.  Caller must hold the page lock.
1840  *
1841  * If unmap is successful, return true. Otherwise, false.
1842  */
try_to_unmap(struct page * page,enum ttu_flags flags)1843 bool try_to_unmap(struct page *page, enum ttu_flags flags)
1844 {
1845 	struct rmap_walk_control rwc = {
1846 		.rmap_one = try_to_unmap_one,
1847 		.arg = (void *)flags,
1848 		.done = page_not_mapped,
1849 		.anon_lock = page_lock_anon_vma_read,
1850 	};
1851 
1852 	/*
1853 	 * During exec, a temporary VMA is setup and later moved.
1854 	 * The VMA is moved under the anon_vma lock but not the
1855 	 * page tables leading to a race where migration cannot
1856 	 * find the migration ptes. Rather than increasing the
1857 	 * locking requirements of exec(), migration skips
1858 	 * temporary VMAs until after exec() completes.
1859 	 */
1860 	if ((flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))
1861 	    && !PageKsm(page) && PageAnon(page))
1862 		rwc.invalid_vma = invalid_migration_vma;
1863 
1864 	if (flags & TTU_RMAP_LOCKED)
1865 		rmap_walk_locked(page, &rwc);
1866 	else
1867 		rmap_walk(page, &rwc);
1868 
1869 	/*
1870 	 * When racing against e.g. zap_pte_range() on another cpu,
1871 	 * in between its ptep_get_and_clear_full() and page_remove_rmap(),
1872 	 * try_to_unmap() may return false when it is about to become true,
1873 	 * if page table locking is skipped: use TTU_SYNC to wait for that.
1874 	 */
1875 	return !page_mapcount(page);
1876 }
1877 
1878 /**
1879  * try_to_munlock - try to munlock a page
1880  * @page: the page to be munlocked
1881  *
1882  * Called from munlock code.  Checks all of the VMAs mapping the page
1883  * to make sure nobody else has this page mlocked. The page will be
1884  * returned with PG_mlocked cleared if no other vmas have it mlocked.
1885  */
1886 
try_to_munlock(struct page * page)1887 void try_to_munlock(struct page *page)
1888 {
1889 	struct rmap_walk_control rwc = {
1890 		.rmap_one = try_to_unmap_one,
1891 		.arg = (void *)TTU_MUNLOCK,
1892 		.done = page_not_mapped,
1893 		.anon_lock = page_lock_anon_vma_read,
1894 
1895 	};
1896 
1897 	VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);
1898 	VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
1899 
1900 	rmap_walk(page, &rwc);
1901 }
1902 
__put_anon_vma(struct anon_vma * anon_vma)1903 void __put_anon_vma(struct anon_vma *anon_vma)
1904 {
1905 	struct anon_vma *root = anon_vma->root;
1906 
1907 	anon_vma_free(anon_vma);
1908 	if (root != anon_vma && atomic_dec_and_test(&root->refcount))
1909 		anon_vma_free(root);
1910 }
1911 
rmap_walk_anon_lock(struct page * page,struct rmap_walk_control * rwc)1912 static struct anon_vma *rmap_walk_anon_lock(struct page *page,
1913 					struct rmap_walk_control *rwc)
1914 {
1915 	struct anon_vma *anon_vma;
1916 
1917 	if (rwc->anon_lock)
1918 		return rwc->anon_lock(page, rwc);
1919 
1920 	/*
1921 	 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
1922 	 * because that depends on page_mapped(); but not all its usages
1923 	 * are holding mmap_lock. Users without mmap_lock are required to
1924 	 * take a reference count to prevent the anon_vma disappearing
1925 	 */
1926 	anon_vma = page_anon_vma(page);
1927 	if (!anon_vma)
1928 		return NULL;
1929 
1930 	if (anon_vma_trylock_read(anon_vma))
1931 		goto out;
1932 
1933 	if (rwc->try_lock) {
1934 		anon_vma = NULL;
1935 		rwc->contended = true;
1936 		goto out;
1937 	}
1938 
1939 	anon_vma_lock_read(anon_vma);
1940 out:
1941 	return anon_vma;
1942 }
1943 
1944 /*
1945  * rmap_walk_anon - do something to anonymous page using the object-based
1946  * rmap method
1947  * @page: the page to be handled
1948  * @rwc: control variable according to each walk type
1949  *
1950  * Find all the mappings of a page using the mapping pointer and the vma chains
1951  * contained in the anon_vma struct it points to.
1952  *
1953  * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
1954  * where the page was found will be held for write.  So, we won't recheck
1955  * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
1956  * LOCKED.
1957  */
rmap_walk_anon(struct page * page,struct rmap_walk_control * rwc,bool locked)1958 static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
1959 		bool locked)
1960 {
1961 	struct anon_vma *anon_vma;
1962 	pgoff_t pgoff_start, pgoff_end;
1963 	struct anon_vma_chain *avc;
1964 
1965 	if (locked) {
1966 		anon_vma = page_anon_vma(page);
1967 		/* anon_vma disappear under us? */
1968 		VM_BUG_ON_PAGE(!anon_vma, page);
1969 	} else {
1970 		anon_vma = rmap_walk_anon_lock(page, rwc);
1971 	}
1972 	if (!anon_vma)
1973 		return;
1974 
1975 	pgoff_start = page_to_pgoff(page);
1976 	pgoff_end = pgoff_start + thp_nr_pages(page) - 1;
1977 	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
1978 			pgoff_start, pgoff_end) {
1979 		struct vm_area_struct *vma = avc->vma;
1980 		unsigned long address = vma_address(page, vma);
1981 
1982 		VM_BUG_ON_VMA(address == -EFAULT, vma);
1983 		cond_resched();
1984 
1985 		if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
1986 			continue;
1987 
1988 		if (!rwc->rmap_one(page, vma, address, rwc->arg))
1989 			break;
1990 		if (rwc->done && rwc->done(page))
1991 			break;
1992 	}
1993 
1994 	if (!locked)
1995 		anon_vma_unlock_read(anon_vma);
1996 }
1997 
1998 /*
1999  * rmap_walk_file - do something to file page using the object-based rmap method
2000  * @page: the page to be handled
2001  * @rwc: control variable according to each walk type
2002  *
2003  * Find all the mappings of a page using the mapping pointer and the vma chains
2004  * contained in the address_space struct it points to.
2005  *
2006  * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
2007  * where the page was found will be held for write.  So, we won't recheck
2008  * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
2009  * LOCKED.
2010  */
rmap_walk_file(struct page * page,struct rmap_walk_control * rwc,bool locked)2011 static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
2012 		bool locked)
2013 {
2014 	struct address_space *mapping = page_mapping(page);
2015 	pgoff_t pgoff_start, pgoff_end;
2016 	struct vm_area_struct *vma;
2017 	bool got_lock = false, success = false;
2018 
2019 	/*
2020 	 * The page lock not only makes sure that page->mapping cannot
2021 	 * suddenly be NULLified by truncation, it makes sure that the
2022 	 * structure at mapping cannot be freed and reused yet,
2023 	 * so we can safely take mapping->i_mmap_rwsem.
2024 	 */
2025 	VM_BUG_ON_PAGE(!PageLocked(page), page);
2026 
2027 	if (!mapping)
2028 		return;
2029 
2030 	pgoff_start = page_to_pgoff(page);
2031 	pgoff_end = pgoff_start + thp_nr_pages(page) - 1;
2032 	if (!locked) {
2033 		trace_android_vh_do_page_trylock(page,
2034 					&mapping->i_mmap_rwsem, &got_lock, &success);
2035 		if (success) {
2036 			if (!got_lock)
2037 				return;
2038 		} else {
2039 			if (i_mmap_trylock_read(mapping))
2040 				goto lookup;
2041 
2042 			if (rwc->try_lock) {
2043 				rwc->contended = true;
2044 				return;
2045 			}
2046 
2047 			i_mmap_lock_read(mapping);
2048 		}
2049 	}
2050 lookup:
2051 	vma_interval_tree_foreach(vma, &mapping->i_mmap,
2052 			pgoff_start, pgoff_end) {
2053 		unsigned long address = vma_address(page, vma);
2054 
2055 		VM_BUG_ON_VMA(address == -EFAULT, vma);
2056 		cond_resched();
2057 
2058 		if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
2059 			continue;
2060 
2061 		if (!rwc->rmap_one(page, vma, address, rwc->arg))
2062 			goto done;
2063 		if (rwc->done && rwc->done(page))
2064 			goto done;
2065 	}
2066 
2067 done:
2068 	if (!locked)
2069 		i_mmap_unlock_read(mapping);
2070 }
2071 
rmap_walk(struct page * page,struct rmap_walk_control * rwc)2072 void rmap_walk(struct page *page, struct rmap_walk_control *rwc)
2073 {
2074 	if (unlikely(PageKsm(page)))
2075 		rmap_walk_ksm(page, rwc);
2076 	else if (PageAnon(page))
2077 		rmap_walk_anon(page, rwc, false);
2078 	else
2079 		rmap_walk_file(page, rwc, false);
2080 }
2081 
2082 /* Like rmap_walk, but caller holds relevant rmap lock */
rmap_walk_locked(struct page * page,struct rmap_walk_control * rwc)2083 void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc)
2084 {
2085 	/* no ksm support for now */
2086 	VM_BUG_ON_PAGE(PageKsm(page), page);
2087 	if (PageAnon(page))
2088 		rmap_walk_anon(page, rwc, true);
2089 	else
2090 		rmap_walk_file(page, rwc, true);
2091 }
2092 
2093 #ifdef CONFIG_HUGETLB_PAGE
2094 /*
2095  * The following two functions are for anonymous (private mapped) hugepages.
2096  * Unlike common anonymous pages, anonymous hugepages have no accounting code
2097  * and no lru code, because we handle hugepages differently from common pages.
2098  */
hugepage_add_anon_rmap(struct page * page,struct vm_area_struct * vma,unsigned long address)2099 void hugepage_add_anon_rmap(struct page *page,
2100 			    struct vm_area_struct *vma, unsigned long address)
2101 {
2102 	struct anon_vma *anon_vma = vma->anon_vma;
2103 	int first;
2104 
2105 	BUG_ON(!PageLocked(page));
2106 	BUG_ON(!anon_vma);
2107 	/* address might be in next vma when migration races vma_adjust */
2108 	first = atomic_inc_and_test(compound_mapcount_ptr(page));
2109 	if (first)
2110 		__page_set_anon_rmap(page, vma, address, 0);
2111 }
2112 
hugepage_add_new_anon_rmap(struct page * page,struct vm_area_struct * vma,unsigned long address)2113 void hugepage_add_new_anon_rmap(struct page *page,
2114 			struct vm_area_struct *vma, unsigned long address)
2115 {
2116 	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
2117 	atomic_set(compound_mapcount_ptr(page), 0);
2118 	if (hpage_pincount_available(page))
2119 		atomic_set(compound_pincount_ptr(page), 0);
2120 
2121 	__page_set_anon_rmap(page, vma, address, 1);
2122 }
2123 #endif /* CONFIG_HUGETLB_PAGE */
2124