• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  linux/mm/swap.c
4  *
5  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
6  */
7 
8 /*
9  * This file contains the default values for the operation of the
10  * Linux VM subsystem. Fine-tuning documentation can be found in
11  * Documentation/admin-guide/sysctl/vm.rst.
12  * Started 18.12.91
13  * Swap aging added 23.2.95, Stephen Tweedie.
14  * Buffermem limits added 12.3.98, Rik van Riel.
15  */
16 
17 #include <linux/mm.h>
18 #include <linux/sched.h>
19 #include <linux/kernel_stat.h>
20 #include <linux/swap.h>
21 #include <linux/mman.h>
22 #include <linux/pagemap.h>
23 #include <linux/pagevec.h>
24 #include <linux/init.h>
25 #include <linux/export.h>
26 #include <linux/mm_inline.h>
27 #include <linux/percpu_counter.h>
28 #include <linux/memremap.h>
29 #include <linux/percpu.h>
30 #include <linux/cpu.h>
31 #include <linux/notifier.h>
32 #include <linux/backing-dev.h>
33 #include <linux/memcontrol.h>
34 #include <linux/gfp.h>
35 #include <linux/uio.h>
36 #include <linux/hugetlb.h>
37 #include <linux/page_idle.h>
38 #include <linux/local_lock.h>
39 #include <linux/buffer_head.h>
40 
41 #include "internal.h"
42 
43 #define CREATE_TRACE_POINTS
44 #include <trace/events/pagemap.h>
45 
46 /* How many pages do we try to swap or page in/out together? */
47 int page_cluster;
48 
49 /* Protecting only lru_rotate.pvec which requires disabling interrupts */
50 struct lru_rotate {
51 	local_lock_t lock;
52 	struct pagevec pvec;
53 };
54 static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = {
55 	.lock = INIT_LOCAL_LOCK(lock),
56 };
57 
58 /*
59  * The following struct pagevec are grouped together because they are protected
60  * by disabling preemption (and interrupts remain enabled).
61  */
62 struct lru_pvecs {
63 	local_lock_t lock;
64 	struct pagevec lru_add;
65 	struct pagevec lru_deactivate_file;
66 	struct pagevec lru_deactivate;
67 	struct pagevec lru_lazyfree;
68 	struct pagevec lru_lazyfree_movetail;
69 #ifdef CONFIG_SMP
70 	struct pagevec activate_page;
71 #endif
72 };
73 static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
74 	.lock = INIT_LOCAL_LOCK(lock),
75 };
76 
77 /*
78  * This path almost never happens for VM activity - pages are normally
79  * freed via pagevecs.  But it gets used by networking.
80  */
__page_cache_release(struct page * page)81 static void __page_cache_release(struct page *page)
82 {
83 	if (PageLRU(page)) {
84 		pg_data_t *pgdat = page_pgdat(page);
85 		struct lruvec *lruvec;
86 		unsigned long flags;
87 
88 		spin_lock_irqsave(&pgdat->lru_lock, flags);
89 		lruvec = mem_cgroup_page_lruvec(page, pgdat);
90 		VM_BUG_ON_PAGE(!PageLRU(page), page);
91 		__ClearPageLRU(page);
92 		del_page_from_lru_list(page, lruvec, page_off_lru(page));
93 		spin_unlock_irqrestore(&pgdat->lru_lock, flags);
94 	}
95 	__ClearPageWaiters(page);
96 }
97 
__put_single_page(struct page * page)98 static void __put_single_page(struct page *page)
99 {
100 	__page_cache_release(page);
101 	mem_cgroup_uncharge(page);
102 	free_unref_page(page);
103 }
104 
__put_compound_page(struct page * page)105 static void __put_compound_page(struct page *page)
106 {
107 	/*
108 	 * __page_cache_release() is supposed to be called for thp, not for
109 	 * hugetlb. This is because hugetlb page does never have PageLRU set
110 	 * (it's never listed to any LRU lists) and no memcg routines should
111 	 * be called for hugetlb (it has a separate hugetlb_cgroup.)
112 	 */
113 	if (!PageHuge(page))
114 		__page_cache_release(page);
115 	destroy_compound_page(page);
116 }
117 
__put_page(struct page * page)118 void __put_page(struct page *page)
119 {
120 	if (is_zone_device_page(page)) {
121 		put_dev_pagemap(page->pgmap);
122 
123 		/*
124 		 * The page belongs to the device that created pgmap. Do
125 		 * not return it to page allocator.
126 		 */
127 		return;
128 	}
129 
130 	if (unlikely(PageCompound(page)))
131 		__put_compound_page(page);
132 	else
133 		__put_single_page(page);
134 }
135 EXPORT_SYMBOL(__put_page);
136 
137 /**
138  * put_pages_list() - release a list of pages
139  * @pages: list of pages threaded on page->lru
140  *
141  * Release a list of pages which are strung together on page.lru.  Currently
142  * used by read_cache_pages() and related error recovery code.
143  */
put_pages_list(struct list_head * pages)144 void put_pages_list(struct list_head *pages)
145 {
146 	while (!list_empty(pages)) {
147 		struct page *victim;
148 
149 		victim = lru_to_page(pages);
150 		list_del(&victim->lru);
151 		put_page(victim);
152 	}
153 }
154 EXPORT_SYMBOL(put_pages_list);
155 
156 /*
157  * get_kernel_pages() - pin kernel pages in memory
158  * @kiov:	An array of struct kvec structures
159  * @nr_segs:	number of segments to pin
160  * @write:	pinning for read/write, currently ignored
161  * @pages:	array that receives pointers to the pages pinned.
162  *		Should be at least nr_segs long.
163  *
164  * Returns number of pages pinned. This may be fewer than the number
165  * requested. If nr_pages is 0 or negative, returns 0. If no pages
166  * were pinned, returns -errno. Each page returned must be released
167  * with a put_page() call when it is finished with.
168  */
get_kernel_pages(const struct kvec * kiov,int nr_segs,int write,struct page ** pages)169 int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
170 		struct page **pages)
171 {
172 	int seg;
173 
174 	for (seg = 0; seg < nr_segs; seg++) {
175 		if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
176 			return seg;
177 
178 		pages[seg] = kmap_to_page(kiov[seg].iov_base);
179 		get_page(pages[seg]);
180 	}
181 
182 	return seg;
183 }
184 EXPORT_SYMBOL_GPL(get_kernel_pages);
185 
186 /*
187  * get_kernel_page() - pin a kernel page in memory
188  * @start:	starting kernel address
189  * @write:	pinning for read/write, currently ignored
190  * @pages:	array that receives pointer to the page pinned.
191  *		Must be at least nr_segs long.
192  *
193  * Returns 1 if page is pinned. If the page was not pinned, returns
194  * -errno. The page returned must be released with a put_page() call
195  * when it is finished with.
196  */
get_kernel_page(unsigned long start,int write,struct page ** pages)197 int get_kernel_page(unsigned long start, int write, struct page **pages)
198 {
199 	const struct kvec kiov = {
200 		.iov_base = (void *)start,
201 		.iov_len = PAGE_SIZE
202 	};
203 
204 	return get_kernel_pages(&kiov, 1, write, pages);
205 }
206 EXPORT_SYMBOL_GPL(get_kernel_page);
207 
pagevec_lru_move_fn(struct pagevec * pvec,void (* move_fn)(struct page * page,struct lruvec * lruvec,void * arg),void * arg)208 static void pagevec_lru_move_fn(struct pagevec *pvec,
209 	void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
210 	void *arg)
211 {
212 	int i;
213 	struct pglist_data *pgdat = NULL;
214 	struct lruvec *lruvec;
215 	unsigned long flags = 0;
216 
217 	for (i = 0; i < pagevec_count(pvec); i++) {
218 		struct page *page = pvec->pages[i];
219 		struct pglist_data *pagepgdat = page_pgdat(page);
220 
221 		if (pagepgdat != pgdat) {
222 			if (pgdat)
223 				spin_unlock_irqrestore(&pgdat->lru_lock, flags);
224 			pgdat = pagepgdat;
225 			spin_lock_irqsave(&pgdat->lru_lock, flags);
226 		}
227 
228 		lruvec = mem_cgroup_page_lruvec(page, pgdat);
229 		(*move_fn)(page, lruvec, arg);
230 	}
231 	if (pgdat)
232 		spin_unlock_irqrestore(&pgdat->lru_lock, flags);
233 	release_pages(pvec->pages, pvec->nr);
234 	pagevec_reinit(pvec);
235 }
236 
pagevec_move_tail_fn(struct page * page,struct lruvec * lruvec,void * arg)237 static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
238 				 void *arg)
239 {
240 	int *pgmoved = arg;
241 
242 	if (PageLRU(page) && !PageUnevictable(page)) {
243 		del_page_from_lru_list(page, lruvec, page_lru(page));
244 		ClearPageActive(page);
245 		add_page_to_lru_list_tail(page, lruvec, page_lru(page));
246 		(*pgmoved) += thp_nr_pages(page);
247 	}
248 }
249 
250 /*
251  * pagevec_move_tail() must be called with IRQ disabled.
252  * Otherwise this may cause nasty races.
253  */
pagevec_move_tail(struct pagevec * pvec)254 static void pagevec_move_tail(struct pagevec *pvec)
255 {
256 	int pgmoved = 0;
257 
258 	pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
259 	__count_vm_events(PGROTATED, pgmoved);
260 }
261 
262 /* return true if pagevec needs to drain */
pagevec_add_and_need_flush(struct pagevec * pvec,struct page * page)263 static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page)
264 {
265 	bool ret = false;
266 
267 	if (!pagevec_add(pvec, page) || PageCompound(page) ||
268 			lru_cache_disabled())
269 		ret = true;
270 
271 	return ret;
272 }
273 
274 /*
275  * Writeback is about to end against a page which has been marked for immediate
276  * reclaim.  If it still appears to be reclaimable, move it to the tail of the
277  * inactive list.
278  */
rotate_reclaimable_page(struct page * page)279 void rotate_reclaimable_page(struct page *page)
280 {
281 	if (!PageLocked(page) && !PageDirty(page) &&
282 	    !PageUnevictable(page) && PageLRU(page)) {
283 		struct pagevec *pvec;
284 		unsigned long flags;
285 
286 		get_page(page);
287 		local_lock_irqsave(&lru_rotate.lock, flags);
288 		pvec = this_cpu_ptr(&lru_rotate.pvec);
289 		if (pagevec_add_and_need_flush(pvec, page))
290 			pagevec_move_tail(pvec);
291 		local_unlock_irqrestore(&lru_rotate.lock, flags);
292 	}
293 }
294 
lru_note_cost(struct lruvec * lruvec,bool file,unsigned int nr_pages)295 void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
296 {
297 	do {
298 		unsigned long lrusize;
299 
300 		/* Record cost event */
301 		if (file)
302 			lruvec->file_cost += nr_pages;
303 		else
304 			lruvec->anon_cost += nr_pages;
305 
306 		/*
307 		 * Decay previous events
308 		 *
309 		 * Because workloads change over time (and to avoid
310 		 * overflow) we keep these statistics as a floating
311 		 * average, which ends up weighing recent refaults
312 		 * more than old ones.
313 		 */
314 		lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) +
315 			  lruvec_page_state(lruvec, NR_ACTIVE_ANON) +
316 			  lruvec_page_state(lruvec, NR_INACTIVE_FILE) +
317 			  lruvec_page_state(lruvec, NR_ACTIVE_FILE);
318 
319 		if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) {
320 			lruvec->file_cost /= 2;
321 			lruvec->anon_cost /= 2;
322 		}
323 	} while ((lruvec = parent_lruvec(lruvec)));
324 }
325 
lru_note_cost_page(struct page * page)326 void lru_note_cost_page(struct page *page)
327 {
328 	lru_note_cost(mem_cgroup_page_lruvec(page, page_pgdat(page)),
329 		      page_is_file_lru(page), thp_nr_pages(page));
330 }
331 
__activate_page(struct page * page,struct lruvec * lruvec,void * arg)332 static void __activate_page(struct page *page, struct lruvec *lruvec,
333 			    void *arg)
334 {
335 	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
336 		int lru = page_lru_base_type(page);
337 		int nr_pages = thp_nr_pages(page);
338 
339 		del_page_from_lru_list(page, lruvec, lru);
340 		SetPageActive(page);
341 		lru += LRU_ACTIVE;
342 		add_page_to_lru_list(page, lruvec, lru);
343 		trace_mm_lru_activate(page);
344 
345 		__count_vm_events(PGACTIVATE, nr_pages);
346 		__count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE,
347 				     nr_pages);
348 	}
349 }
350 
351 #ifdef CONFIG_SMP
activate_page_drain(int cpu)352 static void activate_page_drain(int cpu)
353 {
354 	struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu);
355 
356 	if (pagevec_count(pvec))
357 		pagevec_lru_move_fn(pvec, __activate_page, NULL);
358 }
359 
need_activate_page_drain(int cpu)360 static bool need_activate_page_drain(int cpu)
361 {
362 	return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0;
363 }
364 
activate_page(struct page * page)365 static void activate_page(struct page *page)
366 {
367 	page = compound_head(page);
368 	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
369 		struct pagevec *pvec;
370 
371 		local_lock(&lru_pvecs.lock);
372 		pvec = this_cpu_ptr(&lru_pvecs.activate_page);
373 		get_page(page);
374 		if (pagevec_add_and_need_flush(pvec, page))
375 			pagevec_lru_move_fn(pvec, __activate_page, NULL);
376 		local_unlock(&lru_pvecs.lock);
377 	}
378 }
379 
380 #else
activate_page_drain(int cpu)381 static inline void activate_page_drain(int cpu)
382 {
383 }
384 
activate_page(struct page * page)385 static void activate_page(struct page *page)
386 {
387 	pg_data_t *pgdat = page_pgdat(page);
388 
389 	page = compound_head(page);
390 	spin_lock_irq(&pgdat->lru_lock);
391 	__activate_page(page, mem_cgroup_page_lruvec(page, pgdat), NULL);
392 	spin_unlock_irq(&pgdat->lru_lock);
393 }
394 #endif
395 
__lru_cache_activate_page(struct page * page)396 static void __lru_cache_activate_page(struct page *page)
397 {
398 	struct pagevec *pvec;
399 	int i;
400 
401 	local_lock(&lru_pvecs.lock);
402 	pvec = this_cpu_ptr(&lru_pvecs.lru_add);
403 
404 	/*
405 	 * Search backwards on the optimistic assumption that the page being
406 	 * activated has just been added to this pagevec. Note that only
407 	 * the local pagevec is examined as a !PageLRU page could be in the
408 	 * process of being released, reclaimed, migrated or on a remote
409 	 * pagevec that is currently being drained. Furthermore, marking
410 	 * a remote pagevec's page PageActive potentially hits a race where
411 	 * a page is marked PageActive just after it is added to the inactive
412 	 * list causing accounting errors and BUG_ON checks to trigger.
413 	 */
414 	for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
415 		struct page *pagevec_page = pvec->pages[i];
416 
417 		if (pagevec_page == page) {
418 			SetPageActive(page);
419 			break;
420 		}
421 	}
422 
423 	local_unlock(&lru_pvecs.lock);
424 }
425 
426 /*
427  * Mark a page as having seen activity.
428  *
429  * inactive,unreferenced	->	inactive,referenced
430  * inactive,referenced		->	active,unreferenced
431  * active,unreferenced		->	active,referenced
432  *
433  * When a newly allocated page is not yet visible, so safe for non-atomic ops,
434  * __SetPageReferenced(page) may be substituted for mark_page_accessed(page).
435  */
mark_page_accessed(struct page * page)436 void mark_page_accessed(struct page *page)
437 {
438 	page = compound_head(page);
439 
440 	trace_android_vh_mark_page_accessed(page);
441 	if (!PageReferenced(page)) {
442 		SetPageReferenced(page);
443 	} else if (PageUnevictable(page)) {
444 		/*
445 		 * Unevictable pages are on the "LRU_UNEVICTABLE" list. But,
446 		 * this list is never rotated or maintained, so marking an
447 		 * evictable page accessed has no effect.
448 		 */
449 	} else if (!PageActive(page)) {
450 		/*
451 		 * If the page is on the LRU, queue it for activation via
452 		 * lru_pvecs.activate_page. Otherwise, assume the page is on a
453 		 * pagevec, mark it active and it'll be moved to the active
454 		 * LRU on the next drain.
455 		 */
456 		if (PageLRU(page))
457 			activate_page(page);
458 		else
459 			__lru_cache_activate_page(page);
460 		ClearPageReferenced(page);
461 		workingset_activation(page);
462 	}
463 	if (page_is_idle(page))
464 		clear_page_idle(page);
465 }
466 EXPORT_SYMBOL(mark_page_accessed);
467 
468 /**
469  * lru_cache_add - add a page to a page list
470  * @page: the page to be added to the LRU.
471  *
472  * Queue the page for addition to the LRU via pagevec. The decision on whether
473  * to add the page to the [in]active [file|anon] list is deferred until the
474  * pagevec is drained. This gives a chance for the caller of lru_cache_add()
475  * have the page added to the active list using mark_page_accessed().
476  */
lru_cache_add(struct page * page)477 void lru_cache_add(struct page *page)
478 {
479 	struct pagevec *pvec;
480 
481 	VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
482 	VM_BUG_ON_PAGE(PageLRU(page), page);
483 
484 	get_page(page);
485 	local_lock(&lru_pvecs.lock);
486 	pvec = this_cpu_ptr(&lru_pvecs.lru_add);
487 	if (pagevec_add_and_need_flush(pvec, page))
488 		__pagevec_lru_add(pvec);
489 	local_unlock(&lru_pvecs.lock);
490 }
491 EXPORT_SYMBOL(lru_cache_add);
492 
493 /**
494  * lru_cache_add_inactive_or_unevictable
495  * @page:  the page to be added to LRU
496  * @vma:   vma in which page is mapped for determining reclaimability
497  *
498  * Place @page on the inactive or unevictable LRU list, depending on its
499  * evictability.
500  */
__lru_cache_add_inactive_or_unevictable(struct page * page,unsigned long vma_flags)501 void __lru_cache_add_inactive_or_unevictable(struct page *page,
502 					 unsigned long vma_flags)
503 {
504 	bool unevictable;
505 
506 	VM_BUG_ON_PAGE(PageLRU(page), page);
507 
508 	unevictable = (vma_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED;
509 	if (unlikely(unevictable) && !TestSetPageMlocked(page)) {
510 		int nr_pages = thp_nr_pages(page);
511 		/*
512 		 * We use the irq-unsafe __mod_zone_page_stat because this
513 		 * counter is not modified from interrupt context, and the pte
514 		 * lock is held(spinlock), which implies preemption disabled.
515 		 */
516 		__mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
517 		count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
518 	}
519 	lru_cache_add(page);
520 }
521 
522 /*
523  * If the page can not be invalidated, it is moved to the
524  * inactive list to speed up its reclaim.  It is moved to the
525  * head of the list, rather than the tail, to give the flusher
526  * threads some time to write it out, as this is much more
527  * effective than the single-page writeout from reclaim.
528  *
529  * If the page isn't page_mapped and dirty/writeback, the page
530  * could reclaim asap using PG_reclaim.
531  *
532  * 1. active, mapped page -> none
533  * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
534  * 3. inactive, mapped page -> none
535  * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
536  * 5. inactive, clean -> inactive, tail
537  * 6. Others -> none
538  *
539  * In 4, why it moves inactive's head, the VM expects the page would
540  * be write it out by flusher threads as this is much more effective
541  * than the single-page writeout from reclaim.
542  */
lru_deactivate_file_fn(struct page * page,struct lruvec * lruvec,void * arg)543 static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
544 			      void *arg)
545 {
546 	int lru;
547 	bool active;
548 	int nr_pages = thp_nr_pages(page);
549 
550 	if (!PageLRU(page))
551 		return;
552 
553 	if (PageUnevictable(page))
554 		return;
555 
556 	/* Some processes are using the page */
557 	if (page_mapped(page))
558 		return;
559 
560 	active = PageActive(page);
561 	lru = page_lru_base_type(page);
562 
563 	del_page_from_lru_list(page, lruvec, lru + active);
564 	ClearPageActive(page);
565 	ClearPageReferenced(page);
566 
567 	if (PageWriteback(page) || PageDirty(page)) {
568 		/*
569 		 * PG_reclaim could be raced with end_page_writeback
570 		 * It can make readahead confusing.  But race window
571 		 * is _really_ small and  it's non-critical problem.
572 		 */
573 		add_page_to_lru_list(page, lruvec, lru);
574 		SetPageReclaim(page);
575 	} else {
576 		/*
577 		 * The page's writeback ends up during pagevec
578 		 * We moves tha page into tail of inactive.
579 		 */
580 		add_page_to_lru_list_tail(page, lruvec, lru);
581 		__count_vm_events(PGROTATED, nr_pages);
582 	}
583 
584 	if (active) {
585 		__count_vm_events(PGDEACTIVATE, nr_pages);
586 		__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
587 				     nr_pages);
588 	}
589 }
590 
lru_deactivate_fn(struct page * page,struct lruvec * lruvec,void * arg)591 static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
592 			    void *arg)
593 {
594 	if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
595 		int lru = page_lru_base_type(page);
596 		int nr_pages = thp_nr_pages(page);
597 
598 		del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
599 		ClearPageActive(page);
600 		ClearPageReferenced(page);
601 		add_page_to_lru_list(page, lruvec, lru);
602 
603 		__count_vm_events(PGDEACTIVATE, nr_pages);
604 		__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
605 				     nr_pages);
606 	}
607 }
608 
lru_lazyfree_fn(struct page * page,struct lruvec * lruvec,void * arg)609 static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
610 			    void *arg)
611 {
612 	if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
613 	    !PageSwapCache(page) && !PageUnevictable(page)) {
614 		bool active = PageActive(page);
615 		int nr_pages = thp_nr_pages(page);
616 
617 		del_page_from_lru_list(page, lruvec,
618 				       LRU_INACTIVE_ANON + active);
619 		ClearPageActive(page);
620 		ClearPageReferenced(page);
621 		/*
622 		 * Lazyfree pages are clean anonymous pages.  They have
623 		 * PG_swapbacked flag cleared, to distinguish them from normal
624 		 * anonymous pages
625 		 */
626 		ClearPageSwapBacked(page);
627 		add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);
628 
629 		__count_vm_events(PGLAZYFREE, nr_pages);
630 		__count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE,
631 				     nr_pages);
632 	}
633 }
634 
lru_lazyfree_movetail_fn(struct page * page,struct lruvec * lruvec,void * arg)635 static void lru_lazyfree_movetail_fn(struct page *page, struct lruvec *lruvec,
636 			    void *arg)
637 {
638 	bool *add_to_tail = (bool *)arg;
639 
640 	if (PageLRU(page) && !PageUnevictable(page) && PageSwapBacked(page) &&
641 		!PageSwapCache(page)) {
642 		bool active = PageActive(page);
643 
644 		del_page_from_lru_list(page, lruvec,
645 				       LRU_INACTIVE_ANON + active);
646 		ClearPageActive(page);
647 		ClearPageReferenced(page);
648 		if (add_to_tail && *add_to_tail)
649 			add_page_to_lru_list_tail(page, lruvec, LRU_INACTIVE_FILE);
650 		else
651 			add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);
652 	}
653 }
654 
655 /*
656  * Drain pages out of the cpu's pagevecs.
657  * Either "cpu" is the current CPU, and preemption has already been
658  * disabled; or "cpu" is being hot-unplugged, and is already dead.
659  */
lru_add_drain_cpu(int cpu)660 void lru_add_drain_cpu(int cpu)
661 {
662 	struct pagevec *pvec = &per_cpu(lru_pvecs.lru_add, cpu);
663 
664 	if (pagevec_count(pvec))
665 		__pagevec_lru_add(pvec);
666 
667 	pvec = &per_cpu(lru_rotate.pvec, cpu);
668 	/* Disabling interrupts below acts as a compiler barrier. */
669 	if (data_race(pagevec_count(pvec))) {
670 		unsigned long flags;
671 
672 		/* No harm done if a racing interrupt already did this */
673 		local_lock_irqsave(&lru_rotate.lock, flags);
674 		pagevec_move_tail(pvec);
675 		local_unlock_irqrestore(&lru_rotate.lock, flags);
676 	}
677 
678 	pvec = &per_cpu(lru_pvecs.lru_deactivate_file, cpu);
679 	if (pagevec_count(pvec))
680 		pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
681 
682 	pvec = &per_cpu(lru_pvecs.lru_deactivate, cpu);
683 	if (pagevec_count(pvec))
684 		pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
685 
686 	pvec = &per_cpu(lru_pvecs.lru_lazyfree, cpu);
687 	if (pagevec_count(pvec))
688 		pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
689 
690 	pvec = &per_cpu(lru_pvecs.lru_lazyfree_movetail, cpu);
691 	if (pagevec_count(pvec))
692 		pagevec_lru_move_fn(pvec, lru_lazyfree_movetail_fn, NULL);
693 
694 	activate_page_drain(cpu);
695 }
696 
697 /**
698  * deactivate_file_page - forcefully deactivate a file page
699  * @page: page to deactivate
700  *
701  * This function hints the VM that @page is a good reclaim candidate,
702  * for example if its invalidation fails due to the page being dirty
703  * or under writeback.
704  */
deactivate_file_page(struct page * page)705 void deactivate_file_page(struct page *page)
706 {
707 	/*
708 	 * In a workload with many unevictable page such as mprotect,
709 	 * unevictable page deactivation for accelerating reclaim is pointless.
710 	 */
711 	if (PageUnevictable(page))
712 		return;
713 
714 	if (likely(get_page_unless_zero(page))) {
715 		struct pagevec *pvec;
716 
717 		local_lock(&lru_pvecs.lock);
718 		pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file);
719 
720 		if (pagevec_add_and_need_flush(pvec, page))
721 			pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
722 		local_unlock(&lru_pvecs.lock);
723 	}
724 }
725 
726 /*
727  * deactivate_page - deactivate a page
728  * @page: page to deactivate
729  *
730  * deactivate_page() moves @page to the inactive list if @page was on the active
731  * list and was not an unevictable page.  This is done to accelerate the reclaim
732  * of @page.
733  */
deactivate_page(struct page * page)734 void deactivate_page(struct page *page)
735 {
736 	if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
737 		struct pagevec *pvec;
738 
739 		local_lock(&lru_pvecs.lock);
740 		pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate);
741 		get_page(page);
742 		if (pagevec_add_and_need_flush(pvec, page))
743 			pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
744 		local_unlock(&lru_pvecs.lock);
745 	}
746 }
747 
748 /**
749  * mark_page_lazyfree - make an anon page lazyfree
750  * @page: page to deactivate
751  *
752  * mark_page_lazyfree() moves @page to the inactive file list.
753  * This is done to accelerate the reclaim of @page.
754  */
mark_page_lazyfree(struct page * page)755 void mark_page_lazyfree(struct page *page)
756 {
757 	if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
758 	    !PageSwapCache(page) && !PageUnevictable(page)) {
759 		struct pagevec *pvec;
760 
761 		local_lock(&lru_pvecs.lock);
762 		pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree);
763 		get_page(page);
764 		if (pagevec_add_and_need_flush(pvec, page))
765 			pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
766 		local_unlock(&lru_pvecs.lock);
767 	}
768 }
769 
770 /**
771  * mark_page_lazyfree_movetail - make a swapbacked page lazyfree
772  * @page: page to deactivate
773  *
774  * mark_page_lazyfree_movetail() moves @page to the tail of inactive file list.
775  * This is done to accelerate the reclaim of @page.
776  */
mark_page_lazyfree_movetail(struct page * page,bool tail)777 void mark_page_lazyfree_movetail(struct page *page, bool tail)
778 {
779 	if (PageLRU(page) && !PageUnevictable(page) && PageSwapBacked(page) &&
780 		!PageSwapCache(page)) {
781 		struct pagevec *pvec;
782 
783 		local_lock(&lru_pvecs.lock);
784 		pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree_movetail);
785 		get_page(page);
786 		if (pagevec_add_and_need_flush(pvec, page))
787 			pagevec_lru_move_fn(pvec,
788 					lru_lazyfree_movetail_fn, &tail);
789 		local_unlock(&lru_pvecs.lock);
790 	}
791 }
792 
lru_add_drain(void)793 void lru_add_drain(void)
794 {
795 	local_lock(&lru_pvecs.lock);
796 	lru_add_drain_cpu(smp_processor_id());
797 	local_unlock(&lru_pvecs.lock);
798 }
799 
800 /*
801  * It's called from per-cpu workqueue context in SMP case so
802  * lru_add_drain_cpu and invalidate_bh_lrus_cpu should run on
803  * the same cpu. It shouldn't be a problem in !SMP case since
804  * the core is only one and the locks will disable preemption.
805  */
lru_add_and_bh_lrus_drain(void)806 static void lru_add_and_bh_lrus_drain(void)
807 {
808 	local_lock(&lru_pvecs.lock);
809 	lru_add_drain_cpu(smp_processor_id());
810 	local_unlock(&lru_pvecs.lock);
811 	invalidate_bh_lrus_cpu();
812 }
813 
lru_add_drain_cpu_zone(struct zone * zone)814 void lru_add_drain_cpu_zone(struct zone *zone)
815 {
816 	local_lock(&lru_pvecs.lock);
817 	lru_add_drain_cpu(smp_processor_id());
818 	drain_local_pages(zone);
819 	local_unlock(&lru_pvecs.lock);
820 }
821 
822 #ifdef CONFIG_SMP
823 
824 static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
825 
lru_add_drain_per_cpu(struct work_struct * dummy)826 static void lru_add_drain_per_cpu(struct work_struct *dummy)
827 {
828 	lru_add_and_bh_lrus_drain();
829 }
830 
831 /*
832  * Doesn't need any cpu hotplug locking because we do rely on per-cpu
833  * kworkers being shut down before our page_alloc_cpu_dead callback is
834  * executed on the offlined cpu.
835  * Calling this function with cpu hotplug locks held can actually lead
836  * to obscure indirect dependencies via WQ context.
837  */
__lru_add_drain_all(bool force_all_cpus)838 inline void __lru_add_drain_all(bool force_all_cpus)
839 {
840 	/*
841 	 * lru_drain_gen - Global pages generation number
842 	 *
843 	 * (A) Definition: global lru_drain_gen = x implies that all generations
844 	 *     0 < n <= x are already *scheduled* for draining.
845 	 *
846 	 * This is an optimization for the highly-contended use case where a
847 	 * user space workload keeps constantly generating a flow of pages for
848 	 * each CPU.
849 	 */
850 	static unsigned int lru_drain_gen;
851 	static struct cpumask has_work;
852 	static DEFINE_MUTEX(lock);
853 	unsigned cpu, this_gen;
854 
855 	/*
856 	 * Make sure nobody triggers this path before mm_percpu_wq is fully
857 	 * initialized.
858 	 */
859 	if (WARN_ON(!mm_percpu_wq))
860 		return;
861 
862 	/*
863 	 * Guarantee pagevec counter stores visible by this CPU are visible to
864 	 * other CPUs before loading the current drain generation.
865 	 */
866 	smp_mb();
867 
868 	/*
869 	 * (B) Locally cache global LRU draining generation number
870 	 *
871 	 * The read barrier ensures that the counter is loaded before the mutex
872 	 * is taken. It pairs with smp_mb() inside the mutex critical section
873 	 * at (D).
874 	 */
875 	this_gen = smp_load_acquire(&lru_drain_gen);
876 
877 	mutex_lock(&lock);
878 
879 	/*
880 	 * (C) Exit the draining operation if a newer generation, from another
881 	 * lru_add_drain_all(), was already scheduled for draining. Check (A).
882 	 */
883 	if (unlikely(this_gen != lru_drain_gen && !force_all_cpus))
884 		goto done;
885 
886 	/*
887 	 * (D) Increment global generation number
888 	 *
889 	 * Pairs with smp_load_acquire() at (B), outside of the critical
890 	 * section. Use a full memory barrier to guarantee that the new global
891 	 * drain generation number is stored before loading pagevec counters.
892 	 *
893 	 * This pairing must be done here, before the for_each_online_cpu loop
894 	 * below which drains the page vectors.
895 	 *
896 	 * Let x, y, and z represent some system CPU numbers, where x < y < z.
897 	 * Assume CPU #z is is in the middle of the for_each_online_cpu loop
898 	 * below and has already reached CPU #y's per-cpu data. CPU #x comes
899 	 * along, adds some pages to its per-cpu vectors, then calls
900 	 * lru_add_drain_all().
901 	 *
902 	 * If the paired barrier is done at any later step, e.g. after the
903 	 * loop, CPU #x will just exit at (C) and miss flushing out all of its
904 	 * added pages.
905 	 */
906 	WRITE_ONCE(lru_drain_gen, lru_drain_gen + 1);
907 	smp_mb();
908 
909 	cpumask_clear(&has_work);
910 	for_each_online_cpu(cpu) {
911 		struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
912 
913 		if (force_all_cpus ||
914 		    pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) ||
915 		    data_race(pagevec_count(&per_cpu(lru_rotate.pvec, cpu))) ||
916 		    pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) ||
917 		    pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) ||
918 		    pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) ||
919 		    pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree_movetail, cpu)) ||
920 		    need_activate_page_drain(cpu) ||
921 		    has_bh_in_lru(cpu, NULL)) {
922 			INIT_WORK(work, lru_add_drain_per_cpu);
923 			queue_work_on(cpu, mm_percpu_wq, work);
924 			__cpumask_set_cpu(cpu, &has_work);
925 		}
926 	}
927 
928 	for_each_cpu(cpu, &has_work)
929 		flush_work(&per_cpu(lru_add_drain_work, cpu));
930 
931 done:
932 	mutex_unlock(&lock);
933 }
934 
lru_add_drain_all(void)935 void lru_add_drain_all(void)
936 {
937 	__lru_add_drain_all(false);
938 }
939 #else
lru_add_drain_all(void)940 void lru_add_drain_all(void)
941 {
942 	lru_add_drain();
943 }
944 #endif /* CONFIG_SMP */
945 
946 static atomic_t lru_disable_count = ATOMIC_INIT(0);
947 
lru_cache_disabled(void)948 bool lru_cache_disabled(void)
949 {
950 	return atomic_read(&lru_disable_count) != 0;
951 }
952 
lru_cache_enable(void)953 void lru_cache_enable(void)
954 {
955 	atomic_dec(&lru_disable_count);
956 }
957 EXPORT_SYMBOL_GPL(lru_cache_enable);
958 
959 /*
960  * lru_cache_disable() needs to be called before we start compiling
961  * a list of pages to be migrated using isolate_lru_page().
962  * It drains pages on LRU cache and then disable on all cpus until
963  * lru_cache_enable is called.
964  *
965  * Must be paired with a call to lru_cache_enable().
966  */
lru_cache_disable(void)967 void lru_cache_disable(void)
968 {
969 	/*
970 	 * If someone is already disabled lru_cache, just return with
971 	 * increasing the lru_disable_count.
972 	 */
973 	if (atomic_inc_not_zero(&lru_disable_count))
974 		return;
975 #ifdef CONFIG_SMP
976 	/*
977 	 * lru_add_drain_all in the force mode will schedule draining on
978 	 * all online CPUs so any calls of lru_cache_disabled wrapped by
979 	 * local_lock or preemption disabled would be ordered by that.
980 	 * The atomic operation doesn't need to have stronger ordering
981 	 * requirements because that is enforeced by the scheduling
982 	 * guarantees.
983 	 */
984 	__lru_add_drain_all(true);
985 #else
986 	lru_add_and_bh_lrus_drain();
987 #endif
988 	atomic_inc(&lru_disable_count);
989 }
990 EXPORT_SYMBOL_GPL(lru_cache_disable);
991 
992 /**
993  * release_pages - batched put_page()
994  * @pages: array of pages to release
995  * @nr: number of pages
996  *
997  * Decrement the reference count on all the pages in @pages.  If it
998  * fell to zero, remove the page from the LRU and free it.
999  */
release_pages(struct page ** pages,int nr)1000 void release_pages(struct page **pages, int nr)
1001 {
1002 	int i;
1003 	LIST_HEAD(pages_to_free);
1004 	struct pglist_data *locked_pgdat = NULL;
1005 	struct lruvec *lruvec;
1006 	unsigned long flags;
1007 	unsigned int lock_batch;
1008 
1009 	for (i = 0; i < nr; i++) {
1010 		struct page *page = pages[i];
1011 
1012 		/*
1013 		 * Make sure the IRQ-safe lock-holding time does not get
1014 		 * excessive with a continuous string of pages from the
1015 		 * same pgdat. The lock is held only if pgdat != NULL.
1016 		 */
1017 		if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) {
1018 			spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
1019 			locked_pgdat = NULL;
1020 		}
1021 
1022 		page = compound_head(page);
1023 		if (is_huge_zero_page(page))
1024 			continue;
1025 
1026 		if (is_zone_device_page(page)) {
1027 			if (locked_pgdat) {
1028 				spin_unlock_irqrestore(&locked_pgdat->lru_lock,
1029 						       flags);
1030 				locked_pgdat = NULL;
1031 			}
1032 			/*
1033 			 * ZONE_DEVICE pages that return 'false' from
1034 			 * page_is_devmap_managed() do not require special
1035 			 * processing, and instead, expect a call to
1036 			 * put_page_testzero().
1037 			 */
1038 			if (page_is_devmap_managed(page)) {
1039 				put_devmap_managed_page(page);
1040 				continue;
1041 			}
1042 		}
1043 
1044 		if (!put_page_testzero(page))
1045 			continue;
1046 
1047 		if (PageCompound(page)) {
1048 			if (locked_pgdat) {
1049 				spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
1050 				locked_pgdat = NULL;
1051 			}
1052 			__put_compound_page(page);
1053 			continue;
1054 		}
1055 
1056 		if (PageLRU(page)) {
1057 			struct pglist_data *pgdat = page_pgdat(page);
1058 
1059 			if (pgdat != locked_pgdat) {
1060 				if (locked_pgdat)
1061 					spin_unlock_irqrestore(&locked_pgdat->lru_lock,
1062 									flags);
1063 				lock_batch = 0;
1064 				locked_pgdat = pgdat;
1065 				spin_lock_irqsave(&locked_pgdat->lru_lock, flags);
1066 			}
1067 
1068 			lruvec = mem_cgroup_page_lruvec(page, locked_pgdat);
1069 			VM_BUG_ON_PAGE(!PageLRU(page), page);
1070 			__ClearPageLRU(page);
1071 			del_page_from_lru_list(page, lruvec, page_off_lru(page));
1072 		}
1073 
1074 		__ClearPageWaiters(page);
1075 
1076 		list_add(&page->lru, &pages_to_free);
1077 	}
1078 	if (locked_pgdat)
1079 		spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
1080 
1081 	mem_cgroup_uncharge_list(&pages_to_free);
1082 	free_unref_page_list(&pages_to_free);
1083 }
1084 EXPORT_SYMBOL(release_pages);
1085 
1086 /*
1087  * The pages which we're about to release may be in the deferred lru-addition
1088  * queues.  That would prevent them from really being freed right now.  That's
1089  * OK from a correctness point of view but is inefficient - those pages may be
1090  * cache-warm and we want to give them back to the page allocator ASAP.
1091  *
1092  * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
1093  * and __pagevec_lru_add_active() call release_pages() directly to avoid
1094  * mutual recursion.
1095  */
__pagevec_release(struct pagevec * pvec)1096 void __pagevec_release(struct pagevec *pvec)
1097 {
1098 	if (!pvec->percpu_pvec_drained) {
1099 		lru_add_drain();
1100 		pvec->percpu_pvec_drained = true;
1101 	}
1102 	release_pages(pvec->pages, pagevec_count(pvec));
1103 	pagevec_reinit(pvec);
1104 }
1105 EXPORT_SYMBOL(__pagevec_release);
1106 
1107 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1108 /* used by __split_huge_page_refcount() */
lru_add_page_tail(struct page * page,struct page * page_tail,struct lruvec * lruvec,struct list_head * list)1109 void lru_add_page_tail(struct page *page, struct page *page_tail,
1110 		       struct lruvec *lruvec, struct list_head *list)
1111 {
1112 	VM_BUG_ON_PAGE(!PageHead(page), page);
1113 	VM_BUG_ON_PAGE(PageCompound(page_tail), page);
1114 	VM_BUG_ON_PAGE(PageLRU(page_tail), page);
1115 	lockdep_assert_held(&lruvec_pgdat(lruvec)->lru_lock);
1116 
1117 	if (!list)
1118 		SetPageLRU(page_tail);
1119 
1120 	if (likely(PageLRU(page)))
1121 		list_add_tail(&page_tail->lru, &page->lru);
1122 	else if (list) {
1123 		/* page reclaim is reclaiming a huge page */
1124 		get_page(page_tail);
1125 		list_add_tail(&page_tail->lru, list);
1126 	} else {
1127 		/*
1128 		 * Head page has not yet been counted, as an hpage,
1129 		 * so we must account for each subpage individually.
1130 		 *
1131 		 * Put page_tail on the list at the correct position
1132 		 * so they all end up in order.
1133 		 */
1134 		add_page_to_lru_list_tail(page_tail, lruvec,
1135 					  page_lru(page_tail));
1136 	}
1137 }
1138 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1139 
__pagevec_lru_add_fn(struct page * page,struct lruvec * lruvec,void * arg)1140 static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
1141 				 void *arg)
1142 {
1143 	enum lru_list lru;
1144 	int was_unevictable = TestClearPageUnevictable(page);
1145 	int nr_pages = thp_nr_pages(page);
1146 
1147 	VM_BUG_ON_PAGE(PageLRU(page), page);
1148 
1149 	/*
1150 	 * Page becomes evictable in two ways:
1151 	 * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()].
1152 	 * 2) Before acquiring LRU lock to put the page to correct LRU and then
1153 	 *   a) do PageLRU check with lock [check_move_unevictable_pages]
1154 	 *   b) do PageLRU check before lock [clear_page_mlock]
1155 	 *
1156 	 * (1) & (2a) are ok as LRU lock will serialize them. For (2b), we need
1157 	 * following strict ordering:
1158 	 *
1159 	 * #0: __pagevec_lru_add_fn		#1: clear_page_mlock
1160 	 *
1161 	 * SetPageLRU()				TestClearPageMlocked()
1162 	 * smp_mb() // explicit ordering	// above provides strict
1163 	 *					// ordering
1164 	 * PageMlocked()			PageLRU()
1165 	 *
1166 	 *
1167 	 * if '#1' does not observe setting of PG_lru by '#0' and fails
1168 	 * isolation, the explicit barrier will make sure that page_evictable
1169 	 * check will put the page in correct LRU. Without smp_mb(), SetPageLRU
1170 	 * can be reordered after PageMlocked check and can make '#1' to fail
1171 	 * the isolation of the page whose Mlocked bit is cleared (#0 is also
1172 	 * looking at the same page) and the evictable page will be stranded
1173 	 * in an unevictable LRU.
1174 	 */
1175 	SetPageLRU(page);
1176 	smp_mb__after_atomic();
1177 
1178 	if (page_evictable(page)) {
1179 		lru = page_lru(page);
1180 		if (was_unevictable)
1181 			__count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
1182 	} else {
1183 		lru = LRU_UNEVICTABLE;
1184 		ClearPageActive(page);
1185 		SetPageUnevictable(page);
1186 		if (!was_unevictable)
1187 			__count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
1188 	}
1189 
1190 	add_page_to_lru_list(page, lruvec, lru);
1191 	trace_mm_lru_insertion(page, lru);
1192 }
1193 
1194 /*
1195  * Add the passed pages to the LRU, then drop the caller's refcount
1196  * on them.  Reinitialises the caller's pagevec.
1197  */
__pagevec_lru_add(struct pagevec * pvec)1198 void __pagevec_lru_add(struct pagevec *pvec)
1199 {
1200 	pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
1201 }
1202 
1203 /**
1204  * pagevec_lookup_entries - gang pagecache lookup
1205  * @pvec:	Where the resulting entries are placed
1206  * @mapping:	The address_space to search
1207  * @start:	The starting entry index
1208  * @nr_entries:	The maximum number of pages
1209  * @indices:	The cache indices corresponding to the entries in @pvec
1210  *
1211  * pagevec_lookup_entries() will search for and return a group of up
1212  * to @nr_pages pages and shadow entries in the mapping.  All
1213  * entries are placed in @pvec.  pagevec_lookup_entries() takes a
1214  * reference against actual pages in @pvec.
1215  *
1216  * The search returns a group of mapping-contiguous entries with
1217  * ascending indexes.  There may be holes in the indices due to
1218  * not-present entries.
1219  *
1220  * Only one subpage of a Transparent Huge Page is returned in one call:
1221  * allowing truncate_inode_pages_range() to evict the whole THP without
1222  * cycling through a pagevec of extra references.
1223  *
1224  * pagevec_lookup_entries() returns the number of entries which were
1225  * found.
1226  */
pagevec_lookup_entries(struct pagevec * pvec,struct address_space * mapping,pgoff_t start,unsigned nr_entries,pgoff_t * indices)1227 unsigned pagevec_lookup_entries(struct pagevec *pvec,
1228 				struct address_space *mapping,
1229 				pgoff_t start, unsigned nr_entries,
1230 				pgoff_t *indices)
1231 {
1232 	pvec->nr = find_get_entries(mapping, start, nr_entries,
1233 				    pvec->pages, indices);
1234 	return pagevec_count(pvec);
1235 }
1236 
1237 /**
1238  * pagevec_remove_exceptionals - pagevec exceptionals pruning
1239  * @pvec:	The pagevec to prune
1240  *
1241  * pagevec_lookup_entries() fills both pages and exceptional radix
1242  * tree entries into the pagevec.  This function prunes all
1243  * exceptionals from @pvec without leaving holes, so that it can be
1244  * passed on to page-only pagevec operations.
1245  */
pagevec_remove_exceptionals(struct pagevec * pvec)1246 void pagevec_remove_exceptionals(struct pagevec *pvec)
1247 {
1248 	int i, j;
1249 
1250 	for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
1251 		struct page *page = pvec->pages[i];
1252 		if (!xa_is_value(page))
1253 			pvec->pages[j++] = page;
1254 	}
1255 	pvec->nr = j;
1256 }
1257 
1258 /**
1259  * pagevec_lookup_range - gang pagecache lookup
1260  * @pvec:	Where the resulting pages are placed
1261  * @mapping:	The address_space to search
1262  * @start:	The starting page index
1263  * @end:	The final page index
1264  *
1265  * pagevec_lookup_range() will search for & return a group of up to PAGEVEC_SIZE
1266  * pages in the mapping starting from index @start and upto index @end
1267  * (inclusive).  The pages are placed in @pvec.  pagevec_lookup() takes a
1268  * reference against the pages in @pvec.
1269  *
1270  * The search returns a group of mapping-contiguous pages with ascending
1271  * indexes.  There may be holes in the indices due to not-present pages. We
1272  * also update @start to index the next page for the traversal.
1273  *
1274  * pagevec_lookup_range() returns the number of pages which were found. If this
1275  * number is smaller than PAGEVEC_SIZE, the end of specified range has been
1276  * reached.
1277  */
pagevec_lookup_range(struct pagevec * pvec,struct address_space * mapping,pgoff_t * start,pgoff_t end)1278 unsigned pagevec_lookup_range(struct pagevec *pvec,
1279 		struct address_space *mapping, pgoff_t *start, pgoff_t end)
1280 {
1281 	pvec->nr = find_get_pages_range(mapping, start, end, PAGEVEC_SIZE,
1282 					pvec->pages);
1283 	return pagevec_count(pvec);
1284 }
1285 EXPORT_SYMBOL(pagevec_lookup_range);
1286 
pagevec_lookup_range_tag(struct pagevec * pvec,struct address_space * mapping,pgoff_t * index,pgoff_t end,xa_mark_t tag)1287 unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
1288 		struct address_space *mapping, pgoff_t *index, pgoff_t end,
1289 		xa_mark_t tag)
1290 {
1291 	pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
1292 					PAGEVEC_SIZE, pvec->pages);
1293 	return pagevec_count(pvec);
1294 }
1295 EXPORT_SYMBOL(pagevec_lookup_range_tag);
1296 
pagevec_lookup_range_nr_tag(struct pagevec * pvec,struct address_space * mapping,pgoff_t * index,pgoff_t end,xa_mark_t tag,unsigned max_pages)1297 unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec,
1298 		struct address_space *mapping, pgoff_t *index, pgoff_t end,
1299 		xa_mark_t tag, unsigned max_pages)
1300 {
1301 	pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
1302 		min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages);
1303 	return pagevec_count(pvec);
1304 }
1305 EXPORT_SYMBOL(pagevec_lookup_range_nr_tag);
1306 /*
1307  * Perform any setup for the swap system
1308  */
swap_setup(void)1309 void __init swap_setup(void)
1310 {
1311 	unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT);
1312 
1313 	/* Use a smaller cluster for small-memory machines */
1314 	if (megs < 16)
1315 		page_cluster = 2;
1316 	else
1317 		page_cluster = 3;
1318 	/*
1319 	 * Right now other parts of the system means that we
1320 	 * _really_ don't want to cluster much more
1321 	 */
1322 }
1323 
1324 #ifdef CONFIG_DEV_PAGEMAP_OPS
put_devmap_managed_page(struct page * page)1325 void put_devmap_managed_page(struct page *page)
1326 {
1327 	int count;
1328 
1329 	if (WARN_ON_ONCE(!page_is_devmap_managed(page)))
1330 		return;
1331 
1332 	count = page_ref_dec_return(page);
1333 
1334 	/*
1335 	 * devmap page refcounts are 1-based, rather than 0-based: if
1336 	 * refcount is 1, then the page is free and the refcount is
1337 	 * stable because nobody holds a reference on the page.
1338 	 */
1339 	if (count == 1)
1340 		free_devmap_managed_page(page);
1341 	else if (!count)
1342 		__put_page(page);
1343 }
1344 EXPORT_SYMBOL(put_devmap_managed_page);
1345 #endif
1346