// SPDX-License-Identifier: GPL-2.0
/*
 * Workingset detection
 *
 * Copyright (C) 2013 Red Hat, Inc., Johannes Weiner
 */

#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/writeback.h>
#include <linux/shmem_fs.h>
#include <linux/pagemap.h>
#include <linux/atomic.h>
#include <linux/module.h>
#include <linux/swap.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include "internal.h"
#include <trace/hooks/mm.h>

/*
 *		Double CLOCK lists
 *
 * Per node, two clock lists are maintained for file pages: the
 * inactive and the active list.  Freshly faulted pages start out at
 * the head of the inactive list and page reclaim scans pages from the
 * tail.  Pages that are accessed multiple times on the inactive list
 * are promoted to the active list, to protect them from reclaim,
 * whereas active pages are demoted to the inactive list when the
 * active list grows too big.
 *
 *   fault ------------------------+
 *                                 |
 *              +--------------+   |            +-------------+
 *   reclaim <- |   inactive   | <-+-- demotion |    active   | <--+
 *              +--------------+                +-------------+    |
 *                     |                                           |
 *                     +-------------- promotion ------------------+
 *
 *
 *		Access frequency and refault distance
 *
 * A workload is thrashing when its pages are frequently used but they
 * are evicted from the inactive list every time before another access
 * would have promoted them to the active list.
 *
 * In cases where the average access distance between thrashing pages
 * is bigger than the size of memory there is nothing that can be
 * done - the thrashing set could never fit into memory under any
 * circumstance.
 *
 * However, the average access distance could be bigger than the
 * inactive list, yet smaller than the size of memory.  In this case,
 * the set could fit into memory if it weren't for the currently
 * active pages - which may be, but hopefully are not, used more
 * frequently:
 *
 *      +-memory available to cache-+
 *      |                           |
 *      +-inactive------+-active----+
 *  a b | c d e f g h i | J K L M N |
 *      +---------------+-----------+
 *
 * It is prohibitively expensive to accurately track access frequency
 * of pages.  But a reasonable approximation can be made to measure
 * thrashing on the inactive list, after which refaulting pages can be
 * activated optimistically to compete with the existing active pages.
 *
 * Approximating inactive page access frequency - Observations:
 *
 * 1. When a page is accessed for the first time, it is added to the
 *    head of the inactive list, slides every existing inactive page
 *    towards the tail by one slot, and pushes the current tail page
 *    out of memory.
 *
 * 2. When a page is accessed for the second time, it is promoted to
 *    the active list, shrinking the inactive list by one slot.  This
 *    also slides all inactive pages that were faulted into the cache
 *    more recently than the activated page towards the tail of the
 *    inactive list.
 *
 * Thus:
 *
 * 1. The sum of evictions and activations between any two points in
 *    time indicates the minimum number of inactive pages accessed in
 *    between.
 *
 * 2. Moving one inactive page N page slots towards the tail of the
 *    list requires at least N inactive page accesses.
 *
 * Combining these:
 *
 * 1. When a page is finally evicted from memory, the number of
 *    inactive pages accessed while the page was in cache is at least
 *    the number of page slots on the inactive list.
 *
 * 2. In addition, measuring the sum of evictions and activations (E)
 *    at the time of a page's eviction, and comparing it to another
 *    reading (R) at the time the page faults back into memory tells
 *    the minimum number of accesses while the page was not cached.
 *    This is called the refault distance.
 *
 * Because the first access of the page was the fault and the second
 * access the refault, we combine the in-cache distance with the
 * out-of-cache distance to get the complete minimum access distance
 * of this page:
 *
 *      NR_inactive + (R - E)
 *
 * And knowing the minimum access distance of a page, we can easily
 * tell if the page would be able to stay in cache assuming all page
 * slots in the cache were available:
 *
 *   NR_inactive + (R - E) <= NR_inactive + NR_active
 *
 * If we have swap, we should also consider NR_inactive_anon and
 * NR_active_anon, so for page cache and anonymous pages respectively:
 *
 *   NR_inactive_file + (R - E) <= NR_inactive_file + NR_active_file
 *   + NR_inactive_anon + NR_active_anon
 *
 *   NR_inactive_anon + (R - E) <= NR_inactive_anon + NR_active_anon
 *   + NR_inactive_file + NR_active_file
 *
 * Which can be further simplified to:
 *
 *   (R - E) <= NR_active_file + NR_inactive_anon + NR_active_anon
 *
 *   (R - E) <= NR_active_anon + NR_inactive_file + NR_active_file
 *
 * Put into words, the refault distance (out-of-cache) can be seen as
 * a deficit in inactive list space (in-cache).  If the inactive list
 * had (R - E) more page slots, the page would not have been evicted
 * in between accesses, but activated instead.  And on a full system,
 * the only thing eating into inactive list space is active pages.
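 *
 * As an illustrative example (numbers invented for this comment): a
 * file page is evicted when the eviction/activation counter reads
 * E = 1000 and refaults when it reads R = 1400, giving a refault
 * distance of 400.  If the workingset it competes with holds 300
 * active file pages and swap makes another 200 anon pages
 * reclaimable, then 400 <= 500 and the refaulting page is activated;
 * with no swap, 400 > 300 and it stays on the inactive list.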
 *
 *
 *		Refaulting inactive pages
 *
 * All that is known about the active list is that the pages have been
 * accessed more than once in the past.  This means that at any given
 * time there is actually a good chance that pages on the active list
 * are no longer in active use.
 *
 * So when a refault distance of (R - E) is observed and there are at
 * least (R - E) pages in the userspace workingset, the refaulting page
 * is activated optimistically in the hope that (R - E) pages are actually
 * used less frequently than the refaulting page - or even not used at
 * all anymore.
 *
 * That means if inactive cache is refaulting with a suitable refault
 * distance, we assume the cache workingset is transitioning and put
 * pressure on the current workingset.
 *
 * If this is wrong and demotion kicks in, the pages which are truly
 * used more frequently will be reactivated while the less frequently
 * used ones will be evicted from memory.
 *
 * But if this is right, the stale pages will be pushed out of memory
 * and the used pages get to stay in cache.
 *
 *		Refaulting active pages
 *
 * If on the other hand the refaulting pages have recently been
 * deactivated, it means that the active list is no longer protecting
 * actively used cache from reclaim. The cache is NOT transitioning to
 * a different workingset; the existing workingset is thrashing in the
 * space allocated to the page cache.
 *
 *
 *		Implementation
 *
 * For each node's LRU lists, a counter for inactive evictions and
 * activations is maintained (node->nonresident_age).
 *
 * On eviction, a snapshot of this counter (along with some bits to
 * identify the node) is stored in the now empty page cache
 * slot of the evicted page.  This is called a shadow entry.
 *
 * On cache misses for which there are shadow entries, an eligible
 * refault distance will immediately activate the refaulting page.
 */

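/*
 * In rough outline (a simplified sketch, not the exact call chains in
 * the reclaim and page cache fault paths), the functions below are
 * used as follows:
 *
 *	shadow = workingset_eviction(folio, target_memcg);
 *	// store shadow in the folio's former page cache slot
 *	...
 *	// later, on a refault of the same index:
 *	workingset_refault(new_folio, shadow);
 *	// which may folio_set_active(new_folio) if the refault
 *	// distance fits within the workingset size
 */
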
#define WORKINGSET_SHIFT 1
#define EVICTION_SHIFT	((BITS_PER_LONG - BITS_PER_XA_VALUE) +	\
			 WORKINGSET_SHIFT + NODES_SHIFT + \
			 MEM_CGROUP_ID_SHIFT)
#define EVICTION_MASK	(~0UL >> EVICTION_SHIFT)

/*
 * Eviction timestamps need to be able to cover the full range of
 * actionable refaults. However, bits are tight in the xarray
 * entry, and after storing the identifier for the lruvec there might
 * not be enough left to represent every single actionable refault. In
 * that case, we have to sacrifice granularity for distance, and group
 * evictions into coarser buckets by shaving off lower timestamp bits.
 */
static unsigned int bucket_order __read_mostly;

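/*
 * A shadow entry is an xarray value entry.  As assembled by
 * pack_shadow() below, its payload holds, from least to most
 * significant bits: the workingset flag (WORKINGSET_SHIFT bits), the
 * node id (NODES_SHIFT bits), the memcg id (MEM_CGROUP_ID_SHIFT bits),
 * and, in the remaining high bits, the bucketed eviction timestamp
 * truncated to EVICTION_MASK.
 */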
static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
			 bool workingset)
{
	eviction &= EVICTION_MASK;
	eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
	eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
	eviction = (eviction << WORKINGSET_SHIFT) | workingset;

	return xa_mk_value(eviction);
}

static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
			  unsigned long *evictionp, bool *workingsetp)
{
	unsigned long entry = xa_to_value(shadow);
	int memcgid, nid;
	bool workingset;

	workingset = entry & ((1UL << WORKINGSET_SHIFT) - 1);
	entry >>= WORKINGSET_SHIFT;
	nid = entry & ((1UL << NODES_SHIFT) - 1);
	entry >>= NODES_SHIFT;
	memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
	entry >>= MEM_CGROUP_ID_SHIFT;

	*memcgidp = memcgid;
	*pgdat = NODE_DATA(nid);
	*evictionp = entry;
	*workingsetp = workingset;
}

#ifdef CONFIG_LRU_GEN

static void *lru_gen_eviction(struct folio *folio)
{
	int hist;
	unsigned long token;
	unsigned long min_seq;
	struct lruvec *lruvec;
	struct lru_gen_folio *lrugen;
	int type = folio_is_file_lru(folio);
	int delta = folio_nr_pages(folio);
	int refs = folio_lru_refs(folio);
	bool workingset = folio_test_workingset(folio);
	int tier = lru_tier_from_refs(refs, workingset);
	struct mem_cgroup *memcg = folio_memcg(folio);
	struct pglist_data *pgdat = folio_pgdat(folio);

	BUILD_BUG_ON(LRU_GEN_WIDTH + LRU_REFS_WIDTH > BITS_PER_LONG - EVICTION_SHIFT);

	lruvec = mem_cgroup_lruvec(memcg, pgdat);
	lrugen = &lruvec->lrugen;
	min_seq = READ_ONCE(lrugen->min_seq[type]);
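	/*
	 * Instead of a nonresident_age snapshot, MGLRU packs the oldest
	 * generation number and the folio's reference count into the
	 * eviction "timestamp", so lru_gen_refault() can recover both.
	 */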
	token = (min_seq << LRU_REFS_WIDTH) | max(refs - 1, 0);

	hist = lru_hist_from_seq(min_seq);
	atomic_long_add(delta, &lrugen->evicted[hist][type][tier]);

	return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset);
}

/*
 * Tests if the shadow entry is for a folio that was recently evicted.
 * Fills in @lruvec, @token, @workingset with the values unpacked from shadow.
 */
static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
				unsigned long *token, bool *workingset)
{
	int memcg_id;
	unsigned long max_seq;
	struct mem_cgroup *memcg;
	struct pglist_data *pgdat;

	unpack_shadow(shadow, &memcg_id, &pgdat, token, workingset);

	memcg = mem_cgroup_from_id(memcg_id);
	*lruvec = mem_cgroup_lruvec(memcg, pgdat);

	max_seq = READ_ONCE((*lruvec)->lrugen.max_seq);
	max_seq &= EVICTION_MASK >> LRU_REFS_WIDTH;

	return abs_diff(max_seq, *token >> LRU_REFS_WIDTH) < MAX_NR_GENS;
}

static void lru_gen_refault(struct folio *folio, void *shadow)
{
	bool recent;
	int hist, tier, refs;
	bool workingset;
	unsigned long token;
	struct lruvec *lruvec;
	struct lru_gen_folio *lrugen;
	int type = folio_is_file_lru(folio);
	int delta = folio_nr_pages(folio);

	rcu_read_lock();

	recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset);
	if (lruvec != folio_lruvec(folio))
		goto unlock;

	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + type, delta);

	if (!recent)
		goto unlock;

	lrugen = &lruvec->lrugen;

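	/*
	 * Recover the reference count that lru_gen_eviction() stored in
	 * the low LRU_REFS_WIDTH bits of the token and rederive the tier;
	 * the refault is accounted to the current oldest generation.
	 */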
	hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type]));
	refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + 1;
	tier = lru_tier_from_refs(refs, workingset);

	atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]);

	/* see folio_add_lru() where folio_set_active() will be called */
	if (lru_gen_in_fault())
		mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta);

	if (workingset) {
		folio_set_workingset(folio);
		mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta);
	} else
		set_mask_bits(&folio->flags, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF);
unlock:
	rcu_read_unlock();
}

#else /* !CONFIG_LRU_GEN */

static void *lru_gen_eviction(struct folio *folio)
{
	return NULL;
}

static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec,
				unsigned long *token, bool *workingset)
{
	return false;
}

static void lru_gen_refault(struct folio *folio, void *shadow)
{
}

#endif /* CONFIG_LRU_GEN */

/**
 * workingset_age_nonresident - age non-resident entries as LRU ages
 * @lruvec: the lruvec that was aged
 * @nr_pages: the number of pages to count
 *
 * As in-memory pages are aged, non-resident pages need to be aged as
 * well, in order for the refault distances later on to be comparable
 * to the in-memory dimensions. This function allows reclaim and LRU
 * operations to drive the non-resident aging along in parallel.
 */
void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
{
	/*
	 * Reclaiming a cgroup means reclaiming all its children in a
	 * round-robin fashion. That means that each cgroup has an LRU
	 * order that is composed of the LRU orders of its child
	 * cgroups; and every page has an LRU position not just in the
	 * cgroup that owns it, but in all of that group's ancestors.
	 *
	 * So when the physical inactive list of a leaf cgroup ages,
	 * the virtual inactive lists of all its parents, including
	 * the root cgroup's, age as well.
	 */
	do {
		atomic_long_add(nr_pages, &lruvec->nonresident_age);
	} while ((lruvec = parent_lruvec(lruvec)));
}

/**
 * workingset_eviction - note the eviction of a folio from memory
 * @target_memcg: the cgroup that is causing the reclaim
 * @folio: the folio being evicted
 *
 * Return: a shadow entry to be stored in @folio->mapping->i_pages in place
 * of the evicted @folio so that a later refault can be detected.
 */
void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg)
{
	struct pglist_data *pgdat = folio_pgdat(folio);
	unsigned long eviction;
	struct lruvec *lruvec;
	int memcgid;

	/* Folio is fully exclusive and pins folio's memory cgroup pointer */
	VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
	VM_BUG_ON_FOLIO(folio_ref_count(folio), folio);
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

	if (lru_gen_enabled())
		return lru_gen_eviction(folio);

	lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
	/* XXX: target_memcg can be NULL, go through lruvec */
	memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
	eviction = atomic_long_read(&lruvec->nonresident_age);
	eviction >>= bucket_order;
	workingset_age_nonresident(lruvec, folio_nr_pages(folio));
	return pack_shadow(memcgid, pgdat, eviction,
				folio_test_workingset(folio));
}

/**
 * workingset_test_recent - tests if the shadow entry is for a folio that was
 * recently evicted. Also fills in @workingset with the value unpacked from
 * shadow.
 * @shadow: the shadow entry to be tested.
 * @file: whether the corresponding folio is from the file lru.
 * @workingset: where the workingset value unpacked from shadow should
 * be stored.
 * @flush: whether to flush cgroup rstat.
 *
 * Return: true if the shadow is for a recently evicted folio; false otherwise.
 */
bool workingset_test_recent(void *shadow, bool file, bool *workingset,
				bool flush)
{
	struct mem_cgroup *eviction_memcg;
	struct lruvec *eviction_lruvec;
	unsigned long refault_distance;
	unsigned long workingset_size;
	unsigned long refault;
	int memcgid;
	struct pglist_data *pgdat;
	unsigned long eviction;

	if (lru_gen_enabled()) {
		bool recent;

		rcu_read_lock();
		recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction, workingset);
		rcu_read_unlock();
		return recent;
	}

	rcu_read_lock();
	unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset);
	eviction <<= bucket_order;

	/*
	 * Look up the memcg associated with the stored ID. It might
	 * have been deleted since the folio's eviction.
	 *
	 * Note that in rare events the ID could have been recycled
	 * for a new cgroup that refaults a shared folio. This is
	 * impossible to tell from the available data. However, this
	 * should be a rare and limited disturbance, and activations
	 * are always speculative anyway. Ultimately, it's the aging
	 * algorithm's job to shake out the minimum access frequency
	 * for the active cache.
	 *
	 * XXX: On !CONFIG_MEMCG, this will always return NULL; it
	 * would be better if the root_mem_cgroup existed in all
	 * configurations instead.
	 */
	eviction_memcg = mem_cgroup_from_id(memcgid);
	if (!mem_cgroup_tryget(eviction_memcg))
		eviction_memcg = NULL;
	rcu_read_unlock();

	if (!mem_cgroup_disabled() && !eviction_memcg)
		return false;
	/*
	 * Flush stats (and potentially sleep) outside the RCU read section.
	 *
	 * Note that workingset_test_recent() itself might be called in an RCU
	 * read section (e.g. in cachestat) - these callers need to skip
	 * flushing stats (via the flush argument).
	 *
	 * XXX: With per-memcg flushing and thresholding, is ratelimiting
	 * still needed here?
	 */
	if (flush)
		mem_cgroup_flush_stats_ratelimited(eviction_memcg);

	eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
	refault = atomic_long_read(&eviction_lruvec->nonresident_age);

	/*
	 * Calculate the refault distance
	 *
	 * The unsigned subtraction here gives an accurate distance
	 * across nonresident_age overflows in most cases. There is a
	 * special case: usually, shadow entries have a short lifetime
	 * and are either refaulted or reclaimed along with the inode
	 * before they get too old.  But it is not impossible for the
	 * nonresident_age to lap a shadow entry in the field, which
	 * can then result in a false small refault distance, leading
	 * to a false activation should this old entry actually
	 * refault again.  However, earlier kernels used to deactivate
	 * unconditionally with *every* reclaim invocation for the
	 * longest time, so the occasional inappropriate activation
	 * leading to pressure on the active list is not a problem.
	 */
	refault_distance = (refault - eviction) & EVICTION_MASK;

	/*
	 * Compare the distance to the existing workingset size. We
	 * don't activate pages that couldn't stay resident even if
	 * all the memory was available to the workingset. Whether
	 * workingset competition needs to consider anon or not depends
	 * on having free swap space.
	 */
	workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
	if (!file) {
		workingset_size += lruvec_page_state(eviction_lruvec,
						     NR_INACTIVE_FILE);
	}
	if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) {
		workingset_size += lruvec_page_state(eviction_lruvec,
						     NR_ACTIVE_ANON);
		if (file) {
			workingset_size += lruvec_page_state(eviction_lruvec,
						     NR_INACTIVE_ANON);
		}
	}

	mem_cgroup_put(eviction_memcg);
	return refault_distance <= workingset_size;
}

/**
 * workingset_refault - Evaluate the refault of a previously evicted folio.
 * @folio: The freshly allocated replacement folio.
 * @shadow: Shadow entry of the evicted folio.
 *
 * Calculates and evaluates the refault distance of the previously
 * evicted folio in the context of the node and the memcg whose memory
 * pressure caused the eviction.
 */
void workingset_refault(struct folio *folio, void *shadow)
{
	bool file = folio_is_file_lru(folio);
	struct pglist_data *pgdat;
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;
	bool workingset;
	long nr;

	trace_android_vh_count_workingset_refault(folio);
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	if (lru_gen_enabled()) {
		lru_gen_refault(folio, shadow);
		return;
	}

	/*
	 * The activation decision for this folio is made at the level
	 * where the eviction occurred, as that is where the LRU order
	 * during folio reclaim is being determined.
	 *
	 * However, the cgroup that will own the folio is the one that
	 * is actually experiencing the refault event. Make sure the folio is
	 * locked to guarantee folio_memcg() stability throughout.
	 */
	nr = folio_nr_pages(folio);
	memcg = folio_memcg(folio);
	pgdat = folio_pgdat(folio);
	lruvec = mem_cgroup_lruvec(memcg, pgdat);

	mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);

	if (!workingset_test_recent(shadow, file, &workingset, true))
		return;

	folio_set_active(folio);
	workingset_age_nonresident(lruvec, nr);
	mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr);

	/* Folio was active prior to eviction */
	if (workingset) {
		folio_set_workingset(folio);
		/*
		 * XXX: Move to folio_add_lru() when it supports new vs
		 * putback
		 */
		lru_note_cost_refault(folio);
		mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
	}
}

/**
 * workingset_activation - note a page activation
 * @folio: Folio that is being activated.
 */
void workingset_activation(struct folio *folio)
{
	struct mem_cgroup *memcg;

	rcu_read_lock();
	/*
	 * Filter non-memcg pages here, e.g. unmap can call
	 * mark_page_accessed() on VDSO pages.
	 *
	 * XXX: See workingset_refault() - this should return
	 * root_mem_cgroup even for !CONFIG_MEMCG.
	 */
	memcg = folio_memcg_rcu(folio);
	if (!mem_cgroup_disabled() && !memcg)
		goto out;
	workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
out:
	rcu_read_unlock();
}

/*
 * Shadow entries reflect the share of the working set that does not
 * fit into memory, so their number depends on the access pattern of
 * the workload.  In most cases, they will refault or get reclaimed
 * along with the inode, but a (malicious) workload that streams
 * through files with a total size several times that of available
 * memory, while preventing the inodes from being reclaimed, can
 * create excessive amounts of shadow nodes.  To keep a lid on this,
 * track shadow nodes and reclaim them when they grow way past the
 * point where they would still be useful.
 */

struct list_lru shadow_nodes;

void workingset_update_node(struct xa_node *node)
{
	struct address_space *mapping;
	struct page *page = virt_to_page(node);

	/*
	 * Track non-empty nodes that contain only shadow entries;
	 * unlink those that contain pages or are being freed.
	 *
	 * Avoid acquiring the list_lru lock when the nodes are
	 * already where they should be. The list_empty() test is safe
	 * as node->private_list is protected by the i_pages lock.
	 */
	mapping = container_of(node->array, struct address_space, i_pages);
	lockdep_assert_held(&mapping->i_pages.xa_lock);

	if (node->count && node->count == node->nr_values) {
		if (list_empty(&node->private_list)) {
			list_lru_add_obj(&shadow_nodes, &node->private_list);
			__inc_node_page_state(page, WORKINGSET_NODES);
		}
	} else {
		if (!list_empty(&node->private_list)) {
			list_lru_del_obj(&shadow_nodes, &node->private_list);
			__dec_node_page_state(page, WORKINGSET_NODES);
		}
	}
}

static unsigned long count_shadow_nodes(struct shrinker *shrinker,
					struct shrink_control *sc)
{
	unsigned long max_nodes;
	unsigned long nodes;
	unsigned long pages;

	nodes = list_lru_shrink_count(&shadow_nodes, sc);
	if (!nodes)
		return SHRINK_EMPTY;

	/*
	 * Approximate a reasonable limit for the nodes
	 * containing shadow entries. We don't need to keep more
	 * shadow entries than possible pages on the active list,
	 * since refault distances bigger than that are dismissed.
	 *
	 * The size of the active list converges toward 100% of
	 * overall page cache as memory grows, with only a tiny
	 * inactive list. Assume the total cache size for that.
	 *
	 * Nodes might be sparsely populated, with only one shadow
	 * entry in the extreme case. Obviously, we cannot keep one
	 * node for every eligible shadow entry, so compromise on a
	 * worst-case density of 1/8th. Below that, not all eligible
	 * refaults can be detected anymore.
	 *
	 * On 64-bit with 7 xa_nodes per page and 64 slots
	 * each, this will reclaim shadow entries when they consume
	 * ~1.8% of available memory:
	 *
	 * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE
	 */
#ifdef CONFIG_MEMCG
	if (sc->memcg) {
		struct lruvec *lruvec;
		int i;

		mem_cgroup_flush_stats_ratelimited(sc->memcg);
		lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
		for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
			pages += lruvec_page_state_local(lruvec,
							 NR_LRU_BASE + i);
		pages += lruvec_page_state_local(
			lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT;
		pages += lruvec_page_state_local(
			lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT;
	} else
#endif
		pages = node_present_pages(sc->nid);

	max_nodes = pages >> (XA_CHUNK_SHIFT - 3);
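	/*
	 * For a rough sense of scale (illustrative numbers only, assuming
	 * XA_CHUNK_SHIFT == 6): a node with 2^20 present pages allows
	 * 2^20 >> 3 = 131072 shadow nodes, i.e. at most one xa_node per
	 * 8 pages, matching the 1/8th worst-case density above.
	 */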

	if (nodes <= max_nodes)
		return 0;
	return nodes - max_nodes;
}

static enum lru_status shadow_lru_isolate(struct list_head *item,
					  struct list_lru_one *lru,
					  spinlock_t *lru_lock,
					  void *arg) __must_hold(lru_lock)
{
	struct xa_node *node = container_of(item, struct xa_node, private_list);
	struct address_space *mapping;
	int ret;

	/*
	 * Page cache insertions and deletions synchronously maintain
	 * the shadow node LRU under the i_pages lock and the
	 * lru_lock.  Because the page cache tree is emptied before
	 * the inode can be destroyed, holding the lru_lock pins any
	 * address_space that has nodes on the LRU.
	 *
	 * We can then safely transition to the i_pages lock to
	 * pin only the address_space of the particular node we want
	 * to reclaim, take the node off-LRU, and drop the lru_lock.
	 */

	mapping = container_of(node->array, struct address_space, i_pages);

	/* Coming from the list, invert the lock order */
	if (!xa_trylock(&mapping->i_pages)) {
		spin_unlock_irq(lru_lock);
		ret = LRU_RETRY;
		goto out;
	}

	/* For page cache we need to hold i_lock */
	if (mapping->host != NULL) {
		if (!spin_trylock(&mapping->host->i_lock)) {
			xa_unlock(&mapping->i_pages);
			spin_unlock_irq(lru_lock);
			ret = LRU_RETRY;
			goto out;
		}
	}

	list_lru_isolate(lru, item);
	__dec_node_page_state(virt_to_page(node), WORKINGSET_NODES);

	spin_unlock(lru_lock);

	/*
	 * The nodes should only contain one or more shadow entries,
	 * no pages, so we expect to be able to remove them all and
	 * delete and free the empty node afterwards.
	 */
	if (WARN_ON_ONCE(!node->nr_values))
		goto out_invalid;
	if (WARN_ON_ONCE(node->count != node->nr_values))
		goto out_invalid;
	xa_delete_node(node, workingset_update_node);
	__inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM);

out_invalid:
	xa_unlock_irq(&mapping->i_pages);
	if (mapping->host != NULL) {
		if (mapping_shrinkable(mapping))
			inode_add_lru(mapping->host);
		spin_unlock(&mapping->host->i_lock);
	}
	ret = LRU_REMOVED_RETRY;
out:
	cond_resched();
	spin_lock_irq(lru_lock);
	return ret;
}

static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
				       struct shrink_control *sc)
{
	/* list_lru lock nests inside the IRQ-safe i_pages lock */
	return list_lru_shrink_walk_irq(&shadow_nodes, sc, shadow_lru_isolate,
					NULL);
}

/*
 * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
 * i_pages lock.
 */
static struct lock_class_key shadow_nodes_key;

static int __init workingset_init(void)
{
	struct shrinker *workingset_shadow_shrinker;
	unsigned int timestamp_bits;
	unsigned int max_order;
	int ret = -ENOMEM;

	BUILD_BUG_ON(BITS_PER_LONG < EVICTION_SHIFT);
	/*
	 * Calculate the eviction bucket size to cover the longest
	 * actionable refault distance, which is currently half of
	 * memory (totalram_pages/2). However, memory hotplug may add
	 * some more pages at runtime, so keep working with up to
	 * double the initial memory by using totalram_pages as-is.
	 */
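	/*
	 * As an illustration (values invented for this comment; the real
	 * numbers depend on BITS_PER_LONG, NODES_SHIFT and
	 * MEM_CGROUP_ID_SHIFT): if 14 timestamp bits remain and the
	 * machine has 2^20 pages, bucket_order becomes 6 and evictions
	 * are grouped into buckets of 64 nonresident_age ticks.
	 */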
	timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
	max_order = fls_long(totalram_pages() - 1);
	if (max_order > timestamp_bits)
		bucket_order = max_order - timestamp_bits;
	pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
	       timestamp_bits, max_order, bucket_order);

	workingset_shadow_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
						    SHRINKER_MEMCG_AWARE,
						    "mm-shadow");
	if (!workingset_shadow_shrinker)
		goto err;

	ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key,
			      workingset_shadow_shrinker);
	if (ret)
		goto err_list_lru;

	workingset_shadow_shrinker->count_objects = count_shadow_nodes;
	workingset_shadow_shrinker->scan_objects = scan_shadow_nodes;
	/* ->count reports only fully expendable nodes */
	workingset_shadow_shrinker->seeks = 0;

	shrinker_register(workingset_shadow_shrinker);
	return 0;
err_list_lru:
	shrinker_free(workingset_shadow_shrinker);
err:
	return ret;
}
module_init(workingset_init);