// SPDX-License-Identifier: GPL-2.0
/*
 * mm/memcg_reclaim.c
 *
 * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd.
 */
#include <linux/mm.h>
#include <linux/backing-dev.h>
#include <linux/hyperhold_inf.h>

#ifdef CONFIG_HYPERHOLD_FILE_LRU
#include <linux/memcg_policy.h>
#include "internal.h"
#endif

static inline bool is_swap_not_allowed(struct scan_control *sc, int swappiness)
{
	return !sc->may_swap || !swappiness || !get_nr_swap_pages();
}

/*
 * From 0 .. 100.  Higher means more swappy.
 */
#define HYPERHOLD_SWAPPINESS 100

static int get_hyperhold_swappiness(void)
{
	return is_hyperhold_enable() ? HYPERHOLD_SWAPPINESS : vm_swappiness;
}

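/*
 * Hyperhold variant of get_scan_count(): decide how aggressively the anon
 * and file LRUs of @pgdat should be scanned at the current reclaim
 * priority.  On return, @nr holds the number of pages to scan for each
 * evictable LRU list and @lru_pages the total number of evictable pages
 * that were considered.  The balancing largely mirrors the upstream
 * logic, but swappiness comes from get_hyperhold_swappiness() and the
 * node-level lruvec is used.
 */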
static void get_scan_count_hyperhold(struct pglist_data *pgdat,
		struct scan_control *sc, unsigned long *nr,
		unsigned long *lru_pages)
{
	int swappiness = get_hyperhold_swappiness();
	struct lruvec *lruvec = node_lruvec(pgdat);
	u64 fraction[2];
	u64 denominator;
	enum scan_balance scan_balance;
	unsigned long ap, fp;
	enum lru_list lru;
	unsigned long pgdatfile;
	unsigned long pgdatfree;
	int z;
	unsigned long anon_cost, file_cost, total_cost;
	unsigned long total_high_wmark = 0;


	if (cgroup_reclaim(sc) && !swappiness) {
		scan_balance = SCAN_FILE;
		goto out;
	}

	/*
	 * Do not apply any pressure balancing cleverness when the
	 * system is close to OOM, scan both anon and file equally
	 * (unless the swappiness setting disagrees with swapping).
	 */
	if (!sc->priority && swappiness) {
		scan_balance = SCAN_EQUAL;
		goto out;
	}

	if (!cgroup_reclaim(sc)) {
		pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
		pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) +
			node_page_state(pgdat, NR_INACTIVE_FILE);

		for (z = 0; z < MAX_NR_ZONES; z++) {
			struct zone *zone = &pgdat->node_zones[z];

			if (!managed_zone(zone))
				continue;

			total_high_wmark += high_wmark_pages(zone);
		}

		if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) {
			/*
			 * Force SCAN_ANON if there are enough inactive
			 * anonymous pages on the LRU in eligible zones.
			 * Otherwise, the small LRU gets thrashed.
			 */
			if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON) &&
				(lruvec_lru_size(lruvec, LRU_INACTIVE_ANON,
					sc->reclaim_idx) >>
					(unsigned int)sc->priority)) {
				scan_balance = SCAN_ANON;
				goto out;
			}
		}
	}

	/*
	 * If there is enough inactive page cache, i.e. if the size of the
	 * inactive list is greater than that of the active list *and* the
	 * inactive list actually has some pages to scan on this priority, we
	 * do not reclaim anything from the anonymous working set right now.
	 * Without the second condition we could end up never scanning an
	 * lruvec even if it has plenty of old anonymous pages unless the
	 * system is under heavy pressure.
	 */

	if (!IS_ENABLED(CONFIG_BALANCE_ANON_FILE_RECLAIM) &&
	    !inactive_is_low(lruvec, LRU_INACTIVE_FILE) &&
	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
		scan_balance = SCAN_FILE;
		goto out;
	}

	scan_balance = SCAN_FRACT;

	/*
	 * Calculate the pressure balance between anon and file pages.
	 *
	 * The amount of pressure we put on each LRU is inversely
	 * proportional to the cost of reclaiming each list, as
	 * determined by the share of pages that are refaulting, times
	 * the relative IO cost of bringing back a swapped out
	 * anonymous page vs reloading a filesystem page (swappiness).
	 *
	 * Although we limit that influence to ensure no list gets
	 * left behind completely: at least a third of the pressure is
	 * applied, before swappiness.
	 *
	 * With swappiness at 100, anon and file have equal IO cost.
	 */
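	/*
	 * Rough worked example with arbitrary (not measured) costs:
	 * swappiness == 100, sc->anon_cost == 30 and sc->file_cost == 10
	 * give anon_cost = 70, file_cost = 50, total_cost = 120 below, so
	 * ap = 100 * 121 / 71 = 170 and fp = 100 * 121 / 51 = 237.  Anon
	 * then receives roughly 170/407 (~42%) and file 237/407 (~58%) of
	 * the scan pressure, illustrating how the "+ total_cost" bias
	 * keeps either list from being starved completely.
	 */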
	total_cost = sc->anon_cost + sc->file_cost;
	anon_cost = total_cost + sc->anon_cost;
	file_cost = total_cost + sc->file_cost;
	total_cost = anon_cost + file_cost;

	ap = swappiness * (total_cost + 1);
	ap /= anon_cost + 1;

	fp = (200 - swappiness) * (total_cost + 1);
	fp /= file_cost + 1;

	fraction[0] = ap;
	fraction[1] = fp;
	denominator = ap + fp;

out:
	*lru_pages = 0;
	for_each_evictable_lru(lru) {
		int file = is_file_lru(lru);
		unsigned long lruvec_size;
		unsigned long scan;

		lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
		scan = lruvec_size;
		*lru_pages += scan;
		scan >>= sc->priority;

		switch (scan_balance) {
		case SCAN_EQUAL:
			/* Scan lists relative to size */
			break;
		case SCAN_FRACT:
			/*
			 * Scan types proportional to swappiness and
			 * their relative recent reclaim efficiency.
			 * Make sure we don't miss the last page on
			 * the offlined memory cgroups because of a
			 * round-off error.
			 */
			scan = DIV64_U64_ROUND_UP(scan * fraction[file],
						  denominator);
			break;
		case SCAN_FILE:
		case SCAN_ANON:
			/* Scan one type exclusively */
			if ((scan_balance == SCAN_FILE) != file)
				scan = 0;
			break;
		default:
			/* Look ma, no brain */
			BUG();
		}

		nr[lru] = scan;
	}
}

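/*
 * Bail-out threshold for direct reclaim: once sc->isolate_count exceeds
 * this value, shrink_anon_memcg() and shrink_anon() stop iterating (see
 * the DIRECT_RECLAIM checks below).
 */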
#define ISOLATE_LIMIT_CNT 5
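
/*
 * Shrink the anon LRUs of one memory cgroup on @pgdat.  @nr gives the
 * number of inactive/active anon pages to scan; reclaim proceeds in
 * SWAP_CLUSTER_MAX batches until the targets are consumed, enough pages
 * have been reclaimed, or a direct reclaimer hits the isolation limit.
 */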
void shrink_anon_memcg(struct pglist_data *pgdat,
		struct mem_cgroup *memcg, struct scan_control *sc,
		unsigned long *nr)
{
	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
	unsigned long nr_to_scan;
	enum lru_list lru;
	unsigned long nr_reclaimed = 0;
	struct blk_plug plug;

	blk_start_plug(&plug);

	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_ANON]) {
		for (lru = 0; lru <= LRU_ACTIVE_ANON; lru++) {
			if (nr[lru]) {
				nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
				nr[lru] -= nr_to_scan;
				nr_reclaimed +=
					shrink_list(lru, nr_to_scan,
							lruvec, sc);
			}
		}
		if (sc->nr_reclaimed >= sc->nr_to_reclaim ||
				(sc->isolate_count > ISOLATE_LIMIT_CNT &&
				sc->invoker == DIRECT_RECLAIM))
			break;
	}
	blk_finish_plug(&plug);
	sc->nr_reclaimed += nr_reclaimed;
	sc->nr_reclaimed_anon += nr_reclaimed;
}

static inline bool memcg_is_child_of(struct mem_cgroup *mcg, struct mem_cgroup *tmcg)
{
	while (!mem_cgroup_is_root(mcg)) {
		if (mcg == tmcg)
			break;

		mcg = parent_mem_cgroup(mcg);
	}

	return (mcg == tmcg);
}

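/*
 * Walk the memcgs on the hyperhold score list (via get_next_memcg())
 * that belong to sc->target_mem_cgroup and shrink their anon LRUs.  The
 * node-wide scan targets in @nr are split between memcgs in proportion
 * to the size of each memcg's anon lists.
 */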
static void shrink_anon(struct pglist_data *pgdat,
		struct scan_control *sc, unsigned long *nr)
{
	unsigned long reclaimed;
	unsigned long scanned;
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
	unsigned long nr_memcg[NR_LRU_LISTS];
	unsigned long nr_node_active = lruvec_lru_size(
			node_lruvec(pgdat), LRU_ACTIVE_ANON, MAX_NR_ZONES);
	unsigned long nr_node_inactive = lruvec_lru_size(
			node_lruvec(pgdat), LRU_INACTIVE_ANON, MAX_NR_ZONES);

	while ((memcg = get_next_memcg(memcg))) {
		struct lruvec *lruvec = NULL;

		if (!memcg_is_child_of(memcg, target_memcg))
			continue;

		lruvec = mem_cgroup_lruvec(memcg, pgdat);

		reclaimed = sc->nr_reclaimed;
		scanned = sc->nr_scanned;

		nr_memcg[LRU_ACTIVE_ANON] = nr[LRU_ACTIVE_ANON] *
			lruvec_lru_size(lruvec, LRU_ACTIVE_ANON,
					MAX_NR_ZONES) / (nr_node_active + 1);
		nr_memcg[LRU_INACTIVE_ANON] = nr[LRU_INACTIVE_ANON] *
			lruvec_lru_size(lruvec, LRU_INACTIVE_ANON,
					MAX_NR_ZONES) / (nr_node_inactive + 1);
		nr_memcg[LRU_ACTIVE_FILE] = 0;
		nr_memcg[LRU_INACTIVE_FILE] = 0;
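		/*
		 * Example of the proportional split (made-up sizes): if the
		 * node has 1000 active anon pages and this memcg's lruvec
		 * holds 100 of them, the memcg is assigned roughly 100/1001
		 * (~10%) of nr[LRU_ACTIVE_ANON].  The "+ 1" in the divisor
		 * only guards against dividing by zero when a list is empty.
		 */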

		/*
		 * This loop can become CPU-bound when target memcgs
		 * aren't eligible for reclaim - either because they
		 * don't have any reclaimable pages, or because their
		 * memory is explicitly protected. Avoid soft lockups.
		 */
		cond_resched();

		mem_cgroup_calculate_protection(target_memcg, memcg);

		if (mem_cgroup_below_min(memcg)) {
			/*
			 * Hard protection.
			 * If there is no reclaimable memory, OOM.
			 */
			continue;
		} else if (mem_cgroup_below_low(memcg)) {
			/*
			 * Soft protection.
			 * Respect the protection only as long as
			 * there is an unprotected supply
			 * of reclaimable memory from other cgroups.
			 */
			if (!sc->memcg_low_reclaim) {
				sc->memcg_low_skipped = 1;
				continue;
			}
			memcg_memory_event(memcg, MEMCG_LOW);
		}

		shrink_anon_memcg(pgdat, memcg, sc, nr_memcg);
		shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
					sc->priority);

		vmpressure(sc->gfp_mask, memcg, false,
				sc->nr_scanned - scanned,
				sc->nr_reclaimed - reclaimed);

		if (sc->nr_reclaimed >= sc->nr_to_reclaim ||
			(sc->isolate_count > ISOLATE_LIMIT_CNT &&
			sc->invoker == DIRECT_RECLAIM)) {
			get_next_memcg_break(memcg);
			break;
		}
	}
}

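/*
 * Shrink the node-wide file LRU.  With CONFIG_HYPERHOLD_FILE_LRU, file
 * pages are kept on the pgdat-level lruvec rather than on per-memcg
 * lruvecs (see the config branches in shrink_node_hyperhold()), so the
 * file scan targets in @nr are consumed here in SWAP_CLUSTER_MAX batches.
 */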
static void shrink_file(struct pglist_data *pgdat,
		struct scan_control *sc, unsigned long *nr)
{
	struct lruvec *lruvec = node_lruvec(pgdat);
	unsigned long nr_to_scan;
	enum lru_list lru;
	unsigned long nr_reclaimed = 0;
	struct blk_plug plug;

	blk_start_plug(&plug);

	while (nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) {
		for (lru = LRU_INACTIVE_FILE; lru <= LRU_ACTIVE_FILE; lru++) {
			if (nr[lru]) {
				nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
				nr[lru] -= nr_to_scan;
				nr_reclaimed +=
					shrink_list(lru,
							nr_to_scan,
							lruvec, sc);
			}
		}
	}
	blk_finish_plug(&plug);
	sc->nr_reclaimed += nr_reclaimed;
	sc->nr_reclaimed_file += nr_reclaimed;
}

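/*
 * Hyperhold counterpart of shrink_node(): compute the scan targets via
 * get_scan_count_hyperhold(), reclaim file pages from the node-wide file
 * LRU (shrink_file()) and anon pages from the per-memcg lruvecs
 * (shrink_anon()), then apply the usual writeback/congestion throttling
 * heuristics.  Returns true if any pages were reclaimed.
 */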
bool shrink_node_hyperhold(struct pglist_data *pgdat, struct scan_control *sc)
{
	unsigned long nr_reclaimed, nr_scanned;
	struct lruvec *target_lruvec;
	bool reclaimable = false;
	unsigned long file;

	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
	do {
		/* Get scan count for file and anon */
		unsigned long node_lru_pages = 0;
		unsigned long nr[NR_LRU_LISTS] = {0};

		memset(&sc->nr, 0, sizeof(sc->nr));
		nr_reclaimed = sc->nr_reclaimed;
		nr_scanned = sc->nr_scanned;

		/*
		 * Determine the scan balance between anon and file LRUs.
		 */
		spin_lock_irq(&pgdat->lru_lock);
		sc->anon_cost = mem_cgroup_lruvec(NULL, pgdat)->anon_cost;
		sc->file_cost = node_lruvec(pgdat)->file_cost;
		spin_unlock_irq(&pgdat->lru_lock);
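		/*
		 * Note: the anon recent-cost is read from the root memcg's
		 * lruvec, whereas the file cost comes from the node-level
		 * lruvec, presumably because CONFIG_HYPERHOLD_FILE_LRU keeps
		 * file pages on the node-wide LRU (see shrink_file()).
		 */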

		/*
		 * Target desirable inactive:active list ratios for the anon
		 * and file LRU lists.
		 */
		if (!sc->force_deactivate) {
			unsigned long refaults;

			refaults = lruvec_page_state(target_lruvec,
					WORKINGSET_ACTIVATE_ANON);
			if (refaults != target_lruvec->refaults[0] ||
					inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
				sc->may_deactivate |= DEACTIVATE_ANON;
			else
				sc->may_deactivate &= ~DEACTIVATE_ANON;

			/*
			 * When refaults are being observed, it means a new
			 * workingset is being established. Deactivate to get
			 * rid of any stale active pages quickly.
			 */
#ifdef CONFIG_HYPERHOLD_FILE_LRU
			refaults = lruvec_page_state(node_lruvec(pgdat),
					WORKINGSET_ACTIVATE_FILE);
			if (refaults != node_lruvec(pgdat)->refaults[1] ||
					inactive_is_low(node_lruvec(pgdat), LRU_INACTIVE_FILE))
				sc->may_deactivate |= DEACTIVATE_FILE;
#else
			refaults = lruvec_page_state(target_lruvec,
					WORKINGSET_ACTIVATE_FILE);
			if (refaults != target_lruvec->refaults[1] ||
					inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
				sc->may_deactivate |= DEACTIVATE_FILE;
#endif
			else
				sc->may_deactivate &= ~DEACTIVATE_FILE;
		} else
			sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;

		/*
		 * If we have plenty of inactive file pages that aren't
		 * thrashing, try to reclaim those first before touching
		 * anonymous pages.
		 */
#ifdef CONFIG_HYPERHOLD_FILE_LRU
		file = lruvec_page_state(node_lruvec(pgdat), NR_INACTIVE_FILE);
#else
		file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
#endif
		if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
			sc->cache_trim_mode = 1;
		else
			sc->cache_trim_mode = 0;

		/*
		 * Prevent the reclaimer from falling into the cache trap: as
		 * cache pages start out inactive, every cache fault will tip
		 * the scan balance towards the file LRU.  And as the file LRU
		 * shrinks, so does the window for rotation from references.
		 * This means we have a runaway feedback loop where a tiny
		 * thrashing file LRU becomes infinitely more attractive than
		 * anon pages.  Try to detect this based on file LRU size.
		 */
		if (!cgroup_reclaim(sc)) {
			unsigned long total_high_wmark = 0;
			unsigned long free, anon;
			int z;

			free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
			file = node_page_state(pgdat, NR_ACTIVE_FILE) +
				node_page_state(pgdat, NR_INACTIVE_FILE);

			for (z = 0; z < MAX_NR_ZONES; z++) {
				struct zone *zone = &pgdat->node_zones[z];

				if (!managed_zone(zone))
					continue;

				total_high_wmark += high_wmark_pages(zone);
			}

			/*
			 * Consider anon: if that's low too, this isn't a
			 * runaway file reclaim problem, but rather just
			 * extreme pressure. Reclaim as per usual then.
			 */
			anon = node_page_state(pgdat, NR_INACTIVE_ANON);

			sc->file_is_tiny =
				file + free <= total_high_wmark &&
				!(sc->may_deactivate & DEACTIVATE_ANON) &&
				anon >> sc->priority;
		}

		get_scan_count_hyperhold(pgdat, sc, nr, &node_lru_pages);

		if (!cgroup_reclaim(sc)) {
			/* Shrink the Total-File-LRU */
			shrink_file(pgdat, sc, nr);
		}

		/* Shrink Anon by iterating score_list */
		shrink_anon(pgdat, sc, nr);

		if (sc->nr_reclaimed - nr_reclaimed)
			reclaimable = true;

		if (current_is_kswapd()) {
			/*
			 * If reclaim is isolating dirty pages under writeback,
			 * it implies that the long-lived page allocation rate
			 * is exceeding the page laundering rate. Either the
			 * global limits are not being effective at throttling
			 * processes due to the page distribution throughout
			 * zones or there is heavy usage of a slow backing
			 * device. The only option is to throttle from reclaim
			 * context which is not ideal as there is no guarantee
			 * the dirtying process is throttled in the same way
			 * balance_dirty_pages() manages.
			 *
			 * Once a node is flagged PGDAT_WRITEBACK, kswapd will
			 * count the number of pages under pages flagged for
			 * immediate reclaim and stall if any are encountered
			 * in the nr_immediate check below.
			 */
			if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
				set_bit(PGDAT_WRITEBACK, &pgdat->flags);

			/* Allow kswapd to start writing pages during reclaim. */
			if (sc->nr.unqueued_dirty == sc->nr.file_taken)
				set_bit(PGDAT_DIRTY, &pgdat->flags);

			/*
			 * If kswapd scans pages marked for immediate
			 * reclaim and under writeback (nr_immediate), it
			 * implies that pages are cycling through the LRU
			 * faster than they are written so also forcibly stall.
			 */
			if (sc->nr.immediate)
				congestion_wait(BLK_RW_ASYNC, HZ/10);
		}
		/*
		 * Legacy memcg will stall in page writeback so avoid forcibly
		 * stalling in wait_iff_congested().
		 */
		if ((current_is_kswapd() ||
		    (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
		    sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
			set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);

		/*
		 * Stall direct reclaim for IO completions if underlying BDIs
		 * and node is congested. Allow kswapd to continue until it
		 * starts encountering unqueued dirty pages or cycling through
		 * the LRU too quickly.
		 */
		if (!current_is_kswapd() && current_may_throttle() &&
		    !sc->hibernation_mode &&
		    test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
			wait_iff_congested(BLK_RW_ASYNC, HZ/10);

	} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
					 sc));
	/*
	 * Kswapd gives up on balancing particular nodes after too
	 * many failures to reclaim anything from them and goes to
	 * sleep. On reclaim progress, reset the failure counter. A
	 * successful direct reclaim run will revive a dormant kswapd.
	 */
	if (reclaimable)
		pgdat->kswapd_failures = 0;

	return reclaimable;
}