// SPDX-License-Identifier: GPL-2.0
/*
 * mm/memcg_reclaim.c
 *
 * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd.
 */
#include <linux/mm.h>
#include <linux/backing-dev.h>
#include <linux/hyperhold_inf.h>

#ifdef CONFIG_HYPERHOLD_FILE_LRU
#include <linux/memcg_policy.h>
#include "internal.h"
#endif

static inline bool is_swap_not_allowed(struct scan_control *sc, int swappiness)
{
	return !sc->may_swap || !swappiness || !get_nr_swap_pages();
}

/*
 * From 0 .. 100.  Higher means more swappy.
 */
#define HYPERHOLD_SWAPPINESS 100

static int get_hyperhold_swappiness(void)
{
	return is_hyperhold_enable() ? HYPERHOLD_SWAPPINESS : vm_swappiness;
}

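/*
 * Compute per-LRU scan targets for this node. This largely mirrors
 * get_scan_count() in mm/vmscan.c, but operates on the node-level
 * lruvec so that anon pages and the node-wide file LRU (used when
 * CONFIG_HYPERHOLD_FILE_LRU is enabled) are balanced together.
 * Per-LRU scan targets are returned in nr[]; the summed size of the
 * evictable LRUs is returned in *lru_pages.
 */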
static void get_scan_count_hyperhold(struct pglist_data *pgdat,
		struct scan_control *sc, unsigned long *nr,
		unsigned long *lru_pages)
{
	int swappiness = get_hyperhold_swappiness();
	struct lruvec *lruvec = node_lruvec(pgdat);
	u64 fraction[2];
	u64 denominator;
	enum scan_balance scan_balance;
	unsigned long ap, fp;
	enum lru_list lru;
	unsigned long pgdatfile;
	unsigned long pgdatfree;
	int z;
	unsigned long anon_cost, file_cost, total_cost;
	unsigned long total_high_wmark = 0;


	if (cgroup_reclaim(sc) && !swappiness) {
		scan_balance = SCAN_FILE;
		goto out;
	}

	/*
	 * Do not apply any pressure balancing cleverness when the
	 * system is close to OOM, scan both anon and file equally
	 * (unless the swappiness setting disagrees with swapping).
	 */
	if (!sc->priority && swappiness) {
		scan_balance = SCAN_EQUAL;
		goto out;
	}

	if (!cgroup_reclaim(sc)) {
		pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
		pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) +
			node_page_state(pgdat, NR_INACTIVE_FILE);

		for (z = 0; z < MAX_NR_ZONES; z++) {
			struct zone *zone = &pgdat->node_zones[z];

			if (!managed_zone(zone))
				continue;

			total_high_wmark += high_wmark_pages(zone);
		}

		if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) {
			/*
			 * Force SCAN_ANON if there are enough inactive
			 * anonymous pages on the LRU in eligible zones.
			 * Otherwise, the small LRU gets thrashed.
			 */
			if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON) &&
				(lruvec_lru_size(lruvec, LRU_INACTIVE_ANON,
					sc->reclaim_idx) >>
					(unsigned int)sc->priority)) {
				scan_balance = SCAN_ANON;
				goto out;
			}
		}
	}

	/*
	 * If there is enough inactive page cache, i.e. if the size of the
	 * inactive list is greater than that of the active list *and* the
	 * inactive list actually has some pages to scan on this priority, we
	 * do not reclaim anything from the anonymous working set right now.
	 * Without the second condition we could end up never scanning an
	 * lruvec even if it has plenty of old anonymous pages unless the
	 * system is under heavy pressure.
	 */

	if (!IS_ENABLED(CONFIG_BALANCE_ANON_FILE_RECLAIM) &&
	    !inactive_is_low(lruvec, LRU_INACTIVE_FILE) &&
	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
		scan_balance = SCAN_FILE;
		goto out;
	}

	scan_balance = SCAN_FRACT;

	/*
	 * Calculate the pressure balance between anon and file pages.
	 *
	 * The amount of pressure we put on each LRU is inversely
	 * proportional to the cost of reclaiming each list, as
	 * determined by the share of pages that are refaulting, times
	 * the relative IO cost of bringing back a swapped out
	 * anonymous page vs reloading a filesystem page (swappiness).
	 *
	 * Although we limit that influence to ensure no list gets
	 * left behind completely: at least a third of the pressure is
	 * applied, before swappiness.
	 *
	 * With swappiness at 100, anon and file have equal IO cost.
	 */
	total_cost = sc->anon_cost + sc->file_cost;
	anon_cost = total_cost + sc->anon_cost;
	file_cost = total_cost + sc->file_cost;
	total_cost = anon_cost + file_cost;

	ap = swappiness * (total_cost + 1);
	ap /= anon_cost + 1;

	fp = (200 - swappiness) * (total_cost + 1);
	fp /= file_cost + 1;

	fraction[0] = ap;
	fraction[1] = fp;
	denominator = ap + fp;
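
	/*
	 * Illustrative example of the fraction computed above (numbers
	 * are hypothetical, not taken from real counters): with
	 * swappiness == HYPERHOLD_SWAPPINESS (100) and
	 * anon_cost == file_cost, ap == fp, so anon and file LRUs are
	 * scanned in equal proportion to their size. With
	 * swappiness == 60 and equal costs, ap:fp == 60:140, i.e. anon
	 * takes roughly 30% of the scan pressure and file roughly 70%.
	 */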

out:
	*lru_pages = 0;
	for_each_evictable_lru(lru) {
		int file = is_file_lru(lru);
		unsigned long lruvec_size;
		unsigned long scan;

		lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
		scan = lruvec_size;
		*lru_pages += scan;
		scan >>= sc->priority;

		switch (scan_balance) {
		case SCAN_EQUAL:
			/* Scan lists relative to size */
			break;
		case SCAN_FRACT:
			/*
			 * Scan types proportional to swappiness and
			 * their relative recent reclaim efficiency.
			 * Make sure we don't miss the last page on
			 * the offlined memory cgroups because of a
			 * round-off error.
			 */
			scan = DIV64_U64_ROUND_UP(scan * fraction[file],
						  denominator);
			break;
		case SCAN_FILE:
		case SCAN_ANON:
			/* Scan one type exclusively */
			if ((scan_balance == SCAN_FILE) != file)
				scan = 0;
			break;
		default:
			/* Look ma, no brain */
			BUG();
		}

		nr[lru] = scan;
	}
}

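/*
 * Once sc->isolate_count exceeds this limit, direct reclaim
 * (sc->invoker == DIRECT_RECLAIM) bails out of the anon shrinking
 * loops below instead of continuing to iterate.
 */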
#define ISOLATE_LIMIT_CNT 5
void shrink_anon_memcg(struct pglist_data *pgdat,
		struct mem_cgroup *memcg, struct scan_control *sc,
		unsigned long *nr)
{
	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
	unsigned long nr_to_scan;
	enum lru_list lru;
	unsigned long nr_reclaimed = 0;
	struct blk_plug plug;

	blk_start_plug(&plug);

	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_ANON]) {
		for (lru = 0; lru <= LRU_ACTIVE_ANON; lru++) {
			if (nr[lru]) {
				nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
				nr[lru] -= nr_to_scan;
				nr_reclaimed +=
					shrink_list(lru, nr_to_scan,
							lruvec, sc);
			}
		}
		if (sc->nr_reclaimed >= sc->nr_to_reclaim ||
				(sc->isolate_count > ISOLATE_LIMIT_CNT &&
				sc->invoker == DIRECT_RECLAIM))
			break;
	}
	blk_finish_plug(&plug);
	sc->nr_reclaimed += nr_reclaimed;
	sc->nr_reclaimed_anon += nr_reclaimed;
}

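/*
 * Return true if @mcg is @tmcg itself or one of its descendants.
 * A NULL @tmcg matches every memcg, which is the global reclaim case
 * where there is no target cgroup.
 */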
static inline bool memcg_is_child_of(struct mem_cgroup *mcg, struct mem_cgroup *tmcg)
{
	if (tmcg == NULL)
		return true;

	while (!mem_cgroup_is_root(mcg)) {
		if (mcg == tmcg)
			break;

		mcg = parent_mem_cgroup(mcg);
	}

	return (mcg == tmcg);
}

static void shrink_anon(struct pglist_data *pgdat,
		struct scan_control *sc, unsigned long *nr)
{
	unsigned long reclaimed;
	unsigned long scanned;
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
	unsigned long nr_memcg[NR_LRU_LISTS];
	unsigned long nr_node_active = lruvec_lru_size(
			node_lruvec(pgdat), LRU_ACTIVE_ANON, MAX_NR_ZONES);
	unsigned long nr_node_inactive = lruvec_lru_size(
			node_lruvec(pgdat), LRU_INACTIVE_ANON, MAX_NR_ZONES);

	while ((memcg = get_next_memcg(memcg))) {
		struct lruvec *lruvec = NULL;

		if (!memcg_is_child_of(memcg, target_memcg))
			continue;

		lruvec = mem_cgroup_lruvec(memcg, pgdat);

		reclaimed = sc->nr_reclaimed;
		scanned = sc->nr_scanned;

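		/*
		 * Split the node-wide anon scan targets in nr[] across
		 * memcgs in proportion to each memcg's share of the
		 * node's active/inactive anon LRUs (the +1 avoids a
		 * division by zero). File LRUs are zeroed here because
		 * file pages are reclaimed node-wide by shrink_file().
		 */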
		nr_memcg[LRU_ACTIVE_ANON] = nr[LRU_ACTIVE_ANON] *
			lruvec_lru_size(lruvec, LRU_ACTIVE_ANON,
					MAX_NR_ZONES) / (nr_node_active + 1);
		nr_memcg[LRU_INACTIVE_ANON] = nr[LRU_INACTIVE_ANON] *
			lruvec_lru_size(lruvec, LRU_INACTIVE_ANON,
					MAX_NR_ZONES) / (nr_node_inactive + 1);
		nr_memcg[LRU_ACTIVE_FILE] = 0;
		nr_memcg[LRU_INACTIVE_FILE] = 0;

		/*
		 * This loop can become CPU-bound when target memcgs
		 * aren't eligible for reclaim - either because they
		 * don't have any reclaimable pages, or because their
		 * memory is explicitly protected. Avoid soft lockups.
		 */
		cond_resched();

		mem_cgroup_calculate_protection(target_memcg, memcg);

		if (mem_cgroup_below_min(memcg)) {
			/*
			 * Hard protection.
			 * If there is no reclaimable memory, OOM.
			 */
			continue;
		} else if (mem_cgroup_below_low(memcg)) {
			/*
			 * Soft protection.
			 * Respect the protection only as long as
			 * there is an unprotected supply
			 * of reclaimable memory from other cgroups.
			 */
			if (!sc->memcg_low_reclaim) {
				sc->memcg_low_skipped = 1;
				continue;
			}
			memcg_memory_event(memcg, MEMCG_LOW);
		}

		shrink_anon_memcg(pgdat, memcg, sc, nr_memcg);
		shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
					sc->priority);

		vmpressure(sc->gfp_mask, memcg, false,
				sc->nr_scanned - scanned,
				sc->nr_reclaimed - reclaimed);

		if (sc->nr_reclaimed >= sc->nr_to_reclaim ||
			(sc->isolate_count > ISOLATE_LIMIT_CNT &&
			sc->invoker == DIRECT_RECLAIM)) {
			get_next_memcg_break(memcg);
			break;
		}
	}
}

static void shrink_file(struct pglist_data *pgdat,
		struct scan_control *sc, unsigned long *nr)
{
	struct lruvec *lruvec = node_lruvec(pgdat);
	unsigned long nr_to_scan;
	enum lru_list lru;
	unsigned long nr_reclaimed = 0;
	struct blk_plug plug;

	blk_start_plug(&plug);

	while (nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) {
		for (lru = LRU_INACTIVE_FILE; lru <= LRU_ACTIVE_FILE; lru++) {
			if (nr[lru]) {
				nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
				nr[lru] -= nr_to_scan;
				nr_reclaimed +=
					shrink_list(lru,
							nr_to_scan,
							lruvec, sc);
			}
		}
	}
	blk_finish_plug(&plug);
	sc->nr_reclaimed += nr_reclaimed;
	sc->nr_reclaimed_file += nr_reclaimed;
}

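/*
 * Node-level reclaim loop analogous to shrink_node() in mm/vmscan.c:
 * file pages are reclaimed from the node-wide file LRU via
 * shrink_file(), anon pages per-memcg via the score list walk in
 * shrink_anon(). Returns true if any pages were reclaimed.
 */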
bool shrink_node_hyperhold(struct pglist_data *pgdat, struct scan_control *sc)
{
	unsigned long nr_reclaimed;
	struct lruvec *target_lruvec;
	bool reclaimable = false;
	unsigned long file;

	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
	do {
		/* Get scan count for file and anon */
		unsigned long node_lru_pages = 0;
		unsigned long nr[NR_LRU_LISTS] = {0};

		memset(&sc->nr, 0, sizeof(sc->nr));
		nr_reclaimed = sc->nr_reclaimed;

		/*
		 * Determine the scan balance between anon and file LRUs.
		 */
		spin_lock_irq(&pgdat->lru_lock);
		sc->anon_cost = mem_cgroup_lruvec(NULL, pgdat)->anon_cost;
		sc->file_cost = node_lruvec(pgdat)->file_cost;
		spin_unlock_irq(&pgdat->lru_lock);

		/*
		 * Target desirable inactive:active list ratios for the anon
		 * and file LRU lists.
		 */
		if (!sc->force_deactivate) {
			unsigned long refaults;

			refaults = lruvec_page_state(target_lruvec,
					WORKINGSET_ACTIVATE_ANON);
			if (refaults != target_lruvec->refaults[0] ||
					inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
				sc->may_deactivate |= DEACTIVATE_ANON;
			else
				sc->may_deactivate &= ~DEACTIVATE_ANON;

			/*
			 * When refaults are being observed, it means a new
			 * workingset is being established. Deactivate to get
			 * rid of any stale active pages quickly.
			 */
#ifdef CONFIG_HYPERHOLD_FILE_LRU
			refaults = lruvec_page_state(node_lruvec(pgdat),
					WORKINGSET_ACTIVATE_FILE);
			if (refaults != node_lruvec(pgdat)->refaults[1] ||
					inactive_is_low(node_lruvec(pgdat), LRU_INACTIVE_FILE))
				sc->may_deactivate |= DEACTIVATE_FILE;
#else
			refaults = lruvec_page_state(target_lruvec,
					WORKINGSET_ACTIVATE_FILE);
			if (refaults != target_lruvec->refaults[1] ||
					inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
				sc->may_deactivate |= DEACTIVATE_FILE;
#endif
			else
				sc->may_deactivate &= ~DEACTIVATE_FILE;
		} else
			sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;

		/*
		 * If we have plenty of inactive file pages that aren't
		 * thrashing, try to reclaim those first before touching
		 * anonymous pages.
		 */
#ifdef CONFIG_HYPERHOLD_FILE_LRU
		file = lruvec_page_state(node_lruvec(pgdat), NR_INACTIVE_FILE);
#else
		file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
#endif
		if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
			sc->cache_trim_mode = 1;
		else
			sc->cache_trim_mode = 0;

		/*
		 * Prevent the reclaimer from falling into the cache trap: as
		 * cache pages start out inactive, every cache fault will tip
		 * the scan balance towards the file LRU.  And as the file LRU
		 * shrinks, so does the window for rotation from references.
		 * This means we have a runaway feedback loop where a tiny
		 * thrashing file LRU becomes infinitely more attractive than
		 * anon pages.  Try to detect this based on file LRU size.
		 */
		if (!cgroup_reclaim(sc)) {
			unsigned long total_high_wmark = 0;
			unsigned long free, anon;
			int z;

			free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
			file = node_page_state(pgdat, NR_ACTIVE_FILE) +
				node_page_state(pgdat, NR_INACTIVE_FILE);

			for (z = 0; z < MAX_NR_ZONES; z++) {
				struct zone *zone = &pgdat->node_zones[z];

				if (!managed_zone(zone))
					continue;

				total_high_wmark += high_wmark_pages(zone);
			}

			/*
			 * Consider anon: if that's low too, this isn't a
			 * runaway file reclaim problem, but rather just
			 * extreme pressure. Reclaim as per usual then.
			 */
			anon = node_page_state(pgdat, NR_INACTIVE_ANON);

			sc->file_is_tiny =
				file + free <= total_high_wmark &&
				!(sc->may_deactivate & DEACTIVATE_ANON) &&
				anon >> sc->priority;
		}

		get_scan_count_hyperhold(pgdat, sc, nr, &node_lru_pages);

		if (!cgroup_reclaim(sc)) {
			/* Shrink the Total-File-LRU */
			shrink_file(pgdat, sc, nr);
		}

		/* Shrink Anon by iterating score_list */
		shrink_anon(pgdat, sc, nr);

		if (sc->nr_reclaimed - nr_reclaimed)
			reclaimable = true;

		if (current_is_kswapd()) {
			/*
			 * If reclaim is isolating dirty pages under writeback,
			 * it implies that the long-lived page allocation rate
			 * is exceeding the page laundering rate. Either the
			 * global limits are not being effective at throttling
			 * processes due to the page distribution throughout
			 * zones or there is heavy usage of a slow backing
			 * device. The only option is to throttle from reclaim
			 * context which is not ideal as there is no guarantee
			 * the dirtying process is throttled in the same way
			 * balance_dirty_pages() manages.
			 *
			 * Once a node is flagged PGDAT_WRITEBACK, kswapd will
			 * count the number of pages under pages flagged for
			 * immediate reclaim and stall if any are encountered
			 * in the nr_immediate check below.
			 */
			if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
				set_bit(PGDAT_WRITEBACK, &pgdat->flags);

			/* Allow kswapd to start writing pages during reclaim. */
			if (sc->nr.unqueued_dirty == sc->nr.file_taken)
				set_bit(PGDAT_DIRTY, &pgdat->flags);

			/*
			 * If kswapd scans pages marked for immediate
			 * reclaim and under writeback (nr_immediate), it
			 * implies that pages are cycling through the LRU
			 * faster than they are written so also forcibly stall.
			 */
			if (sc->nr.immediate)
				congestion_wait(BLK_RW_ASYNC, HZ/10);
		}
		/*
		 * Legacy memcg will stall in page writeback so avoid forcibly
		 * stalling in wait_iff_congested().
		 */
		if ((current_is_kswapd() ||
		    (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
		    sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
			set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);

		/*
		 * Stall direct reclaim for IO completions if underlying BDIs
		 * and node is congested. Allow kswapd to continue until it
		 * starts encountering unqueued dirty pages or cycling through
		 * the LRU too quickly.
		 */
		if (!current_is_kswapd() && current_may_throttle() &&
		    !sc->hibernation_mode &&
		    test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
			wait_iff_congested(BLK_RW_ASYNC, HZ/10);

	} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
					 sc));
	/*
	 * Kswapd gives up on balancing particular nodes after too
	 * many failures to reclaim anything from them and goes to
	 * sleep. On reclaim progress, reset the failure counter. A
	 * successful direct reclaim run will revive a dormant kswapd.
	 */
	if (reclaimable)
		pgdat->kswapd_failures = 0;

	return reclaimable;
}
538