// SPDX-License-Identifier: GPL-2.0
/*
 * mm/memcg_reclaim.c
 *
 * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd.
 */
#include <linux/mm.h>
#include <linux/backing-dev.h>
#include <linux/hyperhold_inf.h>

#ifdef CONFIG_HYPERHOLD_FILE_LRU
#include <linux/memcg_policy.h>
#include "internal.h"
#endif

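/*
 * Swap scanning is ruled out when the caller forbids it (!may_swap), when
 * swappiness is zero, or when no swap space is left.
 */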
static inline bool is_swap_not_allowed(struct scan_control *sc, int swappiness)
{
	return !sc->may_swap || !swappiness || !get_nr_swap_pages();
}

/*
 * From 0 .. 100. Higher means more swappy.
 */
#define HYPERHOLD_SWAPPINESS 100

static int get_hyperhold_swappiness(void)
{
	return is_hyperhold_enable() ? HYPERHOLD_SWAPPINESS : vm_swappiness;
}

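/*
 * Node-wide variant of get_scan_count(): pick a scan balance (SCAN_EQUAL,
 * SCAN_FRACT, SCAN_FILE or SCAN_ANON) for this node and fill nr[] with the
 * number of pages to scan on each evictable LRU at the current priority.
 * *lru_pages returns the total size of the evictable LRUs considered.
 */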
static void get_scan_count_hyperhold(struct pglist_data *pgdat,
		struct scan_control *sc, unsigned long *nr,
		unsigned long *lru_pages)
{
	int swappiness = get_hyperhold_swappiness();
	struct lruvec *lruvec = node_lruvec(pgdat);
	u64 fraction[2];
	u64 denominator;
	enum scan_balance scan_balance;
	unsigned long ap, fp;
	enum lru_list lru;
	unsigned long pgdatfile;
	unsigned long pgdatfree;
	int z;
	unsigned long anon_cost, file_cost, total_cost;
	unsigned long total_high_wmark = 0;

	if (cgroup_reclaim(sc) && !swappiness) {
		scan_balance = SCAN_FILE;
		goto out;
	}

	/*
	 * Do not apply any pressure balancing cleverness when the
	 * system is close to OOM, scan both anon and file equally
	 * (unless the swappiness setting disagrees with swapping).
	 */
	if (!sc->priority && swappiness) {
		scan_balance = SCAN_EQUAL;
		goto out;
	}

	if (!cgroup_reclaim(sc)) {
		pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
		pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) +
			node_page_state(pgdat, NR_INACTIVE_FILE);

		for (z = 0; z < MAX_NR_ZONES; z++) {
			struct zone *zone = &pgdat->node_zones[z];

			if (!managed_zone(zone))
				continue;

			total_high_wmark += high_wmark_pages(zone);
		}

		if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) {
			/*
			 * Force SCAN_ANON if there are enough inactive
			 * anonymous pages on the LRU in eligible zones.
			 * Otherwise, the small LRU gets thrashed.
			 */
			if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON) &&
			    (lruvec_lru_size(lruvec, LRU_INACTIVE_ANON,
					     sc->reclaim_idx) >>
			     (unsigned int)sc->priority)) {
				scan_balance = SCAN_ANON;
				goto out;
			}
		}
	}

	/*
	 * If there is enough inactive page cache, i.e. if the size of the
	 * inactive list is greater than that of the active list *and* the
	 * inactive list actually has some pages to scan on this priority, we
	 * do not reclaim anything from the anonymous working set right now.
	 * Without the second condition we could end up never scanning an
	 * lruvec even if it has plenty of old anonymous pages unless the
	 * system is under heavy pressure.
	 */
	if (!IS_ENABLED(CONFIG_BALANCE_ANON_FILE_RECLAIM) &&
	    !inactive_is_low(lruvec, LRU_INACTIVE_FILE) &&
	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
		scan_balance = SCAN_FILE;
		goto out;
	}

	scan_balance = SCAN_FRACT;

	/*
	 * Calculate the pressure balance between anon and file pages.
	 *
	 * The amount of pressure we put on each LRU is inversely
	 * proportional to the cost of reclaiming each list, as
	 * determined by the share of pages that are refaulting, times
	 * the relative IO cost of bringing back a swapped out
	 * anonymous page vs reloading a filesystem page (swappiness).
	 *
	 * Although we limit that influence to ensure no list gets
	 * left behind completely: at least a third of the pressure is
	 * applied, before swappiness.
	 *
	 * With swappiness at 100, anon and file have equal IO cost.
	 */
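	/*
	 * Rough worked example, ignoring the "+ 1" rounding guards: with
	 * swappiness == 100 and sc->anon_cost == sc->file_cost, ap and fp
	 * both come to ~200, so SCAN_FRACT splits the scan target evenly.
	 * A costlier list gets a proportionally smaller fraction.
	 */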
	total_cost = sc->anon_cost + sc->file_cost;
	anon_cost = total_cost + sc->anon_cost;
	file_cost = total_cost + sc->file_cost;
	total_cost = anon_cost + file_cost;

	ap = swappiness * (total_cost + 1);
	ap /= anon_cost + 1;

	fp = (200 - swappiness) * (total_cost + 1);
	fp /= file_cost + 1;

	fraction[0] = ap;
	fraction[1] = fp;
	denominator = ap + fp;

out:
	*lru_pages = 0;
	for_each_evictable_lru(lru) {
		int file = is_file_lru(lru);
		unsigned long lruvec_size;
		unsigned long scan;

		lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
		scan = lruvec_size;
		*lru_pages += scan;
		scan >>= sc->priority;

		switch (scan_balance) {
		case SCAN_EQUAL:
			/* Scan lists relative to size */
			break;
		case SCAN_FRACT:
			/*
			 * Scan types proportional to swappiness and
			 * their relative recent reclaim efficiency.
			 * Make sure we don't miss the last page on
			 * the offlined memory cgroups because of a
			 * round-off error.
			 */
			scan = DIV64_U64_ROUND_UP(scan * fraction[file],
						  denominator);
			break;
		case SCAN_FILE:
		case SCAN_ANON:
			/* Scan one type exclusively */
			if ((scan_balance == SCAN_FILE) != file)
				scan = 0;
			break;
		default:
			/* Look ma, no brain */
			BUG();
		}

		nr[lru] = scan;
	}
}

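/*
 * Direct reclaim gives up on the shrink loops below once sc->isolate_count
 * has climbed past this limit, rather than keep churning the LRUs.
 */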
#define ISOLATE_LIMIT_CNT 5
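/*
 * Shrink the anon LRUs of one memcg in SWAP_CLUSTER_MAX batches, stopping
 * early once the overall reclaim target is met or the direct-reclaim
 * isolation limit above is exceeded.
 */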
void shrink_anon_memcg(struct pglist_data *pgdat,
		struct mem_cgroup *memcg, struct scan_control *sc,
		unsigned long *nr)
{
	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
	unsigned long nr_to_scan;
	enum lru_list lru;
	unsigned long nr_reclaimed = 0;
	struct blk_plug plug;

	blk_start_plug(&plug);

	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_ANON]) {
		for (lru = 0; lru <= LRU_ACTIVE_ANON; lru++) {
			if (nr[lru]) {
				nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
				nr[lru] -= nr_to_scan;
				nr_reclaimed +=
					shrink_list(lru, nr_to_scan,
						    lruvec, sc);
			}
		}
		if (sc->nr_reclaimed >= sc->nr_to_reclaim ||
		    (sc->isolate_count > ISOLATE_LIMIT_CNT &&
		     sc->invoker == DIRECT_RECLAIM))
			break;
	}
	blk_finish_plug(&plug);
	sc->nr_reclaimed += nr_reclaimed;
	sc->nr_reclaimed_anon += nr_reclaimed;
}

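/*
 * Return true if @tmcg is @mcg itself or one of its ancestors; the walk
 * stops at the root memcg, so everything counts as a child of root.
 */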
static inline bool memcg_is_child_of(struct mem_cgroup *mcg, struct mem_cgroup *tmcg)
{
	while (!mem_cgroup_is_root(mcg)) {
		if (mcg == tmcg)
			break;

		mcg = parent_mem_cgroup(mcg);
	}

	return (mcg == tmcg);
}

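/*
 * Walk the memcgs returned by get_next_memcg() that sit below the reclaim
 * target, give each a share of the node-wide anon scan target proportional
 * to its anon LRU size, and shrink its anon pages and slab while honouring
 * memory.min/memory.low protection.
 */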
static void shrink_anon(struct pglist_data *pgdat,
		struct scan_control *sc, unsigned long *nr)
{
	unsigned long reclaimed;
	unsigned long scanned;
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
	unsigned long nr_memcg[NR_LRU_LISTS];
	unsigned long nr_node_active = lruvec_lru_size(
			node_lruvec(pgdat), LRU_ACTIVE_ANON, MAX_NR_ZONES);
	unsigned long nr_node_inactive = lruvec_lru_size(
			node_lruvec(pgdat), LRU_INACTIVE_ANON, MAX_NR_ZONES);

	while ((memcg = get_next_memcg(memcg))) {
		struct lruvec *lruvec = NULL;

		if (!memcg_is_child_of(memcg, target_memcg))
			continue;

		lruvec = mem_cgroup_lruvec(memcg, pgdat);

		reclaimed = sc->nr_reclaimed;
		scanned = sc->nr_scanned;

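		/*
		 * Slice the node-wide anon scan targets by this memcg's share
		 * of the node's anon LRU pages; the "+ 1" keeps the divisor
		 * from being zero.
		 */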
		nr_memcg[LRU_ACTIVE_ANON] = nr[LRU_ACTIVE_ANON] *
			lruvec_lru_size(lruvec, LRU_ACTIVE_ANON,
					MAX_NR_ZONES) / (nr_node_active + 1);
		nr_memcg[LRU_INACTIVE_ANON] = nr[LRU_INACTIVE_ANON] *
			lruvec_lru_size(lruvec, LRU_INACTIVE_ANON,
					MAX_NR_ZONES) / (nr_node_inactive + 1);
		nr_memcg[LRU_ACTIVE_FILE] = 0;
		nr_memcg[LRU_INACTIVE_FILE] = 0;

		/*
		 * This loop can become CPU-bound when target memcgs
		 * aren't eligible for reclaim - either because they
		 * don't have any reclaimable pages, or because their
		 * memory is explicitly protected. Avoid soft lockups.
		 */
		cond_resched();

		mem_cgroup_calculate_protection(target_memcg, memcg);

		if (mem_cgroup_below_min(memcg)) {
			/*
			 * Hard protection.
			 * If there is no reclaimable memory, OOM.
			 */
			continue;
		} else if (mem_cgroup_below_low(memcg)) {
			/*
			 * Soft protection.
			 * Respect the protection only as long as
			 * there is an unprotected supply
			 * of reclaimable memory from other cgroups.
			 */
			if (!sc->memcg_low_reclaim) {
				sc->memcg_low_skipped = 1;
				continue;
			}
			memcg_memory_event(memcg, MEMCG_LOW);
		}

		shrink_anon_memcg(pgdat, memcg, sc, nr_memcg);
		shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
			    sc->priority);

		vmpressure(sc->gfp_mask, memcg, false,
			   sc->nr_scanned - scanned,
			   sc->nr_reclaimed - reclaimed);

		if (sc->nr_reclaimed >= sc->nr_to_reclaim ||
		    (sc->isolate_count > ISOLATE_LIMIT_CNT &&
		     sc->invoker == DIRECT_RECLAIM)) {
			get_next_memcg_break(memcg);
			break;
		}
	}
}

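/*
 * Shrink the node-level file LRU (with CONFIG_HYPERHOLD_FILE_LRU, file pages
 * are kept on the pgdat lruvec rather than per memcg) in SWAP_CLUSTER_MAX
 * batches until the requested scan counts are exhausted.
 */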
static void shrink_file(struct pglist_data *pgdat,
		struct scan_control *sc, unsigned long *nr)
{
	struct lruvec *lruvec = node_lruvec(pgdat);
	unsigned long nr_to_scan;
	enum lru_list lru;
	unsigned long nr_reclaimed = 0;
	struct blk_plug plug;

	blk_start_plug(&plug);

	while (nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) {
		for (lru = LRU_INACTIVE_FILE; lru <= LRU_ACTIVE_FILE; lru++) {
			if (nr[lru]) {
				nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
				nr[lru] -= nr_to_scan;
				nr_reclaimed +=
					shrink_list(lru, nr_to_scan,
						    lruvec, sc);
			}
		}
	}
	blk_finish_plug(&plug);
	sc->nr_reclaimed += nr_reclaimed;
	sc->nr_reclaimed_file += nr_reclaimed;
}

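/*
 * Hyperhold counterpart of shrink_node(): each iteration samples the anon
 * and file reclaim costs, works out the deactivation and cache-trim hints,
 * computes the scan targets, reclaims file pages from the node-level LRU
 * and anon pages per memcg, and then applies the usual writeback and
 * congestion throttling. Returns true if any progress was made.
 */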
bool shrink_node_hyperhold(struct pglist_data *pgdat, struct scan_control *sc)
{
	unsigned long nr_reclaimed, nr_scanned;
	struct lruvec *target_lruvec;
	bool reclaimable = false;
	unsigned long file;

	target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
	do {
		/* Get scan count for file and anon */
		unsigned long node_lru_pages = 0;
		unsigned long nr[NR_LRU_LISTS] = {0};

		memset(&sc->nr, 0, sizeof(sc->nr));
		nr_reclaimed = sc->nr_reclaimed;
		nr_scanned = sc->nr_scanned;

		/*
		 * Determine the scan balance between anon and file LRUs.
		 */
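		/*
		 * Note: anon_cost is read from the memcg-aware lruvec while
		 * file_cost comes from the node-level lruvec, matching the
		 * node-wide file LRU used elsewhere in this file.
		 */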
		spin_lock_irq(&pgdat->lru_lock);
		sc->anon_cost = mem_cgroup_lruvec(NULL, pgdat)->anon_cost;
		sc->file_cost = node_lruvec(pgdat)->file_cost;
		spin_unlock_irq(&pgdat->lru_lock);

		/*
		 * Target desirable inactive:active list ratios for the anon
		 * and file LRU lists.
		 */
		if (!sc->force_deactivate) {
			unsigned long refaults;

			refaults = lruvec_page_state(target_lruvec,
					WORKINGSET_ACTIVATE_ANON);
			if (refaults != target_lruvec->refaults[0] ||
			    inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
				sc->may_deactivate |= DEACTIVATE_ANON;
			else
				sc->may_deactivate &= ~DEACTIVATE_ANON;

			/*
			 * When refaults are being observed, it means a new
			 * workingset is being established. Deactivate to get
			 * rid of any stale active pages quickly.
			 */
#ifdef CONFIG_HYPERHOLD_FILE_LRU
			refaults = lruvec_page_state(node_lruvec(pgdat),
					WORKINGSET_ACTIVATE_FILE);
			if (refaults != node_lruvec(pgdat)->refaults[1] ||
			    inactive_is_low(node_lruvec(pgdat), LRU_INACTIVE_FILE))
				sc->may_deactivate |= DEACTIVATE_FILE;
#else
			refaults = lruvec_page_state(target_lruvec,
					WORKINGSET_ACTIVATE_FILE);
			if (refaults != target_lruvec->refaults[1] ||
			    inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
				sc->may_deactivate |= DEACTIVATE_FILE;
#endif
			else
				sc->may_deactivate &= ~DEACTIVATE_FILE;
		} else
			sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;

		/*
		 * If we have plenty of inactive file pages that aren't
		 * thrashing, try to reclaim those first before touching
		 * anonymous pages.
		 */
#ifdef CONFIG_HYPERHOLD_FILE_LRU
		file = lruvec_page_state(node_lruvec(pgdat), NR_INACTIVE_FILE);
#else
		file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
#endif
		if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
			sc->cache_trim_mode = 1;
		else
			sc->cache_trim_mode = 0;

		/*
		 * Prevent the reclaimer from falling into the cache trap: as
		 * cache pages start out inactive, every cache fault will tip
		 * the scan balance towards the file LRU. And as the file LRU
		 * shrinks, so does the window for rotation from references.
		 * This means we have a runaway feedback loop where a tiny
		 * thrashing file LRU becomes infinitely more attractive than
		 * anon pages. Try to detect this based on file LRU size.
		 */
		if (!cgroup_reclaim(sc)) {
			unsigned long total_high_wmark = 0;
			unsigned long free, anon;
			int z;

			free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
			file = node_page_state(pgdat, NR_ACTIVE_FILE) +
				node_page_state(pgdat, NR_INACTIVE_FILE);

			for (z = 0; z < MAX_NR_ZONES; z++) {
				struct zone *zone = &pgdat->node_zones[z];

				if (!managed_zone(zone))
					continue;

				total_high_wmark += high_wmark_pages(zone);
			}

			/*
			 * Consider anon: if that's low too, this isn't a
			 * runaway file reclaim problem, but rather just
			 * extreme pressure. Reclaim as per usual then.
			 */
			anon = node_page_state(pgdat, NR_INACTIVE_ANON);

			sc->file_is_tiny =
				file + free <= total_high_wmark &&
				!(sc->may_deactivate & DEACTIVATE_ANON) &&
				anon >> sc->priority;
		}

		get_scan_count_hyperhold(pgdat, sc, nr, &node_lru_pages);

		if (!cgroup_reclaim(sc)) {
			/* Shrink the Total-File-LRU */
			shrink_file(pgdat, sc, nr);
		}

		/* Shrink Anon by iterating score_list */
		shrink_anon(pgdat, sc, nr);

		if (sc->nr_reclaimed - nr_reclaimed)
			reclaimable = true;

		if (current_is_kswapd()) {
			/*
			 * If reclaim is isolating dirty pages under writeback,
			 * it implies that the long-lived page allocation rate
			 * is exceeding the page laundering rate. Either the
			 * global limits are not being effective at throttling
			 * processes due to the page distribution throughout
			 * zones or there is heavy usage of a slow backing
			 * device. The only option is to throttle from reclaim
			 * context which is not ideal as there is no guarantee
			 * the dirtying process is throttled in the same way
			 * balance_dirty_pages() manages.
			 *
			 * Once a node is flagged PGDAT_WRITEBACK, kswapd will
			 * count the number of pages under pages flagged for
			 * immediate reclaim and stall if any are encountered
			 * in the nr_immediate check below.
			 */
			if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
				set_bit(PGDAT_WRITEBACK, &pgdat->flags);

			/* Allow kswapd to start writing pages during reclaim. */
			if (sc->nr.unqueued_dirty == sc->nr.file_taken)
				set_bit(PGDAT_DIRTY, &pgdat->flags);

			/*
			 * If kswapd scans pages marked for immediate
			 * reclaim and under writeback (nr_immediate), it
			 * implies that pages are cycling through the LRU
			 * faster than they are written so also forcibly stall.
			 */
			if (sc->nr.immediate)
				congestion_wait(BLK_RW_ASYNC, HZ/10);
		}
		/*
		 * Legacy memcg will stall in page writeback so avoid forcibly
		 * stalling in wait_iff_congested().
		 */
		if ((current_is_kswapd() ||
		     (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
		    sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
			set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);

		/*
		 * Stall direct reclaim for IO completions if underlying BDIs
		 * and node is congested. Allow kswapd to continue until it
		 * starts encountering unqueued dirty pages or cycling through
		 * the LRU too quickly.
		 */
		if (!current_is_kswapd() && current_may_throttle() &&
		    !sc->hibernation_mode &&
		    test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
			wait_iff_congested(BLK_RW_ASYNC, HZ/10);

	} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
					 sc));
	/*
	 * Kswapd gives up on balancing particular nodes after too
	 * many failures to reclaim anything from them and goes to
	 * sleep. On reclaim progress, reset the failure counter. A
	 * successful direct reclaim run will revive a dormant kswapd.
	 */
	if (reclaimable)
		pgdat->kswapd_failures = 0;

	return reclaimable;
}