// SPDX-License-Identifier: GPL-2.0
/*
 * mm/zswapd.c
 *
 * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd.
 */

#include <linux/freezer.h>
#include <linux/memcg_policy.h>
#include <trace/events/vmscan.h>
#include <uapi/linux/sched/types.h>
#include <linux/zswapd.h>
#ifdef CONFIG_RECLAIM_ACCT
#include <linux/reclaim_acct.h>
#endif

#include "zswapd_internal.h"
#include "internal.h"

#define UNSET_ZRAM_WM_RATIO 0
#define ESWAP_PERCENT_CONSTANT 100
#define DEFAULT_ZRAM_WM_RATIO 37
#define SWAP_MORE_ZRAM (50 * (SZ_1M))

static wait_queue_head_t snapshotd_wait;
static atomic_t snapshotd_wait_flag;
static atomic_t snapshotd_init_flag = ATOMIC_INIT(0);
static struct task_struct *snapshotd_task;

static pid_t zswapd_pid = -1;
static unsigned long long last_anon_pagefault;
static unsigned long long anon_refault_ratio;
static unsigned long long zswapd_skip_interval;
static unsigned long last_zswapd_time;
static unsigned long last_snapshot_time;
bool last_round_is_empty;

DECLARE_RWSEM(gs_lock);
LIST_HEAD(gs_list);

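/*
 * Group swap backends register themselves on gs_list via
 * register_group_swap(); zswapd walks this list to query per-memcg data
 * sizes and to swap grouped pages in and out. Unregistering removes the
 * device from the list and frees it.
 */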
void unregister_group_swap(struct group_swap_device *gsdev)
{
	down_write(&gs_lock);
	list_del(&gsdev->list);
	up_write(&gs_lock);

	kfree(gsdev);
}
EXPORT_SYMBOL(unregister_group_swap);

struct group_swap_device *register_group_swap(struct group_swap_ops *ops, void *priv)
{
	struct group_swap_device *gsdev = kzalloc(sizeof(struct group_swap_device), GFP_KERNEL);

	if (!gsdev)
		return NULL;

	gsdev->priv = priv;
	gsdev->ops = ops;

	down_write(&gs_lock);
	list_add(&gsdev->list, &gs_list);
	up_write(&gs_lock);

	return gsdev;
}
EXPORT_SYMBOL(register_group_swap);

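/*
 * Sum the per-memcg counter of the given type (CACHE_SIZE, SWAP_SIZE,
 * CACHE_PAGE, SWAP_PAGE or CACHE_FAULT) across all registered group swap
 * devices.
 */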
u64 memcg_data_size(struct mem_cgroup *memcg, int type)
{
	struct group_swap_device *gsdev = NULL;
	u64 size = 0;

	down_read(&gs_lock);
	list_for_each_entry(gsdev, &gs_list, list)
		size += gsdev->ops->group_data_size(memcg->id.id, type, gsdev->priv);
	up_read(&gs_lock);

	return size;
}

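/*
 * Read back at most req_size of this memcg's swapped-out data from the
 * group swap devices. The request is first clamped by ub_ufs2zram_ratio so
 * that only the configured share of the memcg's swap data is brought back.
 */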
u64 swapin_memcg(struct mem_cgroup *memcg, u64 req_size)
{
	u64 swap_size = memcg_data_size(memcg, SWAP_SIZE);
	u64 read_size = 0;
	u64 ratio = atomic64_read(&memcg->memcg_reclaimed.ub_ufs2zram_ratio);
	struct group_swap_device *gsdev = NULL;

	if (req_size > div_u64(swap_size * ratio, ESWAP_PERCENT_CONSTANT))
		req_size = div_u64(swap_size * ratio, ESWAP_PERCENT_CONSTANT);
	down_read(&gs_lock);
	list_for_each_entry(gsdev, &gs_list, list) {
		read_size += gsdev->ops->group_read(memcg->id.id, req_size - read_size,
							gsdev->priv);
		if (read_size >= req_size)
			break;
	}
	up_read(&gs_lock);

	return read_size;
}

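/*
 * Write out part of this memcg's zram data to the group swap devices.
 * ub_zram2ufs_ratio limits how much of the memcg's total (zram + eswap)
 * data may end up swapped out: the request is clamped accordingly and
 * 0 is returned once the ratio has already been reached.
 */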
static u64 swapout_memcg(struct mem_cgroup *memcg, u64 req_size)
{
	u64 cache_size = memcg_data_size(memcg, CACHE_SIZE);
	u64 swap_size = memcg_data_size(memcg, SWAP_SIZE);
	u64 all_size = cache_size + swap_size;
	u64 write_size = 0;
	u32 ratio = atomic_read(&memcg->memcg_reclaimed.ub_zram2ufs_ratio);
	struct group_swap_device *gsdev = NULL;

	if (div_u64(all_size * ratio, ESWAP_PERCENT_CONSTANT) <= swap_size)
		return 0;
	if (req_size > div_u64(all_size * ratio, ESWAP_PERCENT_CONSTANT) - swap_size)
		req_size = div_u64(all_size * ratio, ESWAP_PERCENT_CONSTANT) - swap_size;
	down_read(&gs_lock);
	list_for_each_entry(gsdev, &gs_list, list) {
		write_size += gsdev->ops->group_write(memcg->id.id, req_size - write_size,
							gsdev->priv);
		if (write_size >= req_size)
			break;
	}
	up_read(&gs_lock);

	return write_size;
}

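/* Swap out up to req_size across all memcgs, stopping once the target is met. */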
static u64 swapout(u64 req_size)
{
	struct mem_cgroup *memcg = NULL;
	u64 write_size = 0;

	while ((memcg = get_next_memcg(memcg)) != NULL) {
		write_size += swapout_memcg(memcg, req_size - write_size);
		if (write_size >= req_size)
			break;
	}

	return write_size;
}

static unsigned long long get_zram_used_pages(void)
{
	struct mem_cgroup *memcg = NULL;
	unsigned long long zram_pages = 0;

	while ((memcg = get_next_memcg(memcg)) != NULL)
		zram_pages += memcg_data_size(memcg, CACHE_PAGE);

	return zram_pages;
}

static unsigned long long get_eswap_used_pages(void)
{
	struct mem_cgroup *memcg = NULL;
	unsigned long long eswap_pages = 0;

	while ((memcg = get_next_memcg(memcg)) != NULL)
		eswap_pages += memcg_data_size(memcg, SWAP_PAGE);

	return eswap_pages;
}

static unsigned long long get_zram_pagefault(void)
{
	struct mem_cgroup *memcg = NULL;
	unsigned long long cache_fault = 0;

	while ((memcg = get_next_memcg(memcg)) != NULL)
		cache_fault += memcg_data_size(memcg, CACHE_FAULT);

	return cache_fault;
}

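/*
 * Estimate the currently available buffer in MB: free pages plus the
 * configured fractions of the inactive and active file pages.
 */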
static unsigned int calc_sys_cur_avail_buffers(void)
{
	const unsigned int percent_constant = 100;
	unsigned long freemem;
	unsigned long active_file;
	unsigned long inactive_file;
	unsigned long buffers;

	freemem = global_zone_page_state(NR_FREE_PAGES) * PAGE_SIZE / SZ_1K;
	active_file = global_node_page_state(NR_ACTIVE_FILE) * PAGE_SIZE / SZ_1K;
	inactive_file = global_node_page_state(NR_INACTIVE_FILE) * PAGE_SIZE / SZ_1K;

	buffers = freemem + inactive_file * get_inactive_file_ratio() / percent_constant +
		active_file * get_active_file_ratio() / percent_constant;

	return (buffers * SZ_1K / SZ_1M); /* kb to mb */
}

void zswapd_status_show(struct seq_file *m)
{
	unsigned int buffers = calc_sys_cur_avail_buffers();

	seq_printf(m, "buffer_size:%u\n", buffers);
	seq_printf(m, "recent_refault:%llu\n", anon_refault_ratio);
}

pid_t get_zswapd_pid(void)
{
	return zswapd_pid;
}

static bool min_buffer_is_suitable(void)
{
	unsigned int buffers = calc_sys_cur_avail_buffers();

	if (buffers >= get_min_avail_buffers())
		return true;

	return false;
}

static bool buffer_is_suitable(void)
{
	unsigned int buffers = calc_sys_cur_avail_buffers();

	if (buffers >= get_avail_buffers())
		return true;

	return false;
}

static bool high_buffer_is_suitable(void)
{
	unsigned int buffers = calc_sys_cur_avail_buffers();

	if (buffers >= get_high_avail_buffers())
		return true;

	return false;
}

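/*
 * Record the current anon pagefault counters (global and per-memcg) so
 * that later refault checks can work on the delta since this snapshot.
 */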
static void snapshot_anon_refaults(void)
{
	struct mem_cgroup *memcg = NULL;

	while ((memcg = get_next_memcg(memcg)) != NULL)
		memcg->memcg_reclaimed.reclaimed_pagefault = memcg_data_size(memcg, CACHE_FAULT);

	last_anon_pagefault = get_zram_pagefault();
	last_snapshot_time = jiffies;
}

/*
 * Return true if the memcg's anon refault ratio since the last snapshot
 * exceeds its refault_threshold.
 */
static bool get_memcg_anon_refault_status(struct mem_cgroup *memcg)
{
	const unsigned int percent_constant = 100;
	unsigned long long anon_pagefault;
	unsigned long long anon_total;
	unsigned long long ratio;
	struct mem_cgroup_per_node *mz = NULL;
	struct lruvec *lruvec = NULL;

	if (!memcg)
		return false;

	anon_pagefault = memcg_data_size(memcg, CACHE_FAULT);
	if (anon_pagefault == memcg->memcg_reclaimed.reclaimed_pagefault)
		return false;

	mz = mem_cgroup_nodeinfo(memcg, 0);
	if (!mz)
		return false;

	lruvec = &mz->lruvec;
	if (!lruvec)
		return false;

	anon_total = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
		lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES) +
		memcg_data_size(memcg, SWAP_PAGE) + memcg_data_size(memcg, CACHE_PAGE);

	ratio = div64_u64((anon_pagefault - memcg->memcg_reclaimed.reclaimed_pagefault) *
		percent_constant, (anon_total + 1));
	if (ratio > atomic_read(&memcg->memcg_reclaimed.refault_threshold))
		return true;

	return false;
}

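/*
 * Return true if the system-wide anon refault rate since the last snapshot
 * exceeds get_area_anon_refault_threshold(). The rate is also stored in
 * anon_refault_ratio for reporting via zswapd_status_show().
 */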
static bool get_area_anon_refault_status(void)
{
	const unsigned int percent_constant = 1000;
	unsigned long long anon_pagefault;
	unsigned long long ratio;
	unsigned long long time;

	anon_pagefault = get_zram_pagefault();
	time = jiffies;
	if (anon_pagefault == last_anon_pagefault || time == last_snapshot_time)
		return false;

	ratio = div_u64((anon_pagefault - last_anon_pagefault) * percent_constant,
		(jiffies_to_msecs(time - last_snapshot_time) + 1));
	anon_refault_ratio = ratio;

	if (ratio > get_area_anon_refault_threshold())
		return true;

	return false;
}

void wakeup_snapshotd(void)
{
	unsigned long snapshot_interval;

	snapshot_interval = jiffies_to_msecs(jiffies - last_snapshot_time);
	if (snapshot_interval >= get_anon_refault_snapshot_min_interval()) {
		atomic_set(&snapshotd_wait_flag, 1);
		wake_up_interruptible(&snapshotd_wait);
	}
}

static int snapshotd(void *p)
{
	int ret;

	while (!kthread_should_stop()) {
		ret = wait_event_interruptible(snapshotd_wait, atomic_read(&snapshotd_wait_flag));
		if (ret)
			continue;

		atomic_set(&snapshotd_wait_flag, 0);

		snapshot_anon_refaults();
		count_vm_event(ZSWAPD_SNAPSHOT_TIMES);
	}

	return 0;
}

void set_snapshotd_init_flag(unsigned int val)
{
	atomic_set(&snapshotd_init_flag, val);
}

/*
 * This snapshotd start function will be called by init.
 */
int snapshotd_run(void)
{
	atomic_set(&snapshotd_wait_flag, 0);
	init_waitqueue_head(&snapshotd_wait);

	snapshotd_task = kthread_run(snapshotd, NULL, "snapshotd");
	if (IS_ERR(snapshotd_task)) {
		pr_err("Failed to start snapshotd\n");
		return PTR_ERR(snapshotd_task);
	}

	return 0;
}

static int __init snapshotd_init(void)
{
	snapshotd_run();

	return 0;
}
module_init(snapshotd_init);

static int get_zswapd_eswap_policy(void)
{
	if (get_zram_wm_ratio() == UNSET_ZRAM_WM_RATIO)
		return CHECK_BUFFER_ONLY;
	else
		return CHECK_BUFFER_ZRAMRATIO_BOTH;
}

static unsigned int get_policy_zram_wm_ratio(void)
{
	enum zswapd_eswap_policy policy = get_zswapd_eswap_policy();

	if (policy == CHECK_BUFFER_ONLY)
		return DEFAULT_ZRAM_WM_RATIO;
	else
		return get_zram_wm_ratio();
}

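/*
 * Derive the current zram watermark ratio: start from the policy ratio and,
 * when the available buffers fall short of the target, lower it in
 * proportion to the shortfall (converted from MB to pages and scaled by the
 * compress ratio relative to total RAM). When buffers already exceed the
 * target, the policy ratio is used as-is.
 */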
int get_zram_current_watermark(void)
{
	long long diff_buffers;
	const unsigned int percent_constant = 10;
	u64 nr_total;
	unsigned int zram_wm_ratio = get_policy_zram_wm_ratio();

	nr_total = totalram_pages();
	/* B_target - B_current */
	diff_buffers = get_avail_buffers() - calc_sys_cur_avail_buffers();
	/* MB to page */
	diff_buffers *= SZ_1M / PAGE_SIZE;
	/* after_comp to before_comp */
	diff_buffers *= get_compress_ratio();
	/* page to ratio */
	diff_buffers = div64_s64(diff_buffers * percent_constant, nr_total);

	return min((long long)zram_wm_ratio, zram_wm_ratio - diff_buffers);
}

bool zram_watermark_ok(void)
{
	const unsigned int percent_constant = 100;
	u64 nr_zram_used;
	u64 nr_wm;
	u64 ratio;

	ratio = get_zram_current_watermark();
	nr_zram_used = get_zram_used_pages();
	nr_wm = div_u64(totalram_pages() * ratio, percent_constant);
	if (nr_zram_used > nr_wm)
		return true;

	return false;
}

bool zram_watermark_exceed(void)
{
	u64 nr_zram_used;
	const unsigned long long nr_wm = get_zram_critical_threshold() * (SZ_1M / PAGE_SIZE);

	if (!nr_wm)
		return false;

	nr_zram_used = get_zram_used_pages();
	if (nr_zram_used > nr_wm)
		return true;
	return false;
}

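/*
 * Wake zswapd for this node: optionally trigger a refault snapshot first,
 * and skip the wakeup when the buffers are still above min_avail_buffers or
 * when recent rounds were empty and the skip interval has not elapsed yet.
 */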
void wakeup_zswapd(pg_data_t *pgdat)
{
	unsigned long interval;

	if (IS_ERR(pgdat->zswapd))
		return;

	if (!wq_has_sleeper(&pgdat->zswapd_wait))
		return;

	/*
	 * make anon pagefault snapshots
	 * wake up snapshotd
	 */
	if (atomic_read(&snapshotd_init_flag) == 1)
		wakeup_snapshotd();

	/* only wake up zswapd when the buffer is lower than min_avail_buffers */
	if (min_buffer_is_suitable())
		return;

	interval = jiffies_to_msecs(jiffies - last_zswapd_time);
	if (interval < zswapd_skip_interval) {
		count_vm_event(ZSWAPD_EMPTY_ROUND_SKIP_TIMES);
		return;
	}

	atomic_set(&pgdat->zswapd_wait_flag, 1);
	wake_up_interruptible(&pgdat->zswapd_wait);
}

void wake_all_zswapd(void)
{
	pg_data_t *pgdat = NULL;
	int nid;

	for_each_online_node(nid) {
		pgdat = NODE_DATA(nid);
		wakeup_zswapd(pgdat);
	}
}

#ifdef CONFIG_HYPERHOLD_FILE_LRU
static void zswapd_shrink_active_list(unsigned long nr_to_scan,
	struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru)
{
	unsigned int nr_deactivate;
	unsigned long nr_scanned;
	unsigned long nr_taken;

	struct page *page = NULL;
	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
	unsigned long *node_anon_cost = &pgdat->__lruvec.anon_cost;
	unsigned long *anon_cost = &lruvec->anon_cost;
	LIST_HEAD(l_inactive);
	LIST_HEAD(l_hold);

	lru_add_drain();

	spin_lock_irq(&pgdat->lru_lock);
	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, lru);
	__mod_node_page_state(pgdat, NR_ISOLATED_ANON, nr_taken);
	*anon_cost += nr_taken;
	*node_anon_cost += nr_taken;
	__count_vm_events(PGREFILL, nr_scanned);
	count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
	spin_unlock_irq(&pgdat->lru_lock);

	while (!list_empty(&l_hold)) {
		cond_resched();
		page = lru_to_page(&l_hold);
		list_del(&page->lru);

		if (unlikely(!page_evictable(page))) {
			putback_lru_page(page);
			continue;
		}

		ClearPageActive(page);
		SetPageWorkingset(page);
		list_add(&page->lru, &l_inactive);
	}

	spin_lock_irq(&pgdat->lru_lock);
	nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
	__mod_node_page_state(pgdat, NR_ISOLATED_ANON, -nr_taken);
	spin_unlock_irq(&pgdat->lru_lock);

	mem_cgroup_uncharge_list(&l_inactive);
	free_unref_page_list(&l_inactive);

	trace_mm_vmscan_lru_zswapd_shrink_active(pgdat->node_id, nr_taken,
		nr_deactivate, sc->priority);
}

static unsigned long zswapd_shrink_list(enum lru_list lru,
	unsigned long nr_to_scan, struct lruvec *lruvec,
	struct scan_control *sc)
{
#ifdef CONFIG_RECLAIM_ACCT
	unsigned long nr_reclaimed;

	reclaimacct_substage_start(RA_SHRINKANON);
#endif
	if (is_active_lru(lru)) {
		if (sc->may_deactivate & (1 << is_file_lru(lru)))
			zswapd_shrink_active_list(nr_to_scan, lruvec, sc, lru);
		else
			sc->skipped_deactivate = 1;
#ifdef CONFIG_RECLAIM_ACCT
		reclaimacct_substage_end(RA_SHRINKANON, 0, NULL);
#endif
		return 0;
	}

#ifdef CONFIG_RECLAIM_ACCT
	nr_reclaimed = shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
	reclaimacct_substage_end(RA_SHRINKANON, nr_reclaimed, NULL);
	return nr_reclaimed;
#else
	return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
#endif
}

static void zswapd_shrink_anon_memcg(struct pglist_data *pgdat,
	struct mem_cgroup *memcg, struct scan_control *sc, unsigned long *nr)
{
	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
	unsigned long nr_reclaimed = 0;
	unsigned long nr_to_scan;
	struct blk_plug plug;
	enum lru_list lru;

	blk_start_plug(&plug);

	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_ANON]) {
		for (lru = 0; lru <= LRU_ACTIVE_ANON; lru++) {
			if (nr[lru]) {
				nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
				nr[lru] -= nr_to_scan;
				nr_reclaimed += zswapd_shrink_list(lru,
					nr_to_scan, lruvec, sc);
			}
		}
	}

	blk_finish_plug(&plug);
	sc->nr_reclaimed += nr_reclaimed;
}
#endif

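/*
 * Walk all memcgs and shrink their anon LRUs. A memcg is skipped when it is
 * refaulting heavily or when its zram+eswap share already exceeds
 * ub_mem2zram_ratio; the walk stops early once the high buffer target or
 * the reclaim target is met.
 */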
static bool zswapd_shrink_anon(pg_data_t *pgdat, struct scan_control *sc)
{
	const unsigned int percent_constant = 100;
	struct mem_cgroup *memcg = NULL;
	unsigned long nr[NR_LRU_LISTS];

	while ((memcg = get_next_memcg(memcg)) != NULL) {
		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
		u64 nr_active, nr_inactive, nr_zram, nr_eswap, zram_ratio;

		/* reclaim and try to meet the high buffer watermark */
		if (high_buffer_is_suitable()) {
			get_next_memcg_break(memcg);
			break;
		}

		if (get_memcg_anon_refault_status(memcg)) {
			count_vm_event(ZSWAPD_MEMCG_REFAULT_SKIP);
			continue;
		}

		nr_active = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES);
		nr_inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
		nr_zram = memcg_data_size(memcg, CACHE_PAGE);
		nr_eswap = memcg_data_size(memcg, SWAP_PAGE);

		zram_ratio = div64_u64((nr_zram + nr_eswap) * percent_constant,
			(nr_inactive + nr_active + nr_zram + nr_eswap + 1));
		if (zram_ratio >= (u32)atomic_read(&memcg->memcg_reclaimed.ub_mem2zram_ratio)) {
			count_vm_event(ZSWAPD_MEMCG_RATIO_SKIP);
			continue;
		}

		nr[LRU_ACTIVE_ANON] = nr_active >> (unsigned int)sc->priority;
		nr[LRU_INACTIVE_ANON] = nr_inactive >> (unsigned int)sc->priority;
		nr[LRU_ACTIVE_FILE] = 0;
		nr[LRU_INACTIVE_FILE] = 0;

#ifdef CONFIG_HYPERHOLD_FILE_LRU
		zswapd_shrink_anon_memcg(pgdat, memcg, sc, nr);
#else
		shrink_lruvec(lruvec, sc);
#endif
		shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);

		if (sc->nr_reclaimed >= sc->nr_to_reclaim) {
			get_next_memcg_break(memcg);
			break;
		}
	}

	return sc->nr_scanned >= sc->nr_to_reclaim;
}

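/*
 * Reclaim target for one round: the gap between the high buffer target and
 * the current buffer level, capped at zswapd_max_reclaim_size and converted
 * from MB to pages.
 */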
static u64 __calc_nr_to_reclaim(void)
{
	unsigned int buffers;
	unsigned int high_buffers;
	unsigned int max_reclaim_size;
	u64 reclaim_size = 0;

	high_buffers = get_high_avail_buffers();
	buffers = calc_sys_cur_avail_buffers();
	max_reclaim_size = get_zswapd_max_reclaim_size();
	if (buffers < high_buffers)
		reclaim_size = high_buffers - buffers;

	/* the reclaim target of one round is capped at max_reclaim_size */
	reclaim_size = min(reclaim_size, (u64)max_reclaim_size);

	/* MB to pages */
	return div_u64(reclaim_size * SZ_1M, PAGE_SIZE);
}

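/*
 * Main reclaim loop for one wakeup: keep shrinking anon memory at
 * progressively more aggressive scan priorities until the high buffer
 * target is met, then update the empty-round bookkeeping that drives the
 * exponential skip interval.
 */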
static void zswapd_shrink_node(pg_data_t *pgdat)
{
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.order = 0,
		.priority = DEF_PRIORITY / 2,
		.may_writepage = !laptop_mode,
		.may_unmap = 1,
		.may_swap = 1,
		.reclaim_idx = MAX_NR_ZONES - 1,
	};
	const unsigned int increase_rate = 2;

	do {
		unsigned long nr_reclaimed = sc.nr_reclaimed;
		bool raise_priority = true;

		/* reclaim and try to meet the high buffer watermark */
		if (high_buffer_is_suitable())
			break;

		sc.nr_scanned = 0;
		sc.nr_to_reclaim = __calc_nr_to_reclaim();

		if (zswapd_shrink_anon(pgdat, &sc))
			raise_priority = false;
		count_vm_events(ZSWAPD_SCANNED, sc.nr_scanned);
		count_vm_events(ZSWAPD_RECLAIMED, sc.nr_reclaimed);
		if (try_to_freeze() || kthread_should_stop())
			break;

		nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
		if (raise_priority || !nr_reclaimed)
			sc.priority--;
	} while (sc.priority >= 1);

	/*
	 * When the first empty round occurs, set the interval to t.
	 * If the following round is still empty, set the interval to 2t.
	 * If the rounds stay empty, then 4t, 8t, and so on, but never more
	 * than max_skip_interval.
	 * Once a non-empty round occurs, reset the interval to 0.
	 */
	if (sc.nr_reclaimed < get_empty_round_check_threshold()) {
		count_vm_event(ZSWAPD_EMPTY_ROUND);
		if (last_round_is_empty)
			zswapd_skip_interval = min(zswapd_skip_interval *
				increase_rate, get_max_skip_interval());
		else
			zswapd_skip_interval = get_empty_round_skip_interval();
		last_round_is_empty = true;
	} else {
		zswapd_skip_interval = 0;
		last_round_is_empty = false;
	}
}

u64 zram_watermark_diff(void)
{
	const unsigned int percent_constant = 100;
	u64 nr_zram_used;
	u64 nr_wm;
	u64 ratio;

	ratio = get_zram_current_watermark();
	nr_zram_used = get_zram_used_pages();
	nr_wm = div_u64(totalram_pages() * ratio, percent_constant);
	if (nr_zram_used > nr_wm)
		return (nr_zram_used - nr_wm) * PAGE_SIZE + SWAP_MORE_ZRAM;

	return 0;
}

u64 zswapd_buffer_diff(void)
{
	u64 buffers;
	u64 avail;

	buffers = calc_sys_cur_avail_buffers();
	avail = get_high_avail_buffers();
	if (buffers < avail)
		return (avail - buffers) * SZ_1M;

	return 0;
}

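/*
 * Decide how much should be written out to eswap this round, depending on
 * the policy: either the larger of the zram watermark overshoot and the
 * buffer shortfall, or the buffer shortfall alone once the zram watermark
 * is exceeded or refaults were detected.
 */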
u64 get_do_eswap_size(bool refault)
{
	u64 size = 0;
	enum zswapd_eswap_policy policy = get_zswapd_eswap_policy();

	if (policy == CHECK_BUFFER_ZRAMRATIO_BOTH)
		size = max(zram_watermark_diff(), zswapd_buffer_diff());
	else if (policy == CHECK_BUFFER_ONLY && (zram_watermark_ok() || refault))
		size = zswapd_buffer_diff();

	return size;
}

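/*
 * Per-node zswapd kthread: sleep until woken, report pressure, reclaim anon
 * memory via zswapd_shrink_node() (skipped when the area is refaulting),
 * then write data out to eswap and escalate the pressure level if the
 * buffers are still low.
 */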
static int zswapd(void *p)
{
	struct task_struct *tsk = current;
	pg_data_t *pgdat = (pg_data_t *)p;
	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
#ifdef CONFIG_RECLAIM_ACCT
	struct reclaim_acct ra = {0};
#endif

	/* save zswapd pid for schedule strategy */
	zswapd_pid = tsk->pid;

	if (!cpumask_empty(cpumask))
		set_cpus_allowed_ptr(tsk, cpumask);

	set_freezable();

	while (!kthread_should_stop()) {
		bool refault = false;
		u64 size = 0;

		(void)wait_event_freezable(pgdat->zswapd_wait,
			atomic_read(&pgdat->zswapd_wait_flag));
		atomic_set(&pgdat->zswapd_wait_flag, 0);
		count_vm_event(ZSWAPD_WAKEUP);
		zswapd_pressure_report(LEVEL_LOW);

		if (get_area_anon_refault_status()) {
			refault = true;
			count_vm_event(ZSWAPD_REFAULT);
			goto do_eswap;
		}

#ifdef CONFIG_RECLAIM_ACCT
		reclaimacct_start(ZSWAPD_RECLAIM, &ra);
#endif
		zswapd_shrink_node(pgdat);
#ifdef CONFIG_RECLAIM_ACCT
		reclaimacct_end(ZSWAPD_RECLAIM);
#endif
		last_zswapd_time = jiffies;

do_eswap:
		size = get_do_eswap_size(refault);
		if (size >= SZ_1M) {
			count_vm_event(ZSWAPD_SWAPOUT);
			size = swapout(size);
		}

		if (!buffer_is_suitable()) {
			if (free_swap_is_low() || zram_watermark_exceed()) {
				zswapd_pressure_report(LEVEL_CRITICAL);
				count_vm_event(ZSWAPD_CRITICAL_PRESS);
				pr_info("%s:zrampages:%llu, eswappages:%llu\n", __func__,
					get_zram_used_pages(), get_eswap_used_pages());
			} else {
				zswapd_pressure_report(LEVEL_MEDIUM);
				count_vm_event(ZSWAPD_MEDIUM_PRESS);
			}
		}
	}

	return 0;
}

/*
 * This zswapd start function will be called by init and node-hot-add.
 */
int zswapd_run(int nid)
{
	const unsigned int priority_less = 5;
	struct sched_param param = {
		.sched_priority = MAX_PRIO - priority_less,
	};
	pg_data_t *pgdat = NODE_DATA(nid);

	if (pgdat->zswapd)
		return 0;

	atomic_set(&pgdat->zswapd_wait_flag, 0);
	pgdat->zswapd = kthread_create(zswapd, pgdat, "zswapd%d", nid);
	if (IS_ERR(pgdat->zswapd)) {
		pr_err("Failed to start zswapd on node %d\n", nid);
		return PTR_ERR(pgdat->zswapd);
	}

	sched_setscheduler_nocheck(pgdat->zswapd, SCHED_NORMAL, &param);
	set_user_nice(pgdat->zswapd, PRIO_TO_NICE(param.sched_priority));
	wake_up_process(pgdat->zswapd);

	return 0;
}

/*
 * Called by memory hotplug when all memory in a node is offlined. Caller must
 * hold mem_hotplug_begin/end().
 */
void zswapd_stop(int nid)
{
	struct task_struct *zswapd = NODE_DATA(nid)->zswapd;

	if (zswapd) {
		kthread_stop(zswapd);
		NODE_DATA(nid)->zswapd = NULL;
	}

	zswapd_pid = -1;
}

/*
 * It's optimal to keep zswapd threads on the same CPUs as their memory, but
 * not required for correctness. So if the last cpu in a node goes away,
 * we get changed to run anywhere: as the first one comes back, restore
 * their cpu bindings.
 */
static int zswapd_cpu_online(unsigned int cpu)
{
	int nid;

	for_each_node_state(nid, N_MEMORY) {
		pg_data_t *pgdat = NODE_DATA(nid);
		const struct cpumask *mask;

		mask = cpumask_of_node(pgdat->node_id);
		if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
			/* One of our CPUs online: restore mask */
			set_cpus_allowed_ptr(pgdat->zswapd, mask);
	}

	return 0;
}

static int __init zswapd_init(void)
{
	int nid;
	int ret;

	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/zswapd:online",
		zswapd_cpu_online, NULL);
	if (ret < 0) {
		pr_err("zswapd: failed to register hotplug callbacks.\n");
		return ret;
	}

	for_each_node_state(nid, N_MEMORY)
		zswapd_run(nid);

	return 0;
}
module_init(zswapd_init);