// SPDX-License-Identifier: GPL-2.0
/*
 * mm/zswapd.c
 *
 * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd.
 */

#include <linux/freezer.h>
#include <linux/memcg_policy.h>
#include <trace/events/vmscan.h>
#include <uapi/linux/sched/types.h>
#include <linux/zswapd.h>
#ifdef CONFIG_RECLAIM_ACCT
#include <linux/reclaim_acct.h>
#endif

#include "zswapd_internal.h"
#include "internal.h"

#define UNSET_ZRAM_WM_RATIO 0
#define ESWAP_PERCENT_CONSTANT 100
#define DEFAULT_ZRAM_WM_RATIO 37
#define SWAP_MORE_ZRAM (50 * (SZ_1M))

static wait_queue_head_t snapshotd_wait;
static atomic_t snapshotd_wait_flag;
static atomic_t snapshotd_init_flag = ATOMIC_INIT(0);
static struct task_struct *snapshotd_task;

static pid_t zswapd_pid = -1;
static unsigned long long last_anon_pagefault;
static unsigned long long anon_refault_ratio;
static unsigned long long zswapd_skip_interval;
static unsigned long last_zswapd_time;
static unsigned long last_snapshot_time;
bool last_round_is_empty;


DECLARE_RWSEM(gs_lock);
LIST_HEAD(gs_list);

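/*
 * Group swap backends register a group_swap_device on gs_list (protected
 * by gs_lock).  The ops provide group_data_size(), group_read() and
 * group_write() keyed by memcg id; zswapd uses them below to size, swap
 * in and swap out per-memcg compressed data.
 *
 * Illustrative (hypothetical) backend usage:
 *	gsdev = register_group_swap(&my_group_swap_ops, my_priv);
 *	...
 *	unregister_group_swap(gsdev);
 */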
void unregister_group_swap(struct group_swap_device *gsdev)
{
	down_write(&gs_lock);
	list_del(&gsdev->list);
	up_write(&gs_lock);

	kfree(gsdev);
}
EXPORT_SYMBOL(unregister_group_swap);

struct group_swap_device *register_group_swap(struct group_swap_ops *ops, void *priv)
{
	struct group_swap_device *gsdev = kzalloc(sizeof(struct group_swap_device), GFP_KERNEL);

	if (!gsdev)
		return NULL;

	gsdev->priv = priv;
	gsdev->ops = ops;

	down_write(&gs_lock);
	list_add(&gsdev->list, &gs_list);
	up_write(&gs_lock);

	return gsdev;
}
EXPORT_SYMBOL(register_group_swap);

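/* Sum the per-memcg data size of the given type over all registered backends. */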
u64 memcg_data_size(struct mem_cgroup *memcg, int type)
{
	struct group_swap_device *gsdev = NULL;
	u64 size = 0;

	down_read(&gs_lock);
	list_for_each_entry(gsdev, &gs_list, list)
		size += gsdev->ops->group_data_size(memcg->id.id, type, gsdev->priv);
	up_read(&gs_lock);

	return size;
}

u64 swapin_memcg(struct mem_cgroup *memcg, u64 req_size)
{
	u64 swap_size = memcg_data_size(memcg, SWAP_SIZE);
	u64 read_size = 0;
	u64 ratio = atomic64_read(&memcg->memcg_reclaimed.ub_ufs2zram_ratio);
	struct group_swap_device *gsdev = NULL;

	if (req_size > div_u64(swap_size * ratio, ESWAP_PERCENT_CONSTANT))
		req_size = div_u64(swap_size * ratio, ESWAP_PERCENT_CONSTANT);
	down_read(&gs_lock);
	list_for_each_entry(gsdev, &gs_list, list) {
		read_size += gsdev->ops->group_read(memcg->id.id, req_size - read_size,
							gsdev->priv);
		if (read_size >= req_size)
			break;
	}
	up_read(&gs_lock);

	return read_size;
}

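/*
 * Write back at most enough of this memcg's zram data that its swapped
 * (eswap) share reaches ub_zram2ufs_ratio percent of cache + swap,
 * spreading the request across the registered backends.
 */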
static u64 swapout_memcg(struct mem_cgroup *memcg, u64 req_size)
{
	u64 cache_size = memcg_data_size(memcg, CACHE_SIZE);
	u64 swap_size = memcg_data_size(memcg, SWAP_SIZE);
	u64 all_size = cache_size + swap_size;
	u64 write_size = 0;
	u32 ratio = atomic_read(&memcg->memcg_reclaimed.ub_zram2ufs_ratio);
	struct group_swap_device *gsdev = NULL;

	if (div_u64(all_size * ratio, ESWAP_PERCENT_CONSTANT) <= swap_size)
		return 0;
	if (req_size > div_u64(all_size * ratio, ESWAP_PERCENT_CONSTANT) - swap_size)
		req_size = div_u64(all_size * ratio, ESWAP_PERCENT_CONSTANT) - swap_size;
	down_read(&gs_lock);
	list_for_each_entry(gsdev, &gs_list, list) {
		write_size += gsdev->ops->group_write(memcg->id.id, req_size - write_size,
							gsdev->priv);
		if (write_size >= req_size)
			break;
	}
	up_read(&gs_lock);

	return write_size;
}

static u64 swapout(u64 req_size)
{
	struct mem_cgroup *memcg = NULL;
	u64 write_size = 0;

	while ((memcg = get_next_memcg(memcg)) != NULL) {
		write_size += swapout_memcg(memcg, req_size - write_size);
		if (write_size >= req_size)
			break;
	}

	return write_size;
}

static unsigned long long get_zram_used_pages(void)
{
	struct mem_cgroup *memcg = NULL;
	unsigned long long zram_pages = 0;

	while ((memcg = get_next_memcg(memcg)) != NULL)
		zram_pages += memcg_data_size(memcg, CACHE_PAGE);

	return zram_pages;
}

static unsigned long long get_eswap_used_pages(void)
{
	struct mem_cgroup *memcg = NULL;
	unsigned long long eswap_pages = 0;

	while ((memcg = get_next_memcg(memcg)) != NULL)
		eswap_pages += memcg_data_size(memcg, SWAP_PAGE);

	return eswap_pages;
}

static unsigned long long get_zram_pagefault(void)
{
	struct mem_cgroup *memcg = NULL;
	unsigned long long cache_fault = 0;

	while ((memcg = get_next_memcg(memcg)) != NULL)
		cache_fault += memcg_data_size(memcg, CACHE_FAULT);

	return cache_fault;
}

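/*
 * Estimate the currently available buffer, in MB: free pages plus the
 * tunable shares of inactive and active file pages.
 */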
static unsigned int calc_sys_cur_avail_buffers(void)
{
	const unsigned int percent_constant = 100;
	unsigned long freemem;
	unsigned long active_file;
	unsigned long inactive_file;
	unsigned long buffers;

	freemem = global_zone_page_state(NR_FREE_PAGES) * PAGE_SIZE / SZ_1K;
	active_file = global_node_page_state(NR_ACTIVE_FILE) * PAGE_SIZE / SZ_1K;
	inactive_file = global_node_page_state(NR_INACTIVE_FILE) * PAGE_SIZE / SZ_1K;

	buffers = freemem + inactive_file * get_inactive_file_ratio() / percent_constant +
		active_file * get_active_file_ratio() / percent_constant;

	return (buffers * SZ_1K / SZ_1M); /* kb to mb */
}

void zswapd_status_show(struct seq_file *m)
{
	unsigned int buffers = calc_sys_cur_avail_buffers();

	seq_printf(m, "buffer_size:%u\n", buffers);
	seq_printf(m, "recent_refault:%llu\n", anon_refault_ratio);
}

pid_t get_zswapd_pid(void)
{
	return zswapd_pid;
}

static bool min_buffer_is_suitable(void)
{
	unsigned int buffers = calc_sys_cur_avail_buffers();

	if (buffers >= get_min_avail_buffers())
		return true;

	return false;
}

static bool buffer_is_suitable(void)
{
	unsigned int buffers = calc_sys_cur_avail_buffers();

	if (buffers >= get_avail_buffers())
		return true;

	return false;
}

static bool high_buffer_is_suitable(void)
{
	unsigned int buffers = calc_sys_cur_avail_buffers();

	if (buffers >= get_high_avail_buffers())
		return true;

	return false;
}

static void snapshot_anon_refaults(void)
{
	struct mem_cgroup *memcg = NULL;

	while ((memcg = get_next_memcg(memcg)) != NULL)
		memcg->memcg_reclaimed.reclaimed_pagefault = memcg_data_size(memcg, CACHE_FAULT);

	last_anon_pagefault = get_zram_pagefault();
	last_snapshot_time = jiffies;
}

/*
 * Return true if this memcg's anon refault ratio since the last snapshot
 * exceeds its refault threshold.
 */
static bool get_memcg_anon_refault_status(struct mem_cgroup *memcg)
{
	const unsigned int percent_constant = 100;
	unsigned long long anon_pagefault;
	unsigned long long anon_total;
	unsigned long long ratio;
	struct mem_cgroup_per_node *mz = NULL;
	struct lruvec *lruvec = NULL;

	if (!memcg)
		return false;

	anon_pagefault = memcg_data_size(memcg, CACHE_FAULT);
	if (anon_pagefault == memcg->memcg_reclaimed.reclaimed_pagefault)
		return false;

	mz = mem_cgroup_nodeinfo(memcg, 0);
	if (!mz)
		return false;

	lruvec = &mz->lruvec;
	if (!lruvec)
		return false;

	anon_total = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
		lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES) +
		memcg_data_size(memcg, SWAP_PAGE) + memcg_data_size(memcg, CACHE_PAGE);

	ratio = div64_u64((anon_pagefault - memcg->memcg_reclaimed.reclaimed_pagefault) *
			percent_constant, (anon_total + 1));
	if (ratio > atomic_read(&memcg->memcg_reclaimed.refault_threshold))
		return true;

	return false;
}

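/*
 * System-wide variant: compare the zram pagefault rate since the last
 * snapshot (faults per millisecond, scaled by 1000) against the global
 * area refault threshold, and record it in anon_refault_ratio.
 */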
static bool get_area_anon_refault_status(void)
{
	const unsigned int percent_constant = 1000;
	unsigned long long anon_pagefault;
	unsigned long long ratio;
	unsigned long long time;

	anon_pagefault = get_zram_pagefault();
	time = jiffies;
	if (anon_pagefault == last_anon_pagefault || time == last_snapshot_time)
		return false;

	ratio = div_u64((anon_pagefault - last_anon_pagefault) * percent_constant,
			(jiffies_to_msecs(time - last_snapshot_time) + 1));
	anon_refault_ratio = ratio;

	if (ratio > get_area_anon_refault_threshold())
		return true;

	return false;
}

void wakeup_snapshotd(void)
{
	unsigned long snapshot_interval;

	snapshot_interval = jiffies_to_msecs(jiffies - last_snapshot_time);
	if (snapshot_interval >= get_anon_refault_snapshot_min_interval()) {
		atomic_set(&snapshotd_wait_flag, 1);
		wake_up_interruptible(&snapshotd_wait);
	}
}

static int snapshotd(void *p)
{
	int ret;

	while (!kthread_should_stop()) {
		ret = wait_event_interruptible(snapshotd_wait, atomic_read(&snapshotd_wait_flag));
		if (ret)
			continue;

		atomic_set(&snapshotd_wait_flag, 0);

		snapshot_anon_refaults();
		count_vm_event(ZSWAPD_SNAPSHOT_TIMES);
	}

	return 0;
}

void set_snapshotd_init_flag(unsigned int val)
{
	atomic_set(&snapshotd_init_flag, val);
}

/*
 * This snapshotd start function will be called by init.
 */
int snapshotd_run(void)
{
	atomic_set(&snapshotd_wait_flag, 0);
	init_waitqueue_head(&snapshotd_wait);

	snapshotd_task = kthread_run(snapshotd, NULL, "snapshotd");
	if (IS_ERR(snapshotd_task)) {
		pr_err("Failed to start snapshotd\n");
		return PTR_ERR(snapshotd_task);
	}

	return 0;
}

static int __init snapshotd_init(void)
{
	snapshotd_run();

	return 0;
}
module_init(snapshotd_init);

static int get_zswapd_eswap_policy(void)
{
	if (get_zram_wm_ratio() == UNSET_ZRAM_WM_RATIO)
		return CHECK_BUFFER_ONLY;
	else
		return CHECK_BUFFER_ZRAMRATIO_BOTH;
}

static unsigned int get_policy_zram_wm_ratio(void)
{
	enum zswapd_eswap_policy policy = get_zswapd_eswap_policy();

	if (policy == CHECK_BUFFER_ONLY)
		return DEFAULT_ZRAM_WM_RATIO;
	else
		return get_zram_wm_ratio();
}

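/*
 * Lower the configured zram watermark ratio by the current buffer deficit:
 * the gap between the target and current available buffers (MB) is
 * converted to pages, scaled by the compression ratio and expressed
 * against total RAM.  When buffers already exceed the target, the ratio
 * stays at zram_wm_ratio.
 */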
int get_zram_current_watermark(void)
{
	long long diff_buffers;
	const unsigned int percent_constant = 10;
	u64 nr_total;
	unsigned int zram_wm_ratio = get_policy_zram_wm_ratio();

	nr_total = totalram_pages();
	/* B_target - B_current */
	diff_buffers = get_avail_buffers() - calc_sys_cur_avail_buffers();
	/* MB to page */
	diff_buffers *= SZ_1M / PAGE_SIZE;
	/* after_comp to before_comp */
	diff_buffers *= get_compress_ratio();
	/* page to ratio */
	diff_buffers = div64_s64(diff_buffers * percent_constant, nr_total);

	return min((long long)zram_wm_ratio, zram_wm_ratio - diff_buffers);
}

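/* True when zram holds more pages than the current watermark share of RAM. */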
bool zram_watermark_ok(void)
{
	const unsigned int percent_constant = 100;
	u64 nr_zram_used;
	u64 nr_wm;
	u64 ratio;

	ratio = get_zram_current_watermark();
	nr_zram_used = get_zram_used_pages();
	nr_wm = div_u64(totalram_pages() * ratio, percent_constant);
	if (nr_zram_used > nr_wm)
		return true;

	return false;
}

bool zram_watermark_exceed(void)
{
	u64 nr_zram_used;
	const unsigned long long nr_wm = get_zram_critical_threshold() * (SZ_1M / PAGE_SIZE);

	if (!nr_wm)
		return false;

	nr_zram_used = get_zram_used_pages();
	if (nr_zram_used > nr_wm)
		return true;
	return false;
}

void wakeup_zswapd(pg_data_t *pgdat)
{
	unsigned long interval;

	if (IS_ERR(pgdat->zswapd))
		return;

	if (!wq_has_sleeper(&pgdat->zswapd_wait))
		return;

	/*
	 * Wake up snapshotd to take anon pagefault snapshots.
	 */
	if (atomic_read(&snapshotd_init_flag) == 1)
		wakeup_snapshotd();

	/* only wake up zswapd when the buffer is lower than min_avail_buffers */
	if (min_buffer_is_suitable())
		return;

	interval = jiffies_to_msecs(jiffies - last_zswapd_time);
	if (interval < zswapd_skip_interval) {
		count_vm_event(ZSWAPD_EMPTY_ROUND_SKIP_TIMES);
		return;
	}

	atomic_set(&pgdat->zswapd_wait_flag, 1);
	wake_up_interruptible(&pgdat->zswapd_wait);
}

void wake_all_zswapd(void)
{
	pg_data_t *pgdat = NULL;
	int nid;

	for_each_online_node(nid) {
		pgdat = NODE_DATA(nid);
		wakeup_zswapd(pgdat);
	}
}

#ifdef CONFIG_HYPERHOLD_FILE_LRU
static void zswapd_shrink_active_list(unsigned long nr_to_scan,
	struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru)
{
	unsigned int nr_deactivate;
	unsigned long nr_scanned;
	unsigned long nr_taken;

	struct page *page = NULL;
	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
	unsigned long *node_anon_cost = &pgdat->__lruvec.anon_cost;
	unsigned long *anon_cost = &lruvec->anon_cost;
	LIST_HEAD(l_inactive);
	LIST_HEAD(l_hold);

	lru_add_drain();

	spin_lock_irq(&pgdat->lru_lock);
	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, lru);
	__mod_node_page_state(pgdat, NR_ISOLATED_ANON, nr_taken);
	*anon_cost += nr_taken;
	*node_anon_cost += nr_taken;
	__count_vm_events(PGREFILL, nr_scanned);
	count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
	spin_unlock_irq(&pgdat->lru_lock);

	while (!list_empty(&l_hold)) {
		cond_resched();
		page = lru_to_page(&l_hold);
		list_del(&page->lru);

		if (unlikely(!page_evictable(page))) {
			putback_lru_page(page);
			continue;
		}

		ClearPageActive(page);
		SetPageWorkingset(page);
		list_add(&page->lru, &l_inactive);
	}

	spin_lock_irq(&pgdat->lru_lock);
	nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
	__mod_node_page_state(pgdat, NR_ISOLATED_ANON, -nr_taken);
	spin_unlock_irq(&pgdat->lru_lock);

	mem_cgroup_uncharge_list(&l_inactive);
	free_unref_page_list(&l_inactive);

	trace_mm_vmscan_lru_zswapd_shrink_active(pgdat->node_id, nr_taken,
			nr_deactivate, sc->priority);
}

static unsigned long zswapd_shrink_list(enum lru_list lru,
		unsigned long nr_to_scan, struct lruvec *lruvec,
		struct scan_control *sc)
{
#ifdef CONFIG_RECLAIM_ACCT
	unsigned long nr_reclaimed;

	reclaimacct_substage_start(RA_SHRINKANON);
#endif
	if (is_active_lru(lru)) {
		if (sc->may_deactivate & (1 << is_file_lru(lru)))
			zswapd_shrink_active_list(nr_to_scan, lruvec, sc, lru);
		else
			sc->skipped_deactivate = 1;
#ifdef CONFIG_RECLAIM_ACCT
		reclaimacct_substage_end(RA_SHRINKANON, 0, NULL);
#endif
		return 0;
	}

#ifdef CONFIG_RECLAIM_ACCT
	nr_reclaimed = shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
	reclaimacct_substage_end(RA_SHRINKANON, nr_reclaimed, NULL);
	return nr_reclaimed;
#else
	return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
#endif
}

static void zswapd_shrink_anon_memcg(struct pglist_data *pgdat,
	struct mem_cgroup *memcg, struct scan_control *sc, unsigned long *nr)
{
	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
	unsigned long nr_reclaimed = 0;
	unsigned long nr_to_scan;
	struct blk_plug plug;
	enum lru_list lru;

	blk_start_plug(&plug);

	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_ANON]) {
		for (lru = 0; lru <= LRU_ACTIVE_ANON; lru++) {
			if (nr[lru]) {
				nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
				nr[lru] -= nr_to_scan;
				nr_reclaimed += zswapd_shrink_list(lru,
							nr_to_scan, lruvec, sc);
			}
		}
	}

	blk_finish_plug(&plug);
	sc->nr_reclaimed += nr_reclaimed;
}
#endif

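/*
 * Walk all memcgs and shrink their anon LRUs until the high buffer
 * watermark is met.  Memcgs that refaulted recently, or whose zram + eswap
 * share already reaches ub_mem2zram_ratio, are skipped; the scan size is
 * scaled down by the current reclaim priority.
 */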
static bool zswapd_shrink_anon(pg_data_t *pgdat, struct scan_control *sc)
{
	const unsigned int percent_constant = 100;
	struct mem_cgroup *memcg = NULL;
	unsigned long nr[NR_LRU_LISTS];

	while ((memcg = get_next_memcg(memcg)) != NULL) {
		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
		u64 nr_active, nr_inactive, nr_zram, nr_eswap, zram_ratio;

		/* reclaim and try to meet the high buffer watermark */
		if (high_buffer_is_suitable()) {
			get_next_memcg_break(memcg);
			break;
		}

		if (get_memcg_anon_refault_status(memcg)) {
			count_vm_event(ZSWAPD_MEMCG_REFAULT_SKIP);
			continue;
		}

		nr_active = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES);
		nr_inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
		nr_zram = memcg_data_size(memcg, CACHE_PAGE);
		nr_eswap = memcg_data_size(memcg, SWAP_PAGE);

		zram_ratio = div64_u64((nr_zram + nr_eswap) * percent_constant,
				(nr_inactive + nr_active + nr_zram + nr_eswap + 1));
		if (zram_ratio >= (u32)atomic_read(&memcg->memcg_reclaimed.ub_mem2zram_ratio)) {
			count_vm_event(ZSWAPD_MEMCG_RATIO_SKIP);
			continue;
		}

		nr[LRU_ACTIVE_ANON] = nr_active >> (unsigned int)sc->priority;
		nr[LRU_INACTIVE_ANON] = nr_inactive >> (unsigned int)sc->priority;
		nr[LRU_ACTIVE_FILE] = 0;
		nr[LRU_INACTIVE_FILE] = 0;

#ifdef CONFIG_HYPERHOLD_FILE_LRU
		zswapd_shrink_anon_memcg(pgdat, memcg, sc, nr);
#else
		shrink_lruvec(lruvec, sc);
#endif
		shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);

		if (sc->nr_reclaimed >= sc->nr_to_reclaim) {
			get_next_memcg_break(memcg);
			break;
		}
	}

	return sc->nr_scanned >= sc->nr_to_reclaim;
}

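/*
 * Reclaim target for one round, in pages: how far the current buffer is
 * below high_avail_buffers.
 */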
static u64 __calc_nr_to_reclaim(void)
{
	unsigned int buffers;
	unsigned int high_buffers;
	unsigned int max_reclaim_size;
	u64 reclaim_size = 0;

	high_buffers = get_high_avail_buffers();
	buffers = calc_sys_cur_avail_buffers();
	max_reclaim_size = get_zswapd_max_reclaim_size();
	if (buffers < high_buffers)
		reclaim_size = high_buffers - buffers;

	/* the reclaim target of a single round is at most max_reclaim_size */
	reclaim_size = min(reclaim_size, (u64)max_reclaim_size);

	/* MB to pages */
	return div_u64(reclaim_size * SZ_1M, PAGE_SIZE);
}

static void zswapd_shrink_node(pg_data_t *pgdat)
{
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.order = 0,
		.priority = DEF_PRIORITY / 2,
		.may_writepage = !laptop_mode,
		.may_unmap = 1,
		.may_swap = 1,
		.reclaim_idx = MAX_NR_ZONES - 1,
	};
	const unsigned int increase_rate = 2;

	do {
		unsigned long nr_reclaimed = sc.nr_reclaimed;
		bool raise_priority = true;

		/* reclaim and try to meet the high buffer watermark */
		if (high_buffer_is_suitable())
			break;

		sc.nr_scanned = 0;
		sc.nr_to_reclaim = __calc_nr_to_reclaim();

		if (zswapd_shrink_anon(pgdat, &sc))
			raise_priority = false;
		count_vm_events(ZSWAPD_SCANNED, sc.nr_scanned);
		count_vm_events(ZSWAPD_RECLAIMED, sc.nr_reclaimed);
		if (try_to_freeze() || kthread_should_stop())
			break;

		nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
		if (raise_priority || !nr_reclaimed)
			sc.priority--;
	} while (sc.priority >= 1);

	/*
	 * On the first empty round, set the skip interval to t.
	 * If the following round is also empty, set the interval to 2t;
	 * while rounds stay empty it keeps doubling (4t, 8t, ...), but never
	 * exceeds max_skip_interval.  Once a non-empty round occurs, reset
	 * the interval to 0.
	 */
	if (sc.nr_reclaimed < get_empty_round_check_threshold()) {
		count_vm_event(ZSWAPD_EMPTY_ROUND);
		if (last_round_is_empty)
			zswapd_skip_interval = min(zswapd_skip_interval *
				increase_rate, get_max_skip_interval());
		else
			zswapd_skip_interval = get_empty_round_skip_interval();
		last_round_is_empty = true;
	} else {
		zswapd_skip_interval = 0;
		last_round_is_empty = false;
	}
}

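/*
 * Bytes of zram currently above the watermark, plus SWAP_MORE_ZRAM of
 * slack, i.e. how much should be written out to eswap.
 */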
u64 zram_watermark_diff(void)
{
	const unsigned int percent_constant = 100;
	u64 nr_zram_used;
	u64 nr_wm;
	u64 ratio;

	ratio = get_zram_current_watermark();
	nr_zram_used = get_zram_used_pages();
	nr_wm = div_u64(totalram_pages() * ratio, percent_constant);
	if (nr_zram_used > nr_wm)
		return (nr_zram_used - nr_wm) * PAGE_SIZE + SWAP_MORE_ZRAM;

	return 0;
}

u64 zswapd_buffer_diff(void)
{
	u64 buffers;
	u64 avail;

	buffers = calc_sys_cur_avail_buffers();
	avail = get_high_avail_buffers();
	if (buffers < avail)
		return (avail - buffers) * SZ_1M;

	return 0;
}

u64 get_do_eswap_size(bool refault)
{
	u64 size = 0;
	enum zswapd_eswap_policy policy = get_zswapd_eswap_policy();

	if (policy == CHECK_BUFFER_ZRAMRATIO_BOTH)
		size = max(zram_watermark_diff(), zswapd_buffer_diff());
	else if (policy == CHECK_BUFFER_ONLY && (zram_watermark_ok() || refault))
		size = zswapd_buffer_diff();

	return size;
}

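/*
 * Per-node zswapd main loop: sleep until woken by wakeup_zswapd(), report
 * LEVEL_LOW pressure, shrink the node's anon memory into zram (unless an
 * area-wide refault is detected), then swap zram out to eswap when
 * get_do_eswap_size() asks for at least 1 MB, and escalate the pressure
 * report if the buffer is still not suitable.
 */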
static int zswapd(void *p)
{
	struct task_struct *tsk = current;
	pg_data_t *pgdat = (pg_data_t *)p;
	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
#ifdef CONFIG_RECLAIM_ACCT
	struct reclaim_acct ra = {0};
#endif

	/* save zswapd pid for schedule strategy */
	zswapd_pid = tsk->pid;

	if (!cpumask_empty(cpumask))
		set_cpus_allowed_ptr(tsk, cpumask);

	set_freezable();

	while (!kthread_should_stop()) {
		bool refault = false;
		u64 size = 0;

		(void)wait_event_freezable(pgdat->zswapd_wait,
			atomic_read(&pgdat->zswapd_wait_flag));
		atomic_set(&pgdat->zswapd_wait_flag, 0);
		count_vm_event(ZSWAPD_WAKEUP);
		zswapd_pressure_report(LEVEL_LOW);

		if (get_area_anon_refault_status()) {
			refault = true;
			count_vm_event(ZSWAPD_REFAULT);
			goto do_eswap;
		}

#ifdef CONFIG_RECLAIM_ACCT
		reclaimacct_start(ZSWAPD_RECLAIM, &ra);
#endif
		zswapd_shrink_node(pgdat);
#ifdef CONFIG_RECLAIM_ACCT
		reclaimacct_end(ZSWAPD_RECLAIM);
#endif
		last_zswapd_time = jiffies;

do_eswap:
		size = get_do_eswap_size(refault);
		if (size >= SZ_1M) {
			count_vm_event(ZSWAPD_SWAPOUT);
			size = swapout(size);
		}

		if (!buffer_is_suitable()) {
			if (free_swap_is_low() || zram_watermark_exceed()) {
				zswapd_pressure_report(LEVEL_CRITICAL);
				count_vm_event(ZSWAPD_CRITICAL_PRESS);
				pr_info("%s:zrampages:%llu, eswappages:%llu\n", __func__,
					get_zram_used_pages(), get_eswap_used_pages());
			} else {
				zswapd_pressure_report(LEVEL_MEDIUM);
				count_vm_event(ZSWAPD_MEDIUM_PRESS);
			}
		}
	}

	return 0;
}

/*
 * This zswapd start function will be called by init and node-hot-add.
 */
int zswapd_run(int nid)
{
	const unsigned int priority_less = 5;
	struct sched_param param = {
		.sched_priority = MAX_PRIO - priority_less,
	};
	pg_data_t *pgdat = NODE_DATA(nid);

	if (pgdat->zswapd)
		return 0;

	atomic_set(&pgdat->zswapd_wait_flag, 0);
	pgdat->zswapd = kthread_create(zswapd, pgdat, "zswapd%d", nid);
	if (IS_ERR(pgdat->zswapd)) {
		pr_err("Failed to start zswapd on node %d\n", nid);
		return PTR_ERR(pgdat->zswapd);
	}

	sched_setscheduler_nocheck(pgdat->zswapd, SCHED_NORMAL, &param);
	set_user_nice(pgdat->zswapd, PRIO_TO_NICE(param.sched_priority));
	wake_up_process(pgdat->zswapd);

	return 0;
}

/*
 * Called by memory hotplug when all memory in a node is offlined. Caller must
 * hold mem_hotplug_begin/end().
 */
void zswapd_stop(int nid)
{
	struct task_struct *zswapd = NODE_DATA(nid)->zswapd;

	if (zswapd) {
		kthread_stop(zswapd);
		NODE_DATA(nid)->zswapd = NULL;
	}

	zswapd_pid = -1;
}

/*
 * It's optimal to keep zswapd threads on the same CPUs as their memory, but
 * not required for correctness. So if the last cpu in a node goes away, we
 * get changed to run anywhere: as the first one comes back, restore their
 * cpu bindings.
 */
static int zswapd_cpu_online(unsigned int cpu)
{
	int nid;

	for_each_node_state(nid, N_MEMORY) {
		pg_data_t *pgdat = NODE_DATA(nid);
		const struct cpumask *mask;

		mask = cpumask_of_node(pgdat->node_id);
		if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
			/* One of our CPUs online: restore mask */
			set_cpus_allowed_ptr(pgdat->zswapd, mask);
	}

	return 0;
}

static int __init zswapd_init(void)
{
	int nid;
	int ret;

	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/zswapd:online",
					zswapd_cpu_online, NULL);
	if (ret < 0) {
		pr_err("zswapd: failed to register hotplug callbacks.\n");
		return ret;
	}

	for_each_node_state(nid, N_MEMORY)
		zswapd_run(nid);

	return 0;
}
module_init(zswapd_init)