// SPDX-License-Identifier: GPL-2.0
/*
 * mm/zswapd.c
 *
 * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd.
 */

#include <linux/freezer.h>
#include <linux/memcg_policy.h>
#include <trace/events/vmscan.h>
#include <uapi/linux/sched/types.h>
#include <linux/zswapd.h>

#include "zswapd_internal.h"
#include "internal.h"

#define UNSET_ZRAM_WM_RATIO 0
#define DEFAULT_ZRAM_WM_RATIO 37
#define SWAP_MORE_ZRAM (50 * (SZ_1M))

static wait_queue_head_t snapshotd_wait;
static atomic_t snapshotd_wait_flag;
static atomic_t snapshotd_init_flag = ATOMIC_INIT(0);
static struct task_struct *snapshotd_task;

static pid_t zswapd_pid = -1;
static unsigned long long last_anon_pagefault;
static unsigned long long anon_refault_ratio;
static unsigned long long zswapd_skip_interval;
static unsigned long last_zswapd_time;
static unsigned long last_snapshot_time;
bool last_round_is_empty;

DECLARE_RWSEM(gs_lock);
LIST_HEAD(gs_list);

void unregister_group_swap(struct group_swap_device *gsdev)
{
	down_write(&gs_lock);
	list_del(&gsdev->list);
	up_write(&gs_lock);

	kfree(gsdev);
}
EXPORT_SYMBOL(unregister_group_swap);

struct group_swap_device *register_group_swap(struct group_swap_ops *ops, void *priv)
{
	struct group_swap_device *gsdev = kzalloc(sizeof(struct group_swap_device), GFP_KERNEL);

	if (!gsdev)
		return NULL;

	gsdev->priv = priv;
	gsdev->ops = ops;

	down_write(&gs_lock);
	list_add(&gsdev->list, &gs_list);
	up_write(&gs_lock);

	return gsdev;
}
EXPORT_SYMBOL(register_group_swap);

u64 memcg_data_size(struct mem_cgroup *memcg, int type)
{
	struct group_swap_device *gsdev = NULL;
	u64 size = 0;

	down_read(&gs_lock);
	list_for_each_entry(gsdev, &gs_list, list)
		size += gsdev->ops->group_data_size(memcg->id.id, type, gsdev->priv);
	up_read(&gs_lock);

	return size;
}

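/*
 * Swap this memcg's data back in from eswap to zram, up to req_size and
 * capped according to the memcg's ub_ufs2zram_ratio. Returns the amount
 * actually brought back in.
 */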
u64 swapin_memcg(struct mem_cgroup *memcg, u64 req_size)
{
	u64 swap_size = memcg_data_size(memcg, SWAP_SIZE);
	u64 read_size = 0;
	u64 ratio = atomic64_read(&memcg->memcg_reclaimed.ub_ufs2zram_ratio);
	struct group_swap_device *gsdev = NULL;

	if (req_size > swap_size * ratio)
		req_size = swap_size * ratio;
	down_read(&gs_lock);
	list_for_each_entry(gsdev, &gs_list, list) {
		read_size += gsdev->ops->group_read(memcg->id.id, req_size - read_size,
							gsdev->priv);
		if (read_size >= req_size)
			break;
	}
	up_read(&gs_lock);

	return read_size;
}

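/*
 * Swap out up to req_size of this memcg's zram data to eswap, with the
 * request capped by the memcg's ub_zram2ufs_ratio bound on how much of its
 * anon data may live on eswap. Returns the amount actually written.
 */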
static u64 swapout_memcg(struct mem_cgroup *memcg, u64 req_size)
{
	u64 cache_size = memcg_data_size(memcg, CACHE_SIZE);
	u64 swap_size = memcg_data_size(memcg, SWAP_SIZE);
	u64 all_size = cache_size + swap_size;
	u64 write_size = 0;
	u32 ratio = atomic_read(&memcg->memcg_reclaimed.ub_zram2ufs_ratio);
	struct group_swap_device *gsdev = NULL;

	if (all_size * ratio <= swap_size)
		return 0;
	if (req_size > all_size * ratio - swap_size)
		req_size = all_size * ratio - swap_size;
	down_read(&gs_lock);
	list_for_each_entry(gsdev, &gs_list, list) {
		write_size += gsdev->ops->group_write(memcg->id.id, req_size - write_size,
							gsdev->priv);
		if (write_size >= req_size)
			break;
	}
	up_read(&gs_lock);

	return write_size;
}

static u64 swapout(u64 req_size)
{
	struct mem_cgroup *memcg = NULL;
	u64 write_size = 0;

	while ((memcg = get_next_memcg(memcg)) != NULL) {
		write_size += swapout_memcg(memcg, req_size - write_size);
		if (write_size >= req_size)
			break;
	}

	return write_size;
}

static unsigned long long get_zram_used_pages(void)
{
	struct mem_cgroup *memcg = NULL;
	unsigned long long zram_pages = 0;

	while ((memcg = get_next_memcg(memcg)) != NULL)
		zram_pages += memcg_data_size(memcg, CACHE_PAGE);

	return zram_pages;
}

static unsigned long long get_eswap_used_pages(void)
{
	struct mem_cgroup *memcg = NULL;
	unsigned long long eswap_pages = 0;

	while ((memcg = get_next_memcg(memcg)) != NULL)
		eswap_pages += memcg_data_size(memcg, SWAP_PAGE);

	return eswap_pages;
}

static unsigned long long get_zram_pagefault(void)
{
	struct mem_cgroup *memcg = NULL;
	unsigned long long cache_fault = 0;

	while ((memcg = get_next_memcg(memcg)) != NULL)
		cache_fault += memcg_data_size(memcg, CACHE_FAULT);

	return cache_fault;
}

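/*
 * Estimate the currently available "buffer" memory in MB: free pages plus
 * tunable shares of the inactive and active file pages. The counters are
 * summed in KB and converted to MB on return.
 */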
static unsigned int calc_sys_cur_avail_buffers(void)
{
	const unsigned int percent_constant = 100;
	unsigned long freemem;
	unsigned long active_file;
	unsigned long inactive_file;
	unsigned long inactive_anon;
	unsigned long buffers;

	freemem = global_zone_page_state(NR_FREE_PAGES) * PAGE_SIZE / SZ_1K;
	active_file = global_node_page_state(NR_ACTIVE_FILE) * PAGE_SIZE / SZ_1K;
	inactive_file = global_node_page_state(NR_INACTIVE_FILE) * PAGE_SIZE / SZ_1K;
	inactive_anon = global_node_page_state(NR_INACTIVE_ANON) * PAGE_SIZE / SZ_1K;

	buffers = freemem + inactive_file * get_inactive_file_ratio() / percent_constant +
		active_file * get_active_file_ratio() / percent_constant;

	return (buffers * SZ_1K / SZ_1M); /* kb to mb */
}

void zswapd_status_show(struct seq_file *m)
{
	unsigned int buffers = calc_sys_cur_avail_buffers();

	seq_printf(m, "buffer_size:%u\n", buffers);
	seq_printf(m, "recent_refault:%llu\n", anon_refault_ratio);
}

pid_t get_zswapd_pid(void)
{
	return zswapd_pid;
}

static bool min_buffer_is_suitable(void)
{
	unsigned int buffers = calc_sys_cur_avail_buffers();

	if (buffers >= get_min_avail_buffers())
		return true;

	return false;
}

static bool buffer_is_suitable(void)
{
	unsigned int buffers = calc_sys_cur_avail_buffers();

	if (buffers >= get_avail_buffers())
		return true;

	return false;
}

static bool high_buffer_is_suitable(void)
{
	unsigned int buffers = calc_sys_cur_avail_buffers();

	if (buffers >= get_high_avail_buffers())
		return true;

	return false;
}

static void snapshot_anon_refaults(void)
{
	struct mem_cgroup *memcg = NULL;

	while ((memcg = get_next_memcg(memcg)) != NULL)
		memcg->memcg_reclaimed.reclaimed_pagefault = memcg_data_size(memcg, CACHE_FAULT);

	last_anon_pagefault = get_zram_pagefault();
	last_snapshot_time = jiffies;
}

/*
 * Return true if the memcg's anon refault ratio since the last refault
 * snapshot exceeds its refault_threshold.
 */
static bool get_memcg_anon_refault_status(struct mem_cgroup *memcg)
{
	const unsigned int percent_constant = 100;
	unsigned long long anon_pagefault;
	unsigned long long anon_total;
	unsigned long long ratio;
	struct mem_cgroup_per_node *mz = NULL;
	struct lruvec *lruvec = NULL;

	if (!memcg)
		return false;

	anon_pagefault = memcg_data_size(memcg, CACHE_FAULT);
	if (anon_pagefault == memcg->memcg_reclaimed.reclaimed_pagefault)
		return false;

	mz = mem_cgroup_nodeinfo(memcg, 0);
	if (!mz)
		return false;

	lruvec = &mz->lruvec;
	if (!lruvec)
		return false;

	anon_total = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
		lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES) +
		memcg_data_size(memcg, SWAP_PAGE) + memcg_data_size(memcg, CACHE_PAGE);

	ratio = div64_u64((anon_pagefault - memcg->memcg_reclaimed.reclaimed_pagefault) *
			percent_constant, (anon_total + 1));
	if (ratio > atomic_read(&memcg->memcg_reclaimed.refault_threshold))
		return true;

	return false;
}

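/*
 * System-wide refault check: compute the zram refault rate since the last
 * snapshot (roughly refaults per second, i.e. delta-faults * 1000 / delta-ms),
 * record it in anon_refault_ratio for zswapd_status_show(), and return true
 * when it exceeds the area_anon_refault_threshold.
 */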
static bool get_area_anon_refault_status(void)
{
	const unsigned int percent_constant = 1000;
	unsigned long long anon_pagefault;
	unsigned long long ratio;
	unsigned long long time;

	anon_pagefault = get_zram_pagefault();
	time = jiffies;
	if (anon_pagefault == last_anon_pagefault || time == last_snapshot_time)
		return false;

	ratio = div_u64((anon_pagefault - last_anon_pagefault) * percent_constant,
			(jiffies_to_msecs(time - last_snapshot_time) + 1));
	anon_refault_ratio = ratio;

	if (ratio > get_area_anon_refault_threshold())
		return true;

	return false;
}

void wakeup_snapshotd(void)
{
	unsigned long snapshot_interval;

	snapshot_interval = jiffies_to_msecs(jiffies - last_snapshot_time);
	if (snapshot_interval >= get_anon_refault_snapshot_min_interval()) {
		atomic_set(&snapshotd_wait_flag, 1);
		wake_up_interruptible(&snapshotd_wait);
	}
}

static int snapshotd(void *p)
{
	int ret;

	while (!kthread_should_stop()) {
		ret = wait_event_interruptible(snapshotd_wait, atomic_read(&snapshotd_wait_flag));
		if (ret)
			continue;

		atomic_set(&snapshotd_wait_flag, 0);

		snapshot_anon_refaults();
		count_vm_event(ZSWAPD_SNAPSHOT_TIMES);
	}

	return 0;
}

void set_snapshotd_init_flag(unsigned int val)
{
	atomic_set(&snapshotd_init_flag, val);
}

/*
 * This snapshotd start function will be called by init.
 */
int snapshotd_run(void)
{
	atomic_set(&snapshotd_wait_flag, 0);
	init_waitqueue_head(&snapshotd_wait);

	snapshotd_task = kthread_run(snapshotd, NULL, "snapshotd");
	if (IS_ERR(snapshotd_task)) {
		pr_err("Failed to start snapshotd\n");
		return PTR_ERR(snapshotd_task);
	}

	return 0;
}

static int __init snapshotd_init(void)
{
	snapshotd_run();

	return 0;
}
module_init(snapshotd_init);

static int get_zswapd_eswap_policy(void)
{
	if (get_zram_wm_ratio() == UNSET_ZRAM_WM_RATIO)
		return CHECK_BUFFER_ONLY;
	else
		return CHECK_BUFFER_ZRAMRATIO_BOTH;
}

static unsigned int get_policy_zram_wm_ratio(void)
{
	enum zswapd_eswap_policy policy = get_zswapd_eswap_policy();

	if (policy == CHECK_BUFFER_ONLY)
		return DEFAULT_ZRAM_WM_RATIO;
	else
		return get_zram_wm_ratio();
}

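/*
 * Effective zram watermark ratio: start from the policy ratio and, when the
 * available buffers fall short of the avail_buffers target, lower it by a
 * term proportional to that shortfall (converted to pages, scaled by the
 * compress ratio and related to total RAM). With no shortfall the policy
 * ratio is returned unchanged.
 */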
int get_zram_current_watermark(void)
{
	long long diff_buffers;
	const unsigned int percent_constant = 10;
	u64 nr_total;
	unsigned int zram_wm_ratio = get_policy_zram_wm_ratio();

	nr_total = totalram_pages();
	/* B_target - B_current */
	diff_buffers = get_avail_buffers() - calc_sys_cur_avail_buffers();
	/* MB to page */
	diff_buffers *= SZ_1M / PAGE_SIZE;
	/* after_comp to before_comp */
	diff_buffers *= get_compress_ratio();
	/* page to ratio */
	diff_buffers = div64_s64(diff_buffers * percent_constant, nr_total);

	return min((long long)zram_wm_ratio, zram_wm_ratio - diff_buffers);
}

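/*
 * Return true when zram usage has grown past the current watermark, i.e.
 * more than get_zram_current_watermark() percent of total RAM already sits
 * in zram, so writing out to eswap is appropriate.
 */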
bool zram_watermark_ok(void)
{
	const unsigned int percent_constant = 100;
	u64 nr_zram_used;
	u64 nr_wm;
	u64 ratio;

	ratio = get_zram_current_watermark();
	nr_zram_used = get_zram_used_pages();
	nr_wm = div_u64(totalram_pages() * ratio, percent_constant);
	if (nr_zram_used > nr_wm)
		return true;

	return false;
}

bool zram_watermark_exceed(void)
{
	u64 nr_zram_used;
	const unsigned long long nr_wm = get_zram_critical_threshold() * (SZ_1M / PAGE_SIZE);

	if (!nr_wm)
		return false;

	nr_zram_used = get_zram_used_pages();
	if (nr_zram_used > nr_wm)
		return true;
	return false;
}

void wakeup_zswapd(pg_data_t *pgdat)
{
	unsigned long interval;

	if (IS_ERR(pgdat->zswapd))
		return;

	if (!wq_has_sleeper(&pgdat->zswapd_wait))
		return;

	/*
	 * make anon pagefault snapshots
	 * wake up snapshotd
	 */
	if (atomic_read(&snapshotd_init_flag) == 1)
		wakeup_snapshotd();

	/* wake up zswapd only when the buffer is lower than min_avail_buffer */
	if (min_buffer_is_suitable())
		return;

	interval = jiffies_to_msecs(jiffies - last_zswapd_time);
	if (interval < zswapd_skip_interval) {
		count_vm_event(ZSWAPD_EMPTY_ROUND_SKIP_TIMES);
		return;
	}

	atomic_set(&pgdat->zswapd_wait_flag, 1);
	wake_up_interruptible(&pgdat->zswapd_wait);
}

void wake_all_zswapd(void)
{
	pg_data_t *pgdat = NULL;
	int nid;

	for_each_online_node(nid) {
		pgdat = NODE_DATA(nid);
		wakeup_zswapd(pgdat);
	}
}

#ifdef CONFIG_HYPERHOLD_FILE_LRU
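/*
 * Deactivate up to nr_to_scan pages of an active anon LRU: isolate them,
 * clear PG_active, set PG_workingset and move them to the inactive list so
 * that the inactive-list reclaim path can swap them out.
 */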
static void zswapd_shrink_active_list(unsigned long nr_to_scan,
	struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru)
{
	unsigned int nr_deactivate;
	unsigned long nr_scanned;
	unsigned long nr_taken;

	struct page *page = NULL;
	struct pglist_data *pgdat = lruvec_pgdat(lruvec);
	unsigned long *node_anon_cost = &pgdat->__lruvec.anon_cost;
	unsigned long *anon_cost = &lruvec->anon_cost;
	LIST_HEAD(l_inactive);
	LIST_HEAD(l_hold);

	lru_add_drain();

	spin_lock_irq(&pgdat->lru_lock);
	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, lru);
	__mod_node_page_state(pgdat, NR_ISOLATED_ANON, nr_taken);
	*anon_cost += nr_taken;
	*node_anon_cost += nr_taken;
	__count_vm_events(PGREFILL, nr_scanned);
	count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
	spin_unlock_irq(&pgdat->lru_lock);

	while (!list_empty(&l_hold)) {
		cond_resched();
		page = lru_to_page(&l_hold);
		list_del(&page->lru);

		if (unlikely(!page_evictable(page))) {
			putback_lru_page(page);
			continue;
		}

		ClearPageActive(page);
		SetPageWorkingset(page);
		list_add(&page->lru, &l_inactive);
	}

	spin_lock_irq(&pgdat->lru_lock);
	nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
	__mod_node_page_state(pgdat, NR_ISOLATED_ANON, -nr_taken);
	spin_unlock_irq(&pgdat->lru_lock);

	mem_cgroup_uncharge_list(&l_inactive);
	free_unref_page_list(&l_inactive);

	trace_mm_vmscan_lru_zswapd_shrink_active(pgdat->node_id, nr_taken,
			nr_deactivate, sc->priority);
}

static unsigned long zswapd_shrink_list(enum lru_list lru,
		unsigned long nr_to_scan, struct lruvec *lruvec,
		struct scan_control *sc)
{
	if (is_active_lru(lru)) {
		if (sc->may_deactivate & (1 << is_file_lru(lru)))
			zswapd_shrink_active_list(nr_to_scan, lruvec, sc, lru);
		else
			sc->skipped_deactivate = 1;
		return 0;
	}

	return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
}

static void zswapd_shrink_anon_memcg(struct pglist_data *pgdat,
	struct mem_cgroup *memcg, struct scan_control *sc, unsigned long *nr)
{
	struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
	unsigned long nr_reclaimed = 0;
	unsigned long nr_to_scan;
	struct blk_plug plug;
	enum lru_list lru;

	blk_start_plug(&plug);

	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_ANON]) {
		for (lru = 0; lru <= LRU_ACTIVE_ANON; lru++) {
			if (nr[lru]) {
				nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
				nr[lru] -= nr_to_scan;
				nr_reclaimed += zswapd_shrink_list(lru,
							nr_to_scan, lruvec, sc);
			}
		}
	}

	blk_finish_plug(&plug);
	sc->nr_reclaimed += nr_reclaimed;
}
#endif

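/*
 * Reclaim anon pages memcg by memcg, skipping memcgs that are refaulting
 * heavily or whose zram + eswap share already reaches ub_mem2zram_ratio,
 * and stopping early once the high buffer watermark or the reclaim target
 * is met. Returns true if the scan count reached sc->nr_to_reclaim.
 */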
static bool zswapd_shrink_anon(pg_data_t *pgdat, struct scan_control *sc)
{
	const unsigned int percent_constant = 100;
	struct mem_cgroup *memcg = NULL;
	unsigned long nr[NR_LRU_LISTS];

	while ((memcg = get_next_memcg(memcg)) != NULL) {
		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
		u64 nr_active, nr_inactive, nr_zram, nr_eswap, zram_ratio;

		/* reclaim and try to meet the high buffer watermark */
		if (high_buffer_is_suitable()) {
			get_next_memcg_break(memcg);
			break;
		}

		if (get_memcg_anon_refault_status(memcg)) {
			count_vm_event(ZSWAPD_MEMCG_REFAULT_SKIP);
			continue;
		}

		nr_active = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES);
		nr_inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
		nr_zram = memcg_data_size(memcg, CACHE_PAGE);
		nr_eswap = memcg_data_size(memcg, SWAP_PAGE);

		zram_ratio = div64_u64((nr_zram + nr_eswap) * percent_constant,
				(nr_inactive + nr_active + nr_zram + nr_eswap + 1));
		if (zram_ratio >= (u32)atomic_read(&memcg->memcg_reclaimed.ub_mem2zram_ratio)) {
			count_vm_event(ZSWAPD_MEMCG_RATIO_SKIP);
			continue;
		}

		nr[LRU_ACTIVE_ANON] = nr_active >> (unsigned int)sc->priority;
		nr[LRU_INACTIVE_ANON] = nr_inactive >> (unsigned int)sc->priority;
		nr[LRU_ACTIVE_FILE] = 0;
		nr[LRU_INACTIVE_FILE] = 0;

#ifdef CONFIG_HYPERHOLD_FILE_LRU
		zswapd_shrink_anon_memcg(pgdat, memcg, sc, nr);
#else
		shrink_lruvec(lruvec, sc);
#endif
		shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);

		if (sc->nr_reclaimed >= sc->nr_to_reclaim) {
			get_next_memcg_break(memcg);
			break;
		}
	}

	return sc->nr_scanned >= sc->nr_to_reclaim;
}

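/*
 * Per-round reclaim target: the gap between the high_avail_buffers watermark
 * and the current buffer level (in MB), capped at zswapd_max_reclaim_size
 * and converted to a number of pages.
 */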
static u64 __calc_nr_to_reclaim(void)
{
	unsigned int buffers;
	unsigned int high_buffers;
	unsigned int max_reclaim_size;
	u64 reclaim_size = 0;

	high_buffers = get_high_avail_buffers();
	buffers = calc_sys_cur_avail_buffers();
	max_reclaim_size = get_zswapd_max_reclaim_size();
	if (buffers < high_buffers)
		reclaim_size = high_buffers - buffers;

	/* the reclaim target of one round is at most max_reclaim_size */
	reclaim_size = min(reclaim_size, (u64)max_reclaim_size);

	/* MB to pages */
	return div_u64(reclaim_size * SZ_1M, PAGE_SIZE);
}

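/*
 * One reclaim pass over a node: shrink anon LRUs with increasing pressure
 * (decreasing sc.priority) until the high buffer watermark is met or the
 * priority is exhausted, then update the empty-round skip interval that
 * rate-limits future zswapd wakeups.
 */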
static void zswapd_shrink_node(pg_data_t *pgdat)
{
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.order = 0,
		.priority = DEF_PRIORITY / 2,
		.may_writepage = !laptop_mode,
		.may_unmap = 1,
		.may_swap = 1,
		.reclaim_idx = MAX_NR_ZONES - 1,
	};
	const unsigned int increase_rate = 2;

	do {
		unsigned long nr_reclaimed = sc.nr_reclaimed;
		bool raise_priority = true;

		/* reclaim and try to meet the high buffer watermark */
		if (high_buffer_is_suitable())
			break;

		sc.nr_scanned = 0;
		sc.nr_to_reclaim = __calc_nr_to_reclaim();

		if (zswapd_shrink_anon(pgdat, &sc))
			raise_priority = false;
		count_vm_events(ZSWAPD_SCANNED, sc.nr_scanned);
		count_vm_events(ZSWAPD_RECLAIMED, sc.nr_reclaimed);
		if (try_to_freeze() || kthread_should_stop())
			break;

		nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
		if (raise_priority || !nr_reclaimed)
			sc.priority--;
	} while (sc.priority >= 1);

	/*
	 * On the first empty round, set the skip interval to t. If the
	 * following round is still empty, set the interval to 2t. If rounds
	 * keep coming back empty, continue with 4t, 8t, and so on, but never
	 * beyond max_skip_interval. Once a non-empty round occurs, reset the
	 * interval to 0.
	 */
	if (sc.nr_reclaimed < get_empty_round_check_threshold()) {
		count_vm_event(ZSWAPD_EMPTY_ROUND);
		if (last_round_is_empty)
			zswapd_skip_interval = min(zswapd_skip_interval *
				increase_rate, get_max_skip_interval());
		else
			zswapd_skip_interval = get_empty_round_skip_interval();
		last_round_is_empty = true;
	} else {
		zswapd_skip_interval = 0;
		last_round_is_empty = false;
	}
}

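/*
 * Bytes that should move from zram to eswap to get zram usage back under the
 * current watermark, plus a SWAP_MORE_ZRAM margin; 0 when zram usage is
 * already below the watermark.
 */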
u64 zram_watermark_diff(void)
{
	const unsigned int percent_constant = 100;
	u64 nr_zram_used;
	u64 nr_wm;
	u64 ratio;

	ratio = get_zram_current_watermark();
	nr_zram_used = get_zram_used_pages();
	nr_wm = div_u64(totalram_pages() * ratio, percent_constant);
	if (nr_zram_used > nr_wm)
		return (nr_zram_used - nr_wm) * PAGE_SIZE + SWAP_MORE_ZRAM;

	return 0;
}

u64 zswapd_buffer_diff(void)
{
	u64 buffers;
	u64 avail;

	buffers = calc_sys_cur_avail_buffers();
	avail = get_high_avail_buffers();
	if (buffers < avail)
		return (avail - buffers) * SZ_1M;

	return 0;
}

u64 get_do_eswap_size(bool refault)
{
	u64 size = 0;
	enum zswapd_eswap_policy policy = get_zswapd_eswap_policy();

	if (policy == CHECK_BUFFER_ZRAMRATIO_BOTH)
		size = max(zram_watermark_diff(), zswapd_buffer_diff());
	else if (policy == CHECK_BUFFER_ONLY && (zram_watermark_ok() || refault))
		size = zswapd_buffer_diff();

	return size;
}

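/*
 * Per-node zswapd kthread: sleep until woken by wakeup_zswapd(), report
 * memory pressure, reclaim anon pages into zram (skipped when the area is
 * refaulting heavily), write data out to eswap when the policy requires it,
 * and escalate the pressure report if the buffer is still not suitable.
 */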
static int zswapd(void *p)
{
	struct task_struct *tsk = current;
	pg_data_t *pgdat = (pg_data_t *)p;
	const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);

	/* save zswapd pid for schedule strategy */
	zswapd_pid = tsk->pid;

	if (!cpumask_empty(cpumask))
		set_cpus_allowed_ptr(tsk, cpumask);

	set_freezable();

	while (!kthread_should_stop()) {
		bool refault = false;
		u64 size = 0;

		(void)wait_event_freezable(pgdat->zswapd_wait,
			atomic_read(&pgdat->zswapd_wait_flag));
		atomic_set(&pgdat->zswapd_wait_flag, 0);
		count_vm_event(ZSWAPD_WAKEUP);
		zswapd_pressure_report(LEVEL_LOW);

		if (get_area_anon_refault_status()) {
			refault = true;
			count_vm_event(ZSWAPD_REFAULT);
			goto do_eswap;
		}

		zswapd_shrink_node(pgdat);
		last_zswapd_time = jiffies;

do_eswap:
		size = get_do_eswap_size(refault);
		if (size >= SZ_1M) {
			count_vm_event(ZSWAPD_SWAPOUT);
			size = swapout(size);
		}

		if (!buffer_is_suitable()) {
			if (free_swap_is_low() || zram_watermark_exceed()) {
				zswapd_pressure_report(LEVEL_CRITICAL);
				count_vm_event(ZSWAPD_CRITICAL_PRESS);
				pr_info("%s:zrampages:%llu, eswappages:%llu\n", __func__,
					get_zram_used_pages(), get_eswap_used_pages());
			} else {
				zswapd_pressure_report(LEVEL_MEDIUM);
				count_vm_event(ZSWAPD_MEDIUM_PRESS);
			}
		}
	}

	return 0;
}

/*
 * This zswapd start function will be called by init and node-hot-add.
 */
int zswapd_run(int nid)
{
	const unsigned int priority_less = 5;
	struct sched_param param = {
		.sched_priority = MAX_PRIO - priority_less,
	};
	pg_data_t *pgdat = NODE_DATA(nid);

	if (pgdat->zswapd)
		return 0;

	atomic_set(&pgdat->zswapd_wait_flag, 0);
	pgdat->zswapd = kthread_create(zswapd, pgdat, "zswapd%d", nid);
	if (IS_ERR(pgdat->zswapd)) {
		pr_err("Failed to start zswapd on node %d\n", nid);
		return PTR_ERR(pgdat->zswapd);
	}

	sched_setscheduler_nocheck(pgdat->zswapd, SCHED_NORMAL, &param);
	set_user_nice(pgdat->zswapd, PRIO_TO_NICE(param.sched_priority));
	wake_up_process(pgdat->zswapd);

	return 0;
}

/*
 * Called by memory hotplug when all memory in a node is offlined. Caller must
 * hold mem_hotplug_begin/end().
 */
void zswapd_stop(int nid)
{
	struct task_struct *zswapd = NODE_DATA(nid)->zswapd;

	if (zswapd) {
		kthread_stop(zswapd);
		NODE_DATA(nid)->zswapd = NULL;
	}

	zswapd_pid = -1;
}

/*
 * It's optimal to keep kswapds on the same CPUs as their memory, but
 * not required for correctness. So if the last cpu in a node goes away,
 * we get changed to run anywhere: as the first one comes back, restore
 * their cpu bindings.
 */
static int zswapd_cpu_online(unsigned int cpu)
{
	int nid;

	for_each_node_state(nid, N_MEMORY) {
		pg_data_t *pgdat = NODE_DATA(nid);
		const struct cpumask *mask;

		mask = cpumask_of_node(pgdat->node_id);
		if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
			/* One of our CPUs online: restore mask */
			set_cpus_allowed_ptr(pgdat->zswapd, mask);
	}

	return 0;
}

static int __init zswapd_init(void)
{
	int nid;
	int ret;

	ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/zswapd:online",
					zswapd_cpu_online, NULL);
	if (ret < 0) {
		pr_err("zswapd: failed to register hotplug callbacks.\n");
		return ret;
	}

	for_each_node_state(nid, N_MEMORY)
		zswapd_run(nid);

	return 0;
}
module_init(zswapd_init)