1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Copyright (c) 2016 Red Hat, Inc.
4 */
5
6 /*\
7 * [Description]
8 *
 * A page fault occurs even though the madvise(WILLNEED) system call was
 * called to prefetch the page. This issue is reproduced by running a program
 * which sequentially accesses shared memory and calls madvise(WILLNEED)
 * on the next page on a page fault.
13 *
14 * This bug is present in all RHEL7 versions. It looks like this was fixed in
15 * mainline kernel > v3.15 by the following patch:
16 *
17 * commit 55231e5c898c5c03c14194001e349f40f59bd300
18 * Author: Johannes Weiner <hannes@cmpxchg.org>
19 * Date: Thu May 22 11:54:17 2014 -0700
20 *
21 * mm: madvise: fix MADV_WILLNEED on shmem swapouts
22 *
23 * Two checks are performed, the first looks at how SwapCache
24 * changes during madvise. When the pages are dirtied, about half
25 * will be accounted for under Cached and the other half will be
26 * moved into Swap. When madvise is run it will cause the pages
27 * under Cached to also be moved to Swap while rotating the pages
28 * already in Swap into SwapCached. So we expect that SwapCached has
29 * roughly MEM_LIMIT bytes added to it, but for reliability the
30 * PASS_THRESHOLD is much lower than that.
31 *
32 * Secondly we run madvise again, but only on the first
33 * PASS_THRESHOLD bytes to ensure these are entirely in RAM. Then we
34 * dirty these pages and check there were (almost) no page
 * faults. Two faults are allowed in case some tasklet or some other
 * unexpected, but irrelevant, procedure registers a fault to
 * our process.
38 *
 * It can also reproduce the MADV_WILLNEED performance problem.
 * It was introduced in the 5.9 kernel by the following commit
 * e6e88712e43b ("mm: optimise madvise WILLNEED")
 * and fixed in the 5.10-rc5 kernel by the following commit
 * 66383800df9c ("mm: fix madvise WILLNEED performance problem").
44 */
45
46 #include <errno.h>
47 #include <stdio.h>
48 #include <sys/mount.h>
49 #include <sys/sysinfo.h>
50 #include "tst_test.h"
51 #include "tst_cgroup.h"
52
/* Size in bytes of the shared anonymous mapping created by the test. */
#define CHUNK_SZ (400*1024*1024L)
/* Value written to the cgroup's memory.max in setup(). */
#define MEM_LIMIT (CHUNK_SZ / 2)
/* Value written to the cgroup's memory.swap.max (when present) in setup(). */
#define MEMSW_LIMIT (2 * CHUNK_SZ)
/*
 * Minimum SwapCached growth required by the first check, and the size of
 * the range re-advised and re-dirtied by the second check.
 */
#define PASS_THRESHOLD (CHUNK_SZ / 4)
/* PASS_THRESHOLD in KiB, for comparison against /proc/meminfo values. */
#define PASS_THRESHOLD_KB (PASS_THRESHOLD / 1024)

/* Test cgroup handle, obtained in setup(). */
static const struct tst_cgroup_group *cg;

static const char drop_caches_fname[] = "/proc/sys/vm/drop_caches";
/*
 * pg_sz: page size from getpagesize(), set in setup().
 * stat_refresh_sup: non-zero when /proc/sys/vm/stat_refresh is writable.
 */
static int pg_sz, stat_refresh_sup;

/*
 * Baseline /proc/meminfo values captured in setup(); meminfo_diag() prints
 * its figures relative to these.
 */
static long init_swap, init_swap_cached, init_cached;
65
check_path(const char * path)66 static void check_path(const char *path)
67 {
68 if (access(path, R_OK | W_OK))
69 tst_brk(TCONF, "file needed: %s", path);
70 }
71
print_cgmem(const char * name)72 static void print_cgmem(const char *name)
73 {
74 long ret;
75
76 if (!SAFE_CGROUP_HAS(cg, name))
77 return;
78
79 SAFE_CGROUP_SCANF(cg, name, "%ld", &ret);
80 tst_res(TINFO, "\t%s: %ld Kb", name, ret / 1024);
81 }
82
meminfo_diag(const char * point)83 static void meminfo_diag(const char *point)
84 {
85 if (stat_refresh_sup)
86 SAFE_FILE_PRINTF("/proc/sys/vm/stat_refresh", "1");
87
88 tst_res(TINFO, "%s", point);
89 tst_res(TINFO, "\tSwap: %ld Kb",
90 SAFE_READ_MEMINFO("SwapTotal:") - SAFE_READ_MEMINFO("SwapFree:") - init_swap);
91 tst_res(TINFO, "\tSwapCached: %ld Kb",
92 SAFE_READ_MEMINFO("SwapCached:") - init_swap_cached);
93 tst_res(TINFO, "\tCached: %ld Kb",
94 SAFE_READ_MEMINFO("Cached:") - init_cached);
95
96 print_cgmem("memory.current");
97 print_cgmem("memory.swap.current");
98 print_cgmem("memory.kmem.usage_in_bytes");
99 }
100
setup(void)101 static void setup(void)
102 {
103 struct sysinfo sys_buf_start;
104
105 pg_sz = getpagesize();
106
107 tst_res(TINFO, "dropping caches");
108 sync();
109 SAFE_FILE_PRINTF(drop_caches_fname, "3");
110
111 sysinfo(&sys_buf_start);
112 if (sys_buf_start.freeram < 2 * CHUNK_SZ) {
113 tst_brk(TCONF, "System RAM is too small (%li bytes needed)",
114 2 * CHUNK_SZ);
115 }
116 if (sys_buf_start.freeswap < 2 * CHUNK_SZ) {
117 tst_brk(TCONF, "System swap is too small (%li bytes needed)",
118 2 * CHUNK_SZ);
119 }
120
121 check_path("/proc/self/oom_score_adj");
122 SAFE_FILE_PRINTF("/proc/self/oom_score_adj", "%d", -1000);
123
124 tst_cgroup_require("memory", NULL);
125 cg = tst_cgroup_get_test_group();
126
127 SAFE_CGROUP_PRINTF(cg, "memory.max", "%ld", MEM_LIMIT);
128 if (SAFE_CGROUP_HAS(cg, "memory.swap.max"))
129 SAFE_CGROUP_PRINTF(cg, "memory.swap.max", "%ld", MEMSW_LIMIT);
130
131 if (SAFE_CGROUP_HAS(cg, "memory.swappiness")) {
132 SAFE_CGROUP_PRINT(cg, "memory.swappiness", "60");
133 } else {
134 check_path("/proc/sys/vm/swappiness");
135 SAFE_FILE_PRINTF("/proc/sys/vm/swappiness", "%d", 60);
136 }
137
138 SAFE_CGROUP_PRINTF(cg, "cgroup.procs", "%d", getpid());
139
140 meminfo_diag("Initial meminfo, later values are relative to this (except memcg)");
141 init_swap = SAFE_READ_MEMINFO("SwapTotal:") - SAFE_READ_MEMINFO("SwapFree:");
142 init_swap_cached = SAFE_READ_MEMINFO("SwapCached:");
143 init_cached = SAFE_READ_MEMINFO("Cached:");
144
145 if (!access("/proc/sys/vm/stat_refresh", W_OK))
146 stat_refresh_sup = 1;
147
148 tst_res(TINFO, "mapping %ld Kb (%ld pages), limit %ld Kb, pass threshold %ld Kb",
149 CHUNK_SZ / 1024, CHUNK_SZ / pg_sz, MEM_LIMIT / 1024, PASS_THRESHOLD_KB);
150 }
151
/* Test teardown: hand the cgroup state acquired in setup() back to the library. */
static void cleanup(void)
{
	tst_cgroup_cleanup();
}
156
dirty_pages(char * ptr,long size)157 static void dirty_pages(char *ptr, long size)
158 {
159 long i;
160 long pages = size / pg_sz;
161
162 for (i = 0; i < pages; i++)
163 ptr[i * pg_sz] = 'x';
164 }
165
/*
 * Return the 12th whitespace-separated field of /proc/self/stat, parsed as
 * a decimal int. The first eleven fields are skipped.
 * NOTE(review): proc(5) documents field 12 as majflt (major page faults).
 */
static int get_page_fault_num(void)
{
	int faults;

	SAFE_FILE_SCANF("/proc/self/stat",
			"%*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %*s %d",
			&faults);

	return faults;
}
175
test_advice_willneed(void)176 static void test_advice_willneed(void)
177 {
178 int loops = 50, res;
179 char *target;
180 long swapcached_start, swapcached;
181 int page_fault_num_1, page_fault_num_2;
182
183 meminfo_diag("Before mmap");
184 tst_res(TINFO, "PageFault(before mmap): %d", get_page_fault_num());
185 target = SAFE_MMAP(NULL, CHUNK_SZ, PROT_READ | PROT_WRITE,
186 MAP_SHARED | MAP_ANONYMOUS,
187 -1, 0);
188 meminfo_diag("Before dirty");
189 tst_res(TINFO, "PageFault(before dirty): %d", get_page_fault_num());
190 dirty_pages(target, CHUNK_SZ);
191 tst_res(TINFO, "PageFault(after dirty): %d", get_page_fault_num());
192
193 meminfo_diag("Before madvise");
194 SAFE_FILE_LINES_SCANF("/proc/meminfo", "SwapCached: %ld",
195 &swapcached_start);
196
197 TEST(madvise(target, MEM_LIMIT, MADV_WILLNEED));
198 if (TST_RET == -1)
199 tst_brk(TBROK | TTERRNO, "madvise failed");
200
201 do {
202 loops--;
203 usleep(100000);
204 if (stat_refresh_sup)
205 SAFE_FILE_PRINTF("/proc/sys/vm/stat_refresh", "1");
206 SAFE_FILE_LINES_SCANF("/proc/meminfo", "SwapCached: %ld",
207 &swapcached);
208 } while (swapcached < swapcached_start + PASS_THRESHOLD_KB && loops > 0);
209
210 meminfo_diag("After madvise");
211 res = swapcached > swapcached_start + PASS_THRESHOLD_KB;
212 tst_res(res ? TPASS : TFAIL,
213 "%s than %ld Kb were moved to the swap cache",
214 res ? "more" : "less", PASS_THRESHOLD_KB);
215
216
217 TEST(madvise(target, PASS_THRESHOLD, MADV_WILLNEED));
218 if (TST_RET == -1)
219 tst_brk(TBROK | TTERRNO, "madvise failed");
220
221 page_fault_num_1 = get_page_fault_num();
222 tst_res(TINFO, "PageFault(madvice / no mem access): %d",
223 page_fault_num_1);
224 dirty_pages(target, PASS_THRESHOLD);
225 page_fault_num_2 = get_page_fault_num();
226 tst_res(TINFO, "PageFault(madvice / mem access): %d",
227 page_fault_num_2);
228 meminfo_diag("After page access");
229
230 res = page_fault_num_2 - page_fault_num_1;
231 tst_res(res < 3 ? TPASS : TFAIL,
232 "%d pages were faulted out of 2 max", res);
233
234 SAFE_MUNMAP(target, CHUNK_SZ);
235 }
236
/* Test descriptor: entry points, requirements and related kernel commits. */
static struct tst_test test = {
	.test_all = test_advice_willneed,
	.setup = setup,
	.cleanup = cleanup,
	.min_kver = "3.10.0",
	.needs_tmpdir = 1,
	.needs_root = 1,
	/*
	 * setup() writes this sysctl when the test cgroup has no
	 * memory.swappiness file.
	 * NOTE(review): presumably listed so the library restores the
	 * original value afterwards - confirm against the LTP docs.
	 */
	.save_restore = (const char * const[]) {
		"?/proc/sys/vm/swappiness",
		NULL
	},
	/*
	 * 55231e5c898c and 66383800df9c are the fixes cited in the file
	 * header.
	 * NOTE(review): 8de15e920dc8 is not mentioned in the header, which
	 * names e6e88712e43b as the commit introducing the performance
	 * problem - confirm this tag is the intended one.
	 */
	.tags = (const struct tst_tag[]) {
		{"linux-git", "55231e5c898c"},
		{"linux-git", "8de15e920dc8"},
		{"linux-git", "66383800df9c"},
		{}
	}
};
255