1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * Copyright (C) 2012 Linux Test Project, Inc.
4 */
5
6 /*
7 * use migrate_pages() and check that address is on correct node
8 * 1. process A can migrate its non-shared mem with CAP_SYS_NICE
9 * 2. process A can migrate its non-shared mem without CAP_SYS_NICE
10 * 3. process A can migrate shared mem only with CAP_SYS_NICE
11 * 4. process A can migrate non-shared mem in process B with same effective uid
12 * 5. process A can migrate non-shared mem in process B with CAP_SYS_NICE
13 */
14 #include <sys/types.h>
15 #include <sys/syscall.h>
16 #include <sys/wait.h>
17 #include <sys/mman.h>
18 #include <sys/prctl.h>
19 #include <errno.h>
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <unistd.h>
23 #include <pwd.h>
24
25 #include "tst_test.h"
26 #include "lapi/syscalls.h"
27 #include "numa_helper.h"
28 #include "migrate_pages_common.h"
29
30 /*
31 * This is an estimated minimum of free mem required to migrate this
32 * process to another node as migrate_pages will fail if there is not
33 * enough free space on node. While running this test on x86_64
34 * it used ~2048 pages (total VM, not just RSS). Considering ia64 as
35 * architecture with largest (non-huge) page size (16k), this limit
36 * is set to 2048*16k == 32M.
37 */
38 #define NODE_MIN_FREEMEM (32*1024*1024)
39
40 #ifdef HAVE_NUMA_V2
41
/* Account name that setup() looks up with getpwnam(). */
static const char nobody_uid[] = "nobody";

/* passwd entry for nobody_uid; its pw_uid is used to drop privileges. */
static struct passwd *ltpuser;

/* Node ids and their count, filled in by get_allowed_nodes_arr() in setup(). */
static int *nodes;
static int num_nodes;

/* The two nodes chosen in setup() that the test cases migrate between. */
static int nodeA;
static int nodeB;
46
print_mem_stats(pid_t pid,int node)47 static void print_mem_stats(pid_t pid, int node)
48 {
49 char s[64];
50 long long node_size, freep;
51
52 if (pid == 0)
53 pid = getpid();
54
55 tst_res(TINFO, "mem_stats pid: %d, node: %d", pid, node);
56
57 /* dump pid's VM info */
58 sprintf(s, "cat /proc/%d/status", pid);
59 system(s);
60 sprintf(s, "cat /proc/%d/numa_maps", pid);
61 system(s);
62
63 /* dump node free mem */
64 node_size = numa_node_size64(node, &freep);
65 tst_res(TINFO, "Node id: %d, size: %lld, free: %lld",
66 node, node_size, freep);
67 }
68
migrate_to_node(pid_t pid,int node)69 static int migrate_to_node(pid_t pid, int node)
70 {
71 unsigned long nodemask_size, max_node;
72 unsigned long *old_nodes, *new_nodes;
73 int i;
74
75 tst_res(TINFO, "pid(%d) migrate pid %d to node -> %d",
76 getpid(), pid, node);
77 max_node = LTP_ALIGN(get_max_node(), sizeof(unsigned long)*8);
78 nodemask_size = max_node / 8;
79 old_nodes = SAFE_MALLOC(nodemask_size);
80 new_nodes = SAFE_MALLOC(nodemask_size);
81
82 memset(old_nodes, 0, nodemask_size);
83 memset(new_nodes, 0, nodemask_size);
84 for (i = 0; i < num_nodes; i++)
85 set_bit(old_nodes, nodes[i], 1);
86 set_bit(new_nodes, node, 1);
87
88 TEST(tst_syscall(__NR_migrate_pages, pid, max_node, old_nodes,
89 new_nodes));
90 if (TST_RET != 0) {
91 if (TST_RET < 0) {
92 tst_res(TFAIL | TTERRNO, "migrate_pages failed "
93 "ret: %ld, ", TST_RET);
94 print_mem_stats(pid, node);
95 } else {
96 tst_res(TINFO, "migrate_pages could not migrate all "
97 "pages, not migrated: %ld", TST_RET);
98 }
99 }
100 free(old_nodes);
101 free(new_nodes);
102 return TST_RET;
103 }
104
addr_on_node(void * addr)105 static int addr_on_node(void *addr)
106 {
107 int node;
108 int ret;
109
110 ret = tst_syscall(__NR_get_mempolicy, &node, NULL, (unsigned long)0,
111 (unsigned long)addr, MPOL_F_NODE | MPOL_F_ADDR);
112 if (ret == -1) {
113 tst_res(TFAIL | TERRNO,
114 "error getting memory policy for page %p", addr);
115 }
116 return node;
117 }
118
check_addr_on_node(void * addr,int exp_node)119 static int check_addr_on_node(void *addr, int exp_node)
120 {
121 int node;
122
123 node = addr_on_node(addr);
124 if (node == exp_node) {
125 tst_res(TPASS, "pid(%d) addr %p is on expected node: %d",
126 getpid(), addr, exp_node);
127 return TPASS;
128 } else {
129 tst_res(TFAIL, "pid(%d) addr %p not on expected node: %d "
130 ", expected %d", getpid(), addr, node, exp_node);
131 print_mem_stats(0, exp_node);
132 return TFAIL;
133 }
134 }
135
test_migrate_current_process(int node1,int node2,int cap_sys_nice)136 static void test_migrate_current_process(int node1, int node2, int cap_sys_nice)
137 {
138 char *private, *shared;
139 int ret;
140 pid_t child;
141
142 /* parent can migrate its non-shared memory */
143 tst_res(TINFO, "current_process, cap_sys_nice: %d", cap_sys_nice);
144 private = SAFE_MMAP(NULL, getpagesize(), PROT_READ | PROT_WRITE,
145 MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
146 private[0] = 0;
147 tst_res(TINFO, "private anonymous: %p", private);
148
149 migrate_to_node(0, node2);
150 check_addr_on_node(private, node2);
151 migrate_to_node(0, node1);
152 check_addr_on_node(private, node1);
153 SAFE_MUNMAP(private, getpagesize());
154
155 /* parent can migrate shared memory with CAP_SYS_NICE */
156 shared = SAFE_MMAP(NULL, getpagesize(), PROT_READ | PROT_WRITE,
157 MAP_ANONYMOUS | MAP_SHARED, 0, 0);
158 shared[0] = 1;
159 tst_res(TINFO, "shared anonymous: %p", shared);
160 migrate_to_node(0, node2);
161 check_addr_on_node(shared, node2);
162
163 /* shared mem is on node2, try to migrate in child to node1 */
164 fflush(stdout);
165 child = SAFE_FORK();
166 if (child == 0) {
167 tst_res(TINFO, "child shared anonymous, cap_sys_nice: %d",
168 cap_sys_nice);
169 private = SAFE_MMAP(NULL, getpagesize(),
170 PROT_READ | PROT_WRITE,
171 MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
172 private[0] = 1;
173 shared[0] = 1;
174 if (!cap_sys_nice)
175 SAFE_SETEUID(ltpuser->pw_uid);
176
177 migrate_to_node(0, node1);
178 /* child can migrate non-shared memory */
179 ret = check_addr_on_node(private, node1);
180
181 exit(ret);
182 }
183
184 SAFE_WAITPID(child, NULL, 0);
185 if (cap_sys_nice)
186 /* child can migrate shared memory only
187 * with CAP_SYS_NICE */
188 check_addr_on_node(shared, node1);
189 else
190 check_addr_on_node(shared, node2);
191 SAFE_MUNMAP(shared, getpagesize());
192 }
193
test_migrate_other_process(int node1,int node2,int cap_sys_nice)194 static void test_migrate_other_process(int node1, int node2, int cap_sys_nice)
195 {
196 char *private;
197 int ret;
198 pid_t child1, child2;
199
200 tst_res(TINFO, "other_process, cap_sys_nice: %d", cap_sys_nice);
201
202 fflush(stdout);
203 child1 = SAFE_FORK();
204 if (child1 == 0) {
205 private = SAFE_MMAP(NULL, getpagesize(),
206 PROT_READ | PROT_WRITE,
207 MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
208 private[0] = 0;
209
210 /* make sure we are on node1 */
211 migrate_to_node(0, node1);
212 check_addr_on_node(private, node1);
213
214 SAFE_SETUID(ltpuser->pw_uid);
215
216 /* commit_creds() will clear dumpable, restore it */
217 if (prctl(PR_SET_DUMPABLE, 1))
218 tst_brk(TBROK | TERRNO, "prctl");
219
220 /* signal child2 it's OK to migrate child1 and wait */
221 TST_CHECKPOINT_WAKE(0);
222 TST_CHECKPOINT_WAIT(1);
223
224 /* child2 can migrate child1 process if it's privileged */
225 /* child2 can migrate child1 process if it has same uid */
226 ret = check_addr_on_node(private, node2);
227
228 exit(ret);
229 }
230
231 fflush(stdout);
232 child2 = SAFE_FORK();
233 if (child2 == 0) {
234 if (!cap_sys_nice)
235 SAFE_SETUID(ltpuser->pw_uid);
236
237 /* wait until child1 is ready on node1, then migrate and
238 * signal to check current node */
239 TST_CHECKPOINT_WAIT(0);
240 migrate_to_node(child1, node2);
241 TST_CHECKPOINT_WAKE(1);
242
243 exit(TPASS);
244 }
245
246 SAFE_WAITPID(child1, NULL, 0);
247 SAFE_WAITPID(child2, NULL, 0);
248 }
249
run(void)250 static void run(void)
251 {
252 test_migrate_current_process(nodeA, nodeB, 1);
253 test_migrate_current_process(nodeA, nodeB, 0);
254 test_migrate_other_process(nodeA, nodeB, 1);
255 test_migrate_other_process(nodeA, nodeB, 0);
256 }
257
setup(void)258 static void setup(void)
259 {
260 int ret, i, j;
261 int pagesize = getpagesize();
262 void *p;
263
264 tst_syscall(__NR_migrate_pages, 0, 0, NULL, NULL);
265
266 if (numa_available() == -1)
267 tst_brk(TCONF, "NUMA not available");
268
269 ret = get_allowed_nodes_arr(NH_MEMS, &num_nodes, &nodes);
270 if (ret < 0)
271 tst_brk(TBROK | TERRNO, "get_allowed_nodes(): %d", ret);
272
273 if (num_nodes < 2)
274 tst_brk(TCONF, "at least 2 allowed NUMA nodes"
275 " are required");
276 else if (tst_kvercmp(2, 6, 18) < 0)
277 tst_brk(TCONF, "2.6.18 or greater kernel required");
278
279 FILE_PRINTF("/proc/sys/kernel/numa_balancing", "0");
280 /*
281 * find 2 nodes, which can hold NODE_MIN_FREEMEM bytes
282 * The reason is that:
283 * 1. migrate_pages() is expected to succeed
284 * 2. this test avoids hitting:
285 * Bug 870326 - migrate_pages() reports success, but pages are
286 * not moved to desired node
287 * https://bugzilla.redhat.com/show_bug.cgi?id=870326
288 */
289 nodeA = nodeB = -1;
290 for (i = 0; i < num_nodes; i++) {
291 p = numa_alloc_onnode(NODE_MIN_FREEMEM, nodes[i]);
292 if (p == NULL)
293 break;
294 memset(p, 0xff, NODE_MIN_FREEMEM);
295
296 j = 0;
297 while (j < NODE_MIN_FREEMEM) {
298 if (addr_on_node(p + j) != nodes[i])
299 break;
300 j += pagesize;
301 }
302 numa_free(p, NODE_MIN_FREEMEM);
303
304 if (j >= NODE_MIN_FREEMEM) {
305 if (nodeA == -1)
306 nodeA = nodes[i];
307 else if (nodeB == -1)
308 nodeB = nodes[i];
309 else
310 break;
311 }
312 }
313
314 if (nodeA == -1 || nodeB == -1)
315 tst_brk(TCONF, "at least 2 NUMA nodes with "
316 "free mem > %d are needed", NODE_MIN_FREEMEM);
317 tst_res(TINFO, "Using nodes: %d %d", nodeA, nodeB);
318
319 ltpuser = getpwnam(nobody_uid);
320 if (ltpuser == NULL)
321 tst_brk(TBROK | TERRNO, "getpwnam failed");
322 }
323
324 static struct tst_test test = {
325 .needs_root = 1,
326 .needs_checkpoints = 1,
327 .forks_child = 1,
328 .test_all = run,
329 .setup = setup,
330 .save_restore = (const char * const[]) {
331 "?/proc/sys/kernel/numa_balancing",
332 NULL,
333 },
334 };
335 #else
336 TST_TEST_TCONF(NUMA_ERROR_MSG);
337 #endif
338