• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2012 Linux Test Project, Inc.
4  */
5 
6 /*
7  * use migrate_pages() and check that address is on correct node
8  * 1. process A can migrate its non-shared mem with CAP_SYS_NICE
9  * 2. process A can migrate its non-shared mem without CAP_SYS_NICE
10  * 3. process A can migrate shared mem only with CAP_SYS_NICE
11  * 4. process A can migrate non-shared mem in process B with same effective uid
12  * 5. process A can migrate non-shared mem in process B with CAP_SYS_NICE
13  */
14 #include <sys/types.h>
15 #include <sys/syscall.h>
16 #include <sys/wait.h>
17 #include <sys/mman.h>
18 #include <sys/prctl.h>
19 #include <errno.h>
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <unistd.h>
23 #include <pwd.h>
24 
25 #include "tst_test.h"
26 #include "lapi/syscalls.h"
27 #include "numa_helper.h"
28 #include "migrate_pages_common.h"
29 
30 /*
31  * This is an estimated minimum of free mem required to migrate this
32  * process to another node as migrate_pages will fail if there is not
33  * enough free space on node. While running this test on x86_64
34  * it used ~2048 pages (total VM, not just RSS). Considering ia64 as
35  * architecture with largest (non-huge) page size (16k), this limit
36  * is set to 2048*16k == 32M.
37  */
38 #define NODE_MIN_FREEMEM (32*1024*1024)
39 
40 #ifdef HAVE_NUMA_V2
41 
/* Name of the account looked up by setup() and stored in ltpuser. */
static const char nobody_uid[] = "nobody";

/* passwd entry for nobody_uid; children switch to its pw_uid. */
static struct passwd *ltpuser;

/* Allowed memory nodes, filled in by get_allowed_nodes_arr() in setup(). */
static int *nodes;
static int num_nodes;

/* The two nodes picked by setup() that pages are migrated between. */
static int nodeA;
static int nodeB;
46 
print_mem_stats(pid_t pid,int node)47 static void print_mem_stats(pid_t pid, int node)
48 {
49 	char s[64];
50 	long long node_size, freep;
51 
52 	if (pid == 0)
53 		pid = getpid();
54 
55 	tst_res(TINFO, "mem_stats pid: %d, node: %d", pid, node);
56 
57 	/* dump pid's VM info */
58 	sprintf(s, "cat /proc/%d/status", pid);
59 	system(s);
60 	sprintf(s, "cat /proc/%d/numa_maps", pid);
61 	system(s);
62 
63 	/* dump node free mem */
64 	node_size = numa_node_size64(node, &freep);
65 	tst_res(TINFO, "Node id: %d, size: %lld, free: %lld",
66 		 node, node_size, freep);
67 }
68 
migrate_to_node(pid_t pid,int node)69 static int migrate_to_node(pid_t pid, int node)
70 {
71 	unsigned long nodemask_size, max_node;
72 	unsigned long *old_nodes, *new_nodes;
73 	int i;
74 
75 	tst_res(TINFO, "pid(%d) migrate pid %d to node -> %d",
76 		 getpid(), pid, node);
77 	max_node = LTP_ALIGN(get_max_node(), sizeof(unsigned long)*8);
78 	nodemask_size = max_node / 8;
79 	old_nodes = SAFE_MALLOC(nodemask_size);
80 	new_nodes = SAFE_MALLOC(nodemask_size);
81 
82 	memset(old_nodes, 0, nodemask_size);
83 	memset(new_nodes, 0, nodemask_size);
84 	for (i = 0; i < num_nodes; i++)
85 		set_bit(old_nodes, nodes[i], 1);
86 	set_bit(new_nodes, node, 1);
87 
88 	TEST(tst_syscall(__NR_migrate_pages, pid, max_node, old_nodes,
89 		new_nodes));
90 	if (TST_RET != 0) {
91 		if (TST_RET < 0) {
92 			tst_res(TFAIL | TTERRNO, "migrate_pages failed "
93 				 "ret: %ld, ", TST_RET);
94 			print_mem_stats(pid, node);
95 		} else {
96 			tst_res(TINFO, "migrate_pages could not migrate all "
97 				 "pages, not migrated: %ld", TST_RET);
98 		}
99 	}
100 	free(old_nodes);
101 	free(new_nodes);
102 	return TST_RET;
103 }
104 
addr_on_node(void * addr)105 static int addr_on_node(void *addr)
106 {
107 	int node;
108 	int ret;
109 
110 	ret = tst_syscall(__NR_get_mempolicy, &node, NULL, (unsigned long)0,
111 		      (unsigned long)addr, MPOL_F_NODE | MPOL_F_ADDR);
112 	if (ret == -1) {
113 		tst_res(TFAIL | TERRNO,
114 				"error getting memory policy for page %p", addr);
115 	}
116 	return node;
117 }
118 
check_addr_on_node(void * addr,int exp_node)119 static int check_addr_on_node(void *addr, int exp_node)
120 {
121 	int node;
122 
123 	node = addr_on_node(addr);
124 	if (node == exp_node) {
125 		tst_res(TPASS, "pid(%d) addr %p is on expected node: %d",
126 			 getpid(), addr, exp_node);
127 		return TPASS;
128 	} else {
129 		tst_res(TFAIL, "pid(%d) addr %p not on expected node: %d "
130 			 ", expected %d", getpid(), addr, node, exp_node);
131 		print_mem_stats(0, exp_node);
132 		return TFAIL;
133 	}
134 }
135 
test_migrate_current_process(int node1,int node2,int cap_sys_nice)136 static void test_migrate_current_process(int node1, int node2, int cap_sys_nice)
137 {
138 	char *private, *shared;
139 	int ret;
140 	pid_t child;
141 
142 	/* parent can migrate its non-shared memory */
143 	tst_res(TINFO, "current_process, cap_sys_nice: %d", cap_sys_nice);
144 	private =  SAFE_MMAP(NULL, getpagesize(), PROT_READ | PROT_WRITE,
145 		MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
146 	private[0] = 0;
147 	tst_res(TINFO, "private anonymous: %p", private);
148 
149 	migrate_to_node(0, node2);
150 	check_addr_on_node(private, node2);
151 	migrate_to_node(0, node1);
152 	check_addr_on_node(private, node1);
153 	SAFE_MUNMAP(private, getpagesize());
154 
155 	/* parent can migrate shared memory with CAP_SYS_NICE */
156 	shared = SAFE_MMAP(NULL, getpagesize(), PROT_READ | PROT_WRITE,
157 		      MAP_ANONYMOUS | MAP_SHARED, 0, 0);
158 	shared[0] = 1;
159 	tst_res(TINFO, "shared anonymous: %p", shared);
160 	migrate_to_node(0, node2);
161 	check_addr_on_node(shared, node2);
162 
163 	/* shared mem is on node2, try to migrate in child to node1 */
164 	fflush(stdout);
165 	child = SAFE_FORK();
166 	if (child == 0) {
167 		tst_res(TINFO, "child shared anonymous, cap_sys_nice: %d",
168 			 cap_sys_nice);
169 		private =  SAFE_MMAP(NULL, getpagesize(),
170 			PROT_READ | PROT_WRITE,
171 			MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
172 		private[0] = 1;
173 		shared[0] = 1;
174 		if (!cap_sys_nice)
175 			SAFE_SETEUID(ltpuser->pw_uid);
176 
177 		migrate_to_node(0, node1);
178 		/* child can migrate non-shared memory */
179 		ret = check_addr_on_node(private, node1);
180 
181 		exit(ret);
182 	}
183 
184 	SAFE_WAITPID(child, NULL, 0);
185 	if (cap_sys_nice)
186 		/* child can migrate shared memory only
187 		 * with CAP_SYS_NICE */
188 		check_addr_on_node(shared, node1);
189 	else
190 		check_addr_on_node(shared, node2);
191 	SAFE_MUNMAP(shared, getpagesize());
192 }
193 
test_migrate_other_process(int node1,int node2,int cap_sys_nice)194 static void test_migrate_other_process(int node1, int node2, int cap_sys_nice)
195 {
196 	char *private;
197 	int ret;
198 	pid_t child1, child2;
199 
200 	tst_res(TINFO, "other_process, cap_sys_nice: %d", cap_sys_nice);
201 
202 	fflush(stdout);
203 	child1 = SAFE_FORK();
204 	if (child1 == 0) {
205 		private =  SAFE_MMAP(NULL, getpagesize(),
206 			PROT_READ | PROT_WRITE,
207 			MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
208 		private[0] = 0;
209 
210 		/* make sure we are on node1 */
211 		migrate_to_node(0, node1);
212 		check_addr_on_node(private, node1);
213 
214 		SAFE_SETUID(ltpuser->pw_uid);
215 
216 		/* commit_creds() will clear dumpable, restore it */
217 		if (prctl(PR_SET_DUMPABLE, 1))
218 			tst_brk(TBROK | TERRNO, "prctl");
219 
220 		/* signal child2 it's OK to migrate child1 and wait */
221 		TST_CHECKPOINT_WAKE(0);
222 		TST_CHECKPOINT_WAIT(1);
223 
224 		/* child2 can migrate child1 process if it's privileged */
225 		/* child2 can migrate child1 process if it has same uid */
226 		ret = check_addr_on_node(private, node2);
227 
228 		exit(ret);
229 	}
230 
231 	fflush(stdout);
232 	child2 = SAFE_FORK();
233 	if (child2 == 0) {
234 		if (!cap_sys_nice)
235 			SAFE_SETUID(ltpuser->pw_uid);
236 
237 		/* wait until child1 is ready on node1, then migrate and
238 		 * signal to check current node */
239 		TST_CHECKPOINT_WAIT(0);
240 		migrate_to_node(child1, node2);
241 		TST_CHECKPOINT_WAKE(1);
242 
243 		exit(TPASS);
244 	}
245 
246 	SAFE_WAITPID(child1, NULL, 0);
247 	SAFE_WAITPID(child2, NULL, 0);
248 }
249 
run(void)250 static void run(void)
251 {
252 	test_migrate_current_process(nodeA, nodeB, 1);
253 	test_migrate_current_process(nodeA, nodeB, 0);
254 	test_migrate_other_process(nodeA, nodeB, 1);
255 	test_migrate_other_process(nodeA, nodeB, 0);
256 }
257 
setup(void)258 static void setup(void)
259 {
260 	int ret, i, j;
261 	int pagesize = getpagesize();
262 	void *p;
263 
264 	tst_syscall(__NR_migrate_pages, 0, 0, NULL, NULL);
265 
266 	if (numa_available() == -1)
267 		tst_brk(TCONF, "NUMA not available");
268 
269 	ret = get_allowed_nodes_arr(NH_MEMS, &num_nodes, &nodes);
270 	if (ret < 0)
271 		tst_brk(TBROK | TERRNO, "get_allowed_nodes(): %d", ret);
272 
273 	if (num_nodes < 2)
274 		tst_brk(TCONF, "at least 2 allowed NUMA nodes"
275 			 " are required");
276 	else if (tst_kvercmp(2, 6, 18) < 0)
277 		tst_brk(TCONF, "2.6.18 or greater kernel required");
278 
279 	FILE_PRINTF("/proc/sys/kernel/numa_balancing", "0");
280 	/*
281 	 * find 2 nodes, which can hold NODE_MIN_FREEMEM bytes
282 	 * The reason is that:
283 	 * 1. migrate_pages() is expected to succeed
284 	 * 2. this test avoids hitting:
285 	 *    Bug 870326 - migrate_pages() reports success, but pages are
286 	 *                 not moved to desired node
287 	 *    https://bugzilla.redhat.com/show_bug.cgi?id=870326
288 	 */
289 	nodeA = nodeB = -1;
290 	for (i = 0; i < num_nodes; i++) {
291 		p = numa_alloc_onnode(NODE_MIN_FREEMEM, nodes[i]);
292 		if (p == NULL)
293 			break;
294 		memset(p, 0xff, NODE_MIN_FREEMEM);
295 
296 		j = 0;
297 		while (j < NODE_MIN_FREEMEM) {
298 			if (addr_on_node(p + j) != nodes[i])
299 				break;
300 			j += pagesize;
301 		}
302 		numa_free(p, NODE_MIN_FREEMEM);
303 
304 		if (j >= NODE_MIN_FREEMEM) {
305 			if (nodeA == -1)
306 				nodeA = nodes[i];
307 			else if (nodeB == -1)
308 				nodeB = nodes[i];
309 			else
310 				break;
311 		}
312 	}
313 
314 	if (nodeA == -1 || nodeB == -1)
315 		tst_brk(TCONF, "at least 2 NUMA nodes with "
316 			 "free mem > %d are needed", NODE_MIN_FREEMEM);
317 	tst_res(TINFO, "Using nodes: %d %d", nodeA, nodeB);
318 
319 	ltpuser = getpwnam(nobody_uid);
320 	if (ltpuser == NULL)
321 		tst_brk(TBROK | TERRNO, "getpwnam failed");
322 }
323 
324 static struct tst_test test = {
325 	.needs_root = 1,
326 	.needs_checkpoints = 1,
327 	.forks_child = 1,
328 	.test_all = run,
329 	.setup = setup,
330 	.save_restore = (const char * const[]) {
331 		"?/proc/sys/kernel/numa_balancing",
332 		NULL,
333 	},
334 };
335 #else
336 TST_TEST_TCONF(NUMA_ERROR_MSG);
337 #endif
338