/* SPDX-License-Identifier: GPL-2.0 */
#define _GNU_SOURCE

#include <linux/limits.h>
#include <linux/oom.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netdb.h>
#include <errno.h>
#include <sys/mman.h>

#include "../kselftest.h"
#include "cgroup_util.h"

static bool has_localevents;
static bool has_recursiveprot;

/*
 * This test creates two nested cgroups with and without enabling
 * the memory controller.
 */
static int test_memcg_subtree_control(const char *root)
{
	char *parent, *child, *parent2 = NULL, *child2 = NULL;
	int ret = KSFT_FAIL;
	char buf[PAGE_SIZE];

	/* Create two nested cgroups with the memory controller enabled */
	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");
	if (!parent || !child)
		goto cleanup_free;

	if (cg_create(parent))
		goto cleanup_free;

	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
		goto cleanup_parent;

	if (cg_create(child))
		goto cleanup_parent;

	if (cg_read_strstr(child, "cgroup.controllers", "memory"))
		goto cleanup_child;

	/* Create two nested cgroups without enabling memory controller */
	parent2 = cg_name(root, "memcg_test_1");
	child2 = cg_name(root, "memcg_test_1/memcg_test_1");
	if (!parent2 || !child2)
		goto cleanup_free2;

	if (cg_create(parent2))
		goto cleanup_free2;

	if (cg_create(child2))
		goto cleanup_parent2;

	if (cg_read(child2, "cgroup.controllers", buf, sizeof(buf)))
		goto cleanup_all;

	if (!cg_read_strstr(child2, "cgroup.controllers", "memory"))
		goto cleanup_all;

	ret = KSFT_PASS;

cleanup_all:
	cg_destroy(child2);
cleanup_parent2:
	cg_destroy(parent2);
cleanup_free2:
	free(parent2);
	free(child2);
cleanup_child:
	cg_destroy(child);
cleanup_parent:
	cg_destroy(parent);
cleanup_free:
	free(parent);
	free(child);

	return ret;
}

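/*
 * Allocate and touch 50M of anonymous memory, then verify that both
 * memory.current and the "anon" counter in memory.stat are close to it.
 */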
static int alloc_anon_50M_check(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	char *buf, *ptr;
	long anon, current;
	int ret = -1;

	buf = malloc(size);
	if (buf == NULL) {
		fprintf(stderr, "malloc() failed\n");
		return -1;
	}

	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	current = cg_read_long(cgroup, "memory.current");
	if (current < size)
		goto cleanup;

	if (!values_close(size, current, 3))
		goto cleanup;

	anon = cg_read_key_long(cgroup, "memory.stat", "anon ");
	if (anon < 0)
		goto cleanup;

	if (!values_close(anon, current, 3))
		goto cleanup;

	ret = 0;
cleanup:
	free(buf);
	return ret;
}

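/*
 * Populate 50M of pagecache via a temporary file, then verify that
 * memory.current and the "file" counter in memory.stat are close to it.
 */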
static int alloc_pagecache_50M_check(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	int ret = -1;
	long current, file;
	int fd;

	fd = get_temp_fd();
	if (fd < 0)
		return -1;

	if (alloc_pagecache(fd, size))
		goto cleanup;

	current = cg_read_long(cgroup, "memory.current");
	if (current < size)
		goto cleanup;

	file = cg_read_key_long(cgroup, "memory.stat", "file ");
	if (file < 0)
		goto cleanup;

	if (!values_close(file, current, 10))
		goto cleanup;

	ret = 0;

cleanup:
	close(fd);
	return ret;
}

/*
 * This test creates a memory cgroup, allocates some anonymous memory
 * and some pagecache, and checks memory.current and some memory.stat
 * values.
 */
static int test_memcg_current(const char *root)
{
	int ret = KSFT_FAIL;
	long current;
	char *memcg;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	current = cg_read_long(memcg, "memory.current");
	if (current != 0)
		goto cleanup;

	if (cg_run(memcg, alloc_anon_50M_check, NULL))
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_50M_check, NULL))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

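/*
 * Populate 50M of pagecache using the fd passed via arg, then keep the
 * process alive until its parent exits, so the usage stays charged.
 */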
static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
{
	int fd = (long)arg;
	int ppid = getppid();

	if (alloc_pagecache(fd, MB(50)))
		return -1;

	while (getppid() == ppid)
		sleep(1);

	return 0;
}

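/*
 * Allocate and touch anonymous memory of the size passed via arg, then
 * keep the process alive until its parent exits, so the usage stays
 * charged.
 */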
static int alloc_anon_noexit(const char *cgroup, void *arg)
{
	int ppid = getppid();
	size_t size = (unsigned long)arg;
	char *buf, *ptr;

	buf = malloc(size);
	if (buf == NULL) {
		fprintf(stderr, "malloc() failed\n");
		return -1;
	}

	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	while (getppid() == ppid)
		sleep(1);

	free(buf);
	return 0;
}

/*
 * Wait until processes are killed asynchronously by the OOM killer.
 * If we exceed a timeout, fail.
 */
static int cg_test_proc_killed(const char *cgroup)
{
	int limit;

	for (limit = 10; limit > 0; limit--) {
		if (cg_read_strcmp(cgroup, "cgroup.procs", "") == 0)
			return 0;

		usleep(100000);
	}
	return -1;
}

/*
 * First, this test creates the following hierarchy:
 * A       memory.min = 0,    memory.max = 200M
 * A/B     memory.min = 50M
 * A/B/C   memory.min = 75M,  memory.current = 50M
 * A/B/D   memory.min = 25M,  memory.current = 50M
 * A/B/E   memory.min = 0,    memory.current = 50M
 * A/B/F   memory.min = 500M, memory.current = 0
 *
 * (or memory.low if we test soft protection)
 *
 * Usages are pagecache and the test keeps a running
 * process in every leaf cgroup.
 * Then it creates A/G and creates significant
 * memory pressure in A.
 *
 * Then it checks actual memory usages and expects that:
 * A/B    memory.current ~= 50M
 * A/B/C  memory.current ~= 29M
 * A/B/D  memory.current ~= 21M
 * A/B/E  memory.current ~= 0
 * A/B/F  memory.current  = 0
 * (for the origin of these numbers, see the model in memcg_protection.m.)
 *
 * After that it tries to allocate more memory than is left
 * unprotected in A, and checks that:
 * a) memory.min protects pagecache even in this case,
 * b) memory.low allows reclaiming page cache with low events.
 */
static int test_memcg_protection(const char *root, bool min)
{
	int ret = KSFT_FAIL, rc;
	char *parent[3] = {NULL};
	char *children[4] = {NULL};
	const char *attribute = min ? "memory.min" : "memory.low";
	long c[4];
	long current;
	int i, attempts;
	int fd;

	fd = get_temp_fd();
	if (fd < 0)
		goto cleanup;

	parent[0] = cg_name(root, "memcg_test_0");
	if (!parent[0])
		goto cleanup;

	parent[1] = cg_name(parent[0], "memcg_test_1");
	if (!parent[1])
		goto cleanup;

	parent[2] = cg_name(parent[0], "memcg_test_2");
	if (!parent[2])
		goto cleanup;

	if (cg_create(parent[0]))
		goto cleanup;

	if (cg_read_long(parent[0], attribute)) {
		/* No memory.min on older kernels is fine */
		if (min)
			ret = KSFT_SKIP;
		goto cleanup;
	}

	if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_write(parent[0], "memory.max", "200M"))
		goto cleanup;

	if (cg_write(parent[0], "memory.swap.max", "0"))
		goto cleanup;

	if (cg_create(parent[1]))
		goto cleanup;

	if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_create(parent[2]))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++) {
		children[i] = cg_name_indexed(parent[1], "child_memcg", i);
		if (!children[i])
			goto cleanup;

		if (cg_create(children[i]))
			goto cleanup;

		if (i > 2)
			continue;

		cg_run_nowait(children[i], alloc_pagecache_50M_noexit,
			      (void *)(long)fd);
	}

	if (cg_write(parent[1],   attribute, "50M"))
		goto cleanup;
	if (cg_write(children[0], attribute, "75M"))
		goto cleanup;
	if (cg_write(children[1], attribute, "25M"))
		goto cleanup;
	if (cg_write(children[2], attribute, "0"))
		goto cleanup;
	if (cg_write(children[3], attribute, "500M"))
		goto cleanup;

	attempts = 0;
	while (!values_close(cg_read_long(parent[1], "memory.current"),
			     MB(150), 3)) {
		if (attempts++ > 5)
			break;
		sleep(1);
	}

	if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
		goto cleanup;

	if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
		goto cleanup;

	for (i = 0; i < ARRAY_SIZE(children); i++)
		c[i] = cg_read_long(children[i], "memory.current");

	if (!values_close(c[0], MB(29), 10))
		goto cleanup;

	if (!values_close(c[1], MB(21), 10))
		goto cleanup;

	if (c[3] != 0)
		goto cleanup;

	rc = cg_run(parent[2], alloc_anon, (void *)MB(170));
	if (min && !rc)
		goto cleanup;
	else if (!min && rc) {
		fprintf(stderr,
			"memory.low prevents allocating anon memory\n");
		goto cleanup;
	}

	current = min ? MB(50) : MB(30);
	if (!values_close(cg_read_long(parent[1], "memory.current"), current, 3))
		goto cleanup;

	if (min) {
		ret = KSFT_PASS;
		goto cleanup;
	}

	for (i = 0; i < ARRAY_SIZE(children); i++) {
		int no_low_events_index = 1;
		long low, oom;

		oom = cg_read_key_long(children[i], "memory.events", "oom ");
		low = cg_read_key_long(children[i], "memory.events", "low ");

		if (oom)
			goto cleanup;
		if (i <= no_low_events_index && low <= 0)
			goto cleanup;
		if (i > no_low_events_index && low)
			goto cleanup;
	}

	ret = KSFT_PASS;

cleanup:
	for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
		if (!children[i])
			continue;

		cg_destroy(children[i]);
		free(children[i]);
	}

	for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
		if (!parent[i])
			continue;

		cg_destroy(parent[i]);
		free(parent[i]);
	}
	close(fd);
	return ret;
}

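/*
 * Run the protection test in its hard (memory.min) and soft (memory.low)
 * flavors.
 */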
static int test_memcg_min(const char *root)
{
	return test_memcg_protection(root, true);
}

static int test_memcg_low(const char *root)
{
	return test_memcg_protection(root, false);
}

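/*
 * Try to populate 50M of pagecache in a cgroup that is limited to 30M
 * (via either memory.high or memory.max) and verify that the usage is
 * capped close to 30M.
 */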
static int alloc_pagecache_max_30M(const char *cgroup, void *arg)
{
	size_t size = MB(50);
	int ret = -1;
	long current, high, max;
	int fd;

	high = cg_read_long(cgroup, "memory.high");
	max = cg_read_long(cgroup, "memory.max");
	if (high != MB(30) && max != MB(30))
		return -1;

	fd = get_temp_fd();
	if (fd < 0)
		return -1;

	if (alloc_pagecache(fd, size))
		goto cleanup;

	current = cg_read_long(cgroup, "memory.current");
	if (!values_close(current, MB(30), 5))
		goto cleanup;

	ret = 0;

cleanup:
	close(fd);
	return ret;
}

/*
 * This test checks that memory.high limits the amount of
 * memory which can be consumed by either anonymous memory
 * or pagecache.
 */
static int test_memcg_high(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long high;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.high", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.high", "30M"))
		goto cleanup;

	if (cg_run(memcg, alloc_anon, (void *)MB(31)))
		goto cleanup;

	if (!cg_run(memcg, alloc_pagecache_50M_check, NULL))
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
		goto cleanup;

	high = cg_read_key_long(memcg, "memory.events", "high ");
	if (high <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

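/*
 * mmap() an anonymous region of the given size and mlock() it, so the
 * whole charge is driven by a single kernel entry.
 */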
static int alloc_anon_mlock(const char *cgroup, void *arg)
{
	size_t size = (size_t)arg;
	void *buf;

	/* Pass fd = -1, as POSIX requires for MAP_ANON mappings */
	buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
		   -1, 0);
	if (buf == MAP_FAILED)
		return -1;

	mlock(buf, size);
	munmap(buf, size);
	return 0;
}

/*
 * This test checks that memory.high is able to throttle a big single-shot
 * allocation, i.e. a large allocation within one kernel entry.
 */
static int test_memcg_high_sync(const char *root)
{
	int ret = KSFT_FAIL, pid, fd = -1;
	char *memcg;
	long pre_high, pre_max;
	long post_high, post_max;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	pre_high = cg_read_key_long(memcg, "memory.events", "high ");
	pre_max = cg_read_key_long(memcg, "memory.events", "max ");
	if (pre_high < 0 || pre_max < 0)
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.high", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "140M"))
		goto cleanup;

	fd = memcg_prepare_for_wait(memcg);
	if (fd < 0)
		goto cleanup;

	pid = cg_run_nowait(memcg, alloc_anon_mlock, (void *)MB(200));
	if (pid < 0)
		goto cleanup;

	cg_wait_for(fd);

	post_high = cg_read_key_long(memcg, "memory.events", "high ");
	post_max = cg_read_key_long(memcg, "memory.events", "max ");
	if (post_high < 0 || post_max < 0)
		goto cleanup;

	if (pre_high == post_high || pre_max != post_max)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (fd >= 0)
		close(fd);
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

/*
 * This test checks that memory.max limits the amount of
 * memory which can be consumed by either anonymous memory
 * or pagecache.
 */
static int test_memcg_max(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long current, max;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	/* Should be killed by OOM killer */
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_run(memcg, alloc_pagecache_max_30M, NULL))
		goto cleanup;

	current = cg_read_long(memcg, "memory.current");
	if (current > MB(30) || !current)
		goto cleanup;

	max = cg_read_key_long(memcg, "memory.events", "max ");
	if (max <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

/*
 * This test checks that memory.reclaim reclaims the given
 * amount of memory (from both anon and file, if possible).
 */
static int test_memcg_reclaim(const char *root)
{
	int ret = KSFT_FAIL, fd = -1, retries;
	char *memcg;
	long current, expected_usage, to_reclaim;
	char buf[64];

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	current = cg_read_long(memcg, "memory.current");
	if (current != 0)
		goto cleanup;

	fd = get_temp_fd();
	if (fd < 0)
		goto cleanup;

	cg_run_nowait(memcg, alloc_pagecache_50M_noexit, (void *)(long)fd);

	/*
	 * If swap is enabled, try to reclaim from both anon and file, else try
	 * to reclaim from file only.
	 */
	if (is_swap_enabled()) {
		cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(50));
		expected_usage = MB(100);
	} else
		expected_usage = MB(50);

	/*
	 * Wait until current usage reaches the expected usage (or we run out of
	 * retries).
	 */
	retries = 5;
	while (!values_close(cg_read_long(memcg, "memory.current"),
			    expected_usage, 10)) {
		if (retries--) {
			sleep(1);
			continue;
		} else {
			fprintf(stderr,
				"failed to allocate %ld for memcg reclaim test\n",
				expected_usage);
			goto cleanup;
		}
	}

	/*
	 * Reclaim until current reaches 30M; this makes sure we hit both anon
	 * and file if swap is enabled.
	 */
	retries = 5;
	while (true) {
		int err;

		current = cg_read_long(memcg, "memory.current");
		to_reclaim = current - MB(30);

		/*
		 * We only keep looping if we get EAGAIN, which means we could
		 * not reclaim the full amount.
		 */
		if (to_reclaim <= 0)
			goto cleanup;

		snprintf(buf, sizeof(buf), "%ld", to_reclaim);
		err = cg_write(memcg, "memory.reclaim", buf);
		if (!err) {
			/*
			 * If writing succeeds, then the written amount should have been
			 * fully reclaimed (and maybe more).
			 */
			current = cg_read_long(memcg, "memory.current");
			if (!values_close(current, MB(30), 3) && current > MB(30))
				goto cleanup;
			break;
		}

		/* The kernel could not reclaim the full amount, try again. */
		if (err == -EAGAIN && retries--)
			continue;

		/* We got an unexpected error or ran out of retries. */
		goto cleanup;
	}

	ret = KSFT_PASS;
cleanup:
	cg_destroy(memcg);
	free(memcg);
	if (fd >= 0)
		close(fd);

	return ret;
}

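/*
 * Allocate and touch 50M of anonymous memory in a cgroup whose
 * memory.max (passed via arg) is below that, then verify that the
 * overflow was swapped out: memory.current stays near the limit, and
 * memory.current + memory.swap.current adds up to the allocation size.
 */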
static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
{
	long mem_max = (long)arg;
	size_t size = MB(50);
	char *buf, *ptr;
	long mem_current, swap_current;
	int ret = -1;

	buf = malloc(size);
	if (buf == NULL) {
		fprintf(stderr, "malloc() failed\n");
		return -1;
	}

	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
		*ptr = 0;

	mem_current = cg_read_long(cgroup, "memory.current");
	if (!mem_current || !values_close(mem_current, mem_max, 3))
		goto cleanup;

	swap_current = cg_read_long(cgroup, "memory.swap.current");
	if (!swap_current ||
	    !values_close(mem_current + swap_current, size, 3))
		goto cleanup;

	ret = 0;
cleanup:
	free(buf);
	return ret;
}

/*
 * This test checks that memory.swap.max limits the amount of
 * anonymous memory which can be swapped out.
 */
static int test_memcg_swap_max(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	long max;

	if (!is_swap_enabled())
		return KSFT_SKIP;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_read_long(memcg, "memory.swap.current")) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	if (cg_read_strcmp(memcg, "memory.max", "max\n"))
		goto cleanup;

	if (cg_read_strcmp(memcg, "memory.swap.max", "max\n"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	/* Should be killed by OOM killer */
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
		goto cleanup;

	if (cg_run(memcg, alloc_anon_50M_check_swap, (void *)MB(30)))
		goto cleanup;

	max = cg_read_key_long(memcg, "memory.events", "max ");
	if (max <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM. Then it checks for oom and oom_kill events in
 * memory.events.
 */
static int test_memcg_oom_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "30M"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_read_strcmp(memcg, "cgroup.procs", ""))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom ") != 1)
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 1)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

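/* Arguments for the forked TCP server: port to bind and a control pipe */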
struct tcp_server_args {
	unsigned short port;
	int ctl[2];
};

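/*
 * Bind a TCP server to the given port, report the bind result over the
 * control pipe (errno on failure, 0 on success), then stream data to
 * the first client until the connection is reset.
 */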
static int tcp_server(const char *cgroup, void *arg)
{
	struct tcp_server_args *srv_args = arg;
	struct sockaddr_in6 saddr = { 0 };
	socklen_t slen = sizeof(saddr);
	int sk, client_sk, ctl_fd, yes = 1, ret = -1;

	close(srv_args->ctl[0]);
	ctl_fd = srv_args->ctl[1];

	saddr.sin6_family = AF_INET6;
	saddr.sin6_addr = in6addr_any;
	saddr.sin6_port = htons(srv_args->port);

	sk = socket(AF_INET6, SOCK_STREAM, 0);
	if (sk < 0)
		return ret;

	if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0)
		goto cleanup;

	if (bind(sk, (struct sockaddr *)&saddr, slen)) {
		write(ctl_fd, &errno, sizeof(errno));
		goto cleanup;
	}

	if (listen(sk, 1))
		goto cleanup;

	ret = 0;
	if (write(ctl_fd, &ret, sizeof(ret)) != sizeof(ret)) {
		ret = -1;
		goto cleanup;
	}

	client_sk = accept(sk, NULL, NULL);
	if (client_sk < 0)
		goto cleanup;

	ret = -1;
	for (;;) {
		uint8_t buf[0x100000];

		if (write(client_sk, buf, sizeof(buf)) <= 0) {
			if (errno == ECONNRESET)
				ret = 0;
			break;
		}
	}

	close(client_sk);

cleanup:
	close(sk);
	return ret;
}

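/*
 * Connect to the server on localhost:port and read from the socket,
 * comparing memory.current against the "sock" counter in memory.stat
 * after each read, until they converge or the retries run out.
 */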
static int tcp_client(const char *cgroup, unsigned short port)
{
	const char server[] = "localhost";
	struct addrinfo *ai;
	char servport[6];
	int retries = 0x10; /* nice round number */
	int sk, ret;

	snprintf(servport, sizeof(servport), "%hu", port);
	ret = getaddrinfo(server, servport, NULL, &ai);
	if (ret)
		return ret;

	sk = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
	if (sk < 0)
		goto free_ainfo;

	ret = connect(sk, ai->ai_addr, ai->ai_addrlen);
	if (ret < 0)
		goto close_sk;

	ret = KSFT_FAIL;
	while (retries--) {
		uint8_t buf[0x100000];
		long current, sock;

		if (read(sk, buf, sizeof(buf)) <= 0)
			goto close_sk;

		current = cg_read_long(cgroup, "memory.current");
		sock = cg_read_key_long(cgroup, "memory.stat", "sock ");

		if (current < 0 || sock < 0)
			goto close_sk;

		if (values_close(current, sock, 10)) {
			ret = KSFT_PASS;
			break;
		}
	}

close_sk:
	close(sk);
free_ainfo:
	freeaddrinfo(ai);
	return ret;
}

/*
 * This test checks socket memory accounting.
 * The test forks a TCP server that listens on a random port between
 * 1000 and 61000. Once it gets a client connection, it starts writing
 * to its socket.
 * The TCP client interleaves reads from the socket with checks that
 * memory.current and the "sock" counter in memory.stat stay close.
 */
static int test_memcg_sock(const char *root)
{
	int bind_retries = 5, ret = KSFT_FAIL, pid, err;
	unsigned short port;
	char *memcg;

	memcg = cg_name(root, "memcg_test");
	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	while (bind_retries--) {
		struct tcp_server_args args;

		if (pipe(args.ctl))
			goto cleanup;

		port = args.port = 1000 + rand() % 60000;

		pid = cg_run_nowait(memcg, tcp_server, &args);
		if (pid < 0)
			goto cleanup;

		close(args.ctl[1]);
		if (read(args.ctl[0], &err, sizeof(err)) != sizeof(err))
			goto cleanup;
		close(args.ctl[0]);

		if (!err)
			break;
		if (err != EADDRINUSE)
			goto cleanup;

		waitpid(pid, NULL, 0);
	}

	if (err == EADDRINUSE) {
		ret = KSFT_SKIP;
		goto cleanup;
	}

	if (tcp_client(memcg, port) != KSFT_PASS)
		goto cleanup;

	waitpid(pid, &err, 0);
	if (WEXITSTATUS(err))
		goto cleanup;

	if (cg_read_long(memcg, "memory.current") < 0)
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.stat", "sock "))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	cg_destroy(memcg);
	free(memcg);

	return ret;
}

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.oom.group set. Then it checks that all
 * processes in the leaf were killed. It also checks that oom_kill
 * events were propagated to the parent level.
 */
static int test_memcg_oom_group_leaf_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *parent, *child;
	long parent_oom_events;

	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");

	if (!parent || !child)
		goto cleanup;

	if (cg_create(parent))
		goto cleanup;

	if (cg_create(child))
		goto cleanup;

	if (cg_write(parent, "cgroup.subtree_control", "+memory"))
		goto cleanup;

	if (cg_write(child, "memory.max", "50M"))
		goto cleanup;

	if (cg_write(child, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(child, "memory.oom.group", "1"))
		goto cleanup;

	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	if (!cg_run(child, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_test_proc_killed(child))
		goto cleanup;

	if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0)
		goto cleanup;

	parent_oom_events = cg_read_key_long(
			parent, "memory.events", "oom_kill ");
	/*
	 * If memory_localevents is not enabled (the default), the parent should
	 * count OOM events in its children groups. Otherwise, it should not
	 * have observed any events.
	 */
	if (has_localevents && parent_oom_events != 0)
		goto cleanup;
	else if (!has_localevents && parent_oom_events <= 0)
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (child)
		cg_destroy(child);
	if (parent)
		cg_destroy(parent);
	free(child);
	free(parent);

	return ret;
}

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.oom.group set. Then it checks that all
 * processes in the parent and leaf were killed.
 */
static int test_memcg_oom_group_parent_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *parent, *child;

	parent = cg_name(root, "memcg_test_0");
	child = cg_name(root, "memcg_test_0/memcg_test_1");

	if (!parent || !child)
		goto cleanup;

	if (cg_create(parent))
		goto cleanup;

	if (cg_create(child))
		goto cleanup;

	if (cg_write(parent, "memory.max", "80M"))
		goto cleanup;

	if (cg_write(parent, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(parent, "memory.oom.group", "1"))
		goto cleanup;

	cg_run_nowait(parent, alloc_anon_noexit, (void *) MB(60));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));
	cg_run_nowait(child, alloc_anon_noexit, (void *) MB(1));

	if (!cg_run(child, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_test_proc_killed(child))
		goto cleanup;
	if (cg_test_proc_killed(parent))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (child)
		cg_destroy(child);
	if (parent)
		cg_destroy(parent);
	free(child);
	free(parent);

	return ret;
}

/*
 * This test disables swapping and tries to allocate anonymous memory
 * up to OOM with memory.oom.group set. Then it checks that all
 * processes were killed except those set with OOM_SCORE_ADJ_MIN.
 */
static int test_memcg_oom_group_score_events(const char *root)
{
	int ret = KSFT_FAIL;
	char *memcg;
	int safe_pid;

	memcg = cg_name(root, "memcg_test_0");

	if (!memcg)
		goto cleanup;

	if (cg_create(memcg))
		goto cleanup;

	if (cg_write(memcg, "memory.max", "50M"))
		goto cleanup;

	if (cg_write(memcg, "memory.swap.max", "0"))
		goto cleanup;

	if (cg_write(memcg, "memory.oom.group", "1"))
		goto cleanup;

	safe_pid = cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
	if (set_oom_adj_score(safe_pid, OOM_SCORE_ADJ_MIN))
		goto cleanup;

	cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(1));
	if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
		goto cleanup;

	if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3)
		goto cleanup;

	if (kill(safe_pid, SIGKILL))
		goto cleanup;

	ret = KSFT_PASS;

cleanup:
	if (memcg)
		cg_destroy(memcg);
	free(memcg);

	return ret;
}

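/* T() pairs each test function with its printable name for the table below */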
#define T(x) { x, #x }
struct memcg_test {
	int (*fn)(const char *root);
	const char *name;
} tests[] = {
	T(test_memcg_subtree_control),
	T(test_memcg_current),
	T(test_memcg_min),
	T(test_memcg_low),
	T(test_memcg_high),
	T(test_memcg_high_sync),
	T(test_memcg_max),
	T(test_memcg_reclaim),
	T(test_memcg_oom_events),
	T(test_memcg_swap_max),
	T(test_memcg_sock),
	T(test_memcg_oom_group_leaf_events),
	T(test_memcg_oom_group_parent_events),
	T(test_memcg_oom_group_score_events),
};
#undef T

int main(int argc, char **argv)
{
	char root[PATH_MAX];
	int i, proc_status, ret = EXIT_SUCCESS;

	if (cg_find_unified_root(root, sizeof(root)))
		ksft_exit_skip("cgroup v2 isn't mounted\n");

	/*
	 * Check that memory controller is available:
	 * memory is listed in cgroup.controllers
	 */
	if (cg_read_strstr(root, "cgroup.controllers", "memory"))
		ksft_exit_skip("memory controller isn't available\n");

	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
		if (cg_write(root, "cgroup.subtree_control", "+memory"))
			ksft_exit_skip("Failed to set memory controller\n");

	proc_status = proc_mount_contains("memory_recursiveprot");
	if (proc_status < 0)
		ksft_exit_skip("Failed to query cgroup mount option\n");
	has_recursiveprot = proc_status;

	proc_status = proc_mount_contains("memory_localevents");
	if (proc_status < 0)
		ksft_exit_skip("Failed to query cgroup mount option\n");
	has_localevents = proc_status;

	for (i = 0; i < ARRAY_SIZE(tests); i++) {
		switch (tests[i].fn(root)) {
		case KSFT_PASS:
			ksft_test_result_pass("%s\n", tests[i].name);
			break;
		case KSFT_SKIP:
			ksft_test_result_skip("%s\n", tests[i].name);
			break;
		default:
			ret = EXIT_FAILURE;
			ksft_test_result_fail("%s\n", tests[i].name);
			break;
		}
	}

	return ret;
}