• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * cpuset user library implementation.
3  *
4  * Copyright (c) 2006-2007 Silicon Graphics, Inc. All rights reserved.
5  *
6  * Paul Jackson <pj@sgi.com>
7  */
8 
9 /*
10  *  This program is free software; you can redistribute it and/or modify
11  *  it under the terms of the GNU Lesser General Public License as published by
12  *  the Free Software Foundation; either version 2.1 of the License, or
13  *  (at your option) any later version.
14  *
15  *  This program is distributed in the hope that it will be useful,
16  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
17  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  *  GNU Lesser General Public License for more details.
19  *
20  *  You should have received a copy of the GNU Lesser General Public License
21  *  along with this program; if not, write to the Free Software
22  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
23  */
24 
25 #define _GNU_SOURCE	/* need to see pread() and syscall() */
26 #include <unistd.h>
27 
28 #include <ctype.h>
29 #include <dirent.h>
30 #include <errno.h>
31 #include <fcntl.h>
32 #include <fts.h>
33 #include <limits.h>
34 #include <signal.h>
35 #include <stdint.h>
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <string.h>
39 #include <sys/stat.h>
40 #include <sys/syscall.h>
41 #include <sys/types.h>
42 #include <time.h>
43 #include <utime.h>
44 #include <sys/utsname.h>	/* for cpuset_would_crash_kernel() */
45 
46 #include "bitmask.h"
47 #include "cpuset.h"
48 #include "common.h"
49 #include "test.h"
50 #include "lapi/syscalls.h"
51 #include "config.h"
52 
53 #if HAVE_LINUX_MEMPOLICY_H
54 #include <linux/mempolicy.h>
55 
56 /* Bump version, and update Change History, when libcpuset API changes */
57 #define CPUSET_VERSION 3
58 
59 /*
60  * For a history of what changed in each version, see the "Change
61  * History" section, at the end of the libcpuset master document.
62  */
63 
cpuset_version(void)64 int cpuset_version(void)
65 {
66 	return CPUSET_VERSION;
67 }
68 
/*
 * In-memory description of one cpuset: its CPU and memory node masks,
 * its option values, and per-field bookkeeping bits.
 */
struct cpuset {
	/* Resource masks. */
	struct bitmask *cpus;
	struct bitmask *mems;

	/* Option values; all but the last are stored as 0/1 flags. */
	char cpu_exclusive;
	char mem_exclusive;
	char mem_hardwall;
	char notify_on_release;
	char memory_migrate;
	char memory_pressure_enabled;
	char memory_spread_page;
	char memory_spread_slab;
	char sched_load_balance;
	int sched_relax_domain_level;

	/*
	 * One 'x_valid' bit per field 'x' above.
	 *
	 * apply_cpuset_settings() only writes out fields whose *_valid
	 * bit is set.  cpuset_alloc() leaves every bit clear (the struct
	 * comes from calloc()), and each cpuset_set*() routine sets the
	 * bit for the value it stores.
	 *
	 * Without these bits, creating a new cpuset could overwrite a
	 * non-zero kernel default (for instance an inherited
	 * memory_spread_* flag) with the zero left behind by calloc(),
	 * merely because the application never touched that field.
	 *
	 * Using 'char' for the values but bitfields for these markers
	 * is an arbitrary choice.
	 */
	unsigned int cpus_valid:1;
	unsigned int mems_valid:1;
	unsigned int cpu_exclusive_valid:1;
	unsigned int mem_exclusive_valid:1;
	unsigned int mem_hardwall_valid:1;
	unsigned int notify_on_release_valid:1;
	unsigned int memory_migrate_valid:1;
	unsigned int memory_pressure_enabled_valid:1;
	unsigned int memory_spread_page_valid:1;
	unsigned int memory_spread_slab_valid:1;
	unsigned int sched_load_balance_valid:1;
	unsigned int sched_relax_domain_level_valid:1;

	/* One 'x_dirty' bit per field 'x': set once that field is modified. */
	unsigned int cpus_dirty:1;
	unsigned int mems_dirty:1;
	unsigned int cpu_exclusive_dirty:1;
	unsigned int mem_exclusive_dirty:1;
	unsigned int mem_hardwall_dirty:1;
	unsigned int notify_on_release_dirty:1;
	unsigned int memory_migrate_dirty:1;
	unsigned int memory_pressure_enabled_dirty:1;
	unsigned int memory_spread_page_dirty:1;
	unsigned int memory_spread_slab_dirty:1;
	unsigned int sched_load_balance_dirty:1;
	unsigned int sched_relax_domain_level_dirty:1;
};
131 
/* Where the cpuset file system is presumed to be mounted. */
static const char *cpusetmnt = "/dev/cpuset";

/* Location of the stashed cpunodemap[] copy (cpu -> node mapping). */
static const char *mapfile = "/var/run/cpunodemap";

/* Root of the tree holding the primary source for cpunodemap[]. */
static const char *sysdevices = "/sys/devices/system";

/* Small buffer size: enough for a boolean flag or a map file (1 or 2 ints). */
#define SMALL_BUFSZ 16
143 
144 /*
145  * The 'mask_size_file' is used to ferret out the kernel cpumask_t
146  * and nodemask_t sizes.  The lines in this file that begin with the
147  * strings 'cpumask_prefix' and 'nodemask_prefix' display a cpumask
148  * and nodemask string, respectively.  The lengths of these strings
149  * reflect the kernel's internal cpumask_t and nodemask_t sizes,
150  * which sizes are needed to correctly call the sched_setaffinity
151  * and set_mempolicy system calls, and to size user level
152  * bitmasks to match the kernels.
153  */
154 
/* File whose lines reveal the kernel mask sizes, and the line prefixes to match. */
static const char *mask_size_file = "/proc/self/status";
static const char *cpumask_prefix = "Cpus_allowed:\t";
static const char *nodemask_prefix = "Mems_allowed:\t";
158 
159 /*
160  * Sizes of kernel cpumask_t and nodemask_t bitmaps, in bits.
161  *
162  * The first time we need these, we parse the Cpus_allowed and
163  * Mems_allowed lines from mask_size_file ("/proc/self/status").
164  */
165 
/* Cached mask sizes in bits; zero means "not determined yet". */
static int cpumask_sz, nodemask_sz;
168 
169 /*
170  * These defaults only kick in if we fail to size the kernel
171  * cpumask and nodemask by reading the Cpus_allowed and
172  * Mems_allowed fields from the /proc/self/status file.
173  */
174 
/* Fallback mask sizes, in bits: the node default is half the cpu default. */
#define DEFCPUBITS (512)
#define DEFNODEBITS (DEFCPUBITS / 2)
177 
178 /*
179  * Arch-neutral API for obtaining NUMA distances between CPUs
180  * and Memory Nodes, via the files:
181  *	/sys/devices/system/node/nodeN/distance
182  * which have lines such as:
183  *	46 66 10 20
184  * which say that for cpu on node N (from the path above), the
185  * distance to nodes 0, 1, 2, and 3 are 46, 66, 10, and 20,
186  * respectively.
187  */
188 
/* Directory holding the per-node nodeN/distance files. */
static const char *distance_directory = "/sys/devices/system/node";
190 
191 /*
192  * Someday, we should disable, then later discard, the SN code
193  * marked ALTERNATE_SN_DISTMAP.
194  */
195 
#define ALTERNATE_SN_DISTMAP 1
#ifdef ALTERNATE_SN_DISTMAP

/*
 * SN (SGI ia64) specific alternative source of NUMA distances between
 * CPUs and Memory Nodes: the file /proc/sgi_sn/sn_topology.  It holds
 * lines of the form:
 *
 *   node 2 001c14#0 local asic SHub_1.1, nasid 0x4, dist 46:66:10:20
 *
 * meaning that from every CPU on node 2 the distances to nodes
 * 0, 1, 2 and 3 are 46, 66, 10 and 20, respectively.
 *
 * Lines starting with any keyword other than "node" are to be ignored.
 */

static const char *sn_topology = "/proc/sgi_sn/sn_topology";
static const char *sn_top_node_prefix = "node ";

#endif
217 
218 /*
219  * Check that cpusets are supported and /dev/cpuset is mounted.
220  * If ok, return 0.
221  * If not, return -1 and set errno:
222  *	ENOSYS - kernel doesn't support cpusets
223  *	ENODEV - /dev/cpuset not mounted
224  */
225 
/* Cached outcome of the probe done by check(). */
static enum {
	check_notdone,
	check_enosys,
	check_enodev,
	check_ok
} check_state = check_notdone;

/*
 * Probe once for cpuset support and remember the answer in check_state.
 * Returns 0 if both probe files can be stat'd.  Returns -1 with errno
 * set to ENOSYS if /proc/self/cpuset is missing, or to ENODEV if
 * /dev/cpuset/tasks is missing.  Later calls reuse the cached result.
 */
static int check(void)
{
	struct stat sb;

	if (check_state == check_notdone) {
		if (stat("/proc/self/cpuset", &sb) < 0)
			check_state = check_enosys;
		else if (stat("/dev/cpuset/tasks", &sb) < 0)
			check_state = check_enodev;
		else
			check_state = check_ok;
	}

	if (check_state == check_enosys) {
		errno = ENOSYS;
		return -1;
	}
	if (check_state == check_enodev) {
		errno = ENODEV;
		return -1;
	}
	return 0;
}
263 
/*
 * Strip all trailing '\n' and '\r' characters from string s, in place.
 *
 * Works from a length index rather than a pointer, so that an empty
 * string never requires forming a pointer before the start of s.
 */
static void chomp(char *s)
{
	size_t len = strlen(s);

	while (len > 0 && (s[len - 1] == '\n' || s[len - 1] == '\r'))
		s[--len] = '\0';
}
275 
276 /*
277  * Determine number of bytes in a seekable open file, without
278  * assuming that stat(2) on that file has a useful size.
279  * Has the side effect of leaving the file rewound to the beginning.
280  */
/*
 * Count the bytes in stream fp by reading it from the start to EOF.
 * The stream is rewound both before counting and before returning.
 */
static int filesize(FILE * fp)
{
	int nbytes;

	rewind(fp);
	for (nbytes = 0; fgetc(fp) != EOF; nbytes++)
		continue;
	rewind(fp);
	return nbytes;
}
290 
291 /* Are strings s1 and s2 equal? */
/* Return 1 if strings s1 and s2 compare equal, else 0. */
static int streq(const char *s1, const char *s2)
{
	return !strcmp(s1, s2);
}
296 
297 /* Is string 'pre' a prefix of string 's'? */
/* Return 1 if string s begins with string pre (an empty pre always matches), else 0. */
static int strprefix(const char *s, const char *pre)
{
	size_t prelen = strlen(pre);

	return !strncmp(s, pre, prelen);
}
302 
303 /*
304  * char *flgets(char *buf, int buflen, FILE *fp)
305  *
306  * Obtain one line from input file fp.  Copy up to first
307  * buflen-1 chars of line into buffer buf, discarding any remainder
308  * of line.  Stop reading at newline, discarding newline.
309  * Nul terminate result and return pointer to buffer buf
310  * on success, or NULL if nothing more to read or failure.
311  */
312 
/*
 * Read one line from fp into buf (capacity buflen, including the nul).
 * At most buflen-1 characters are stored; the rest of an over-long line
 * is consumed and dropped.  The newline itself is never stored.
 * Returns buf, or NULL when nothing could be read.
 */
static char *flgets(char *buf, int buflen, FILE * fp)
{
	char *out = buf;
	int ch = -1;

	for (;;) {
		if (--buflen <= 0)
			break;		/* buffer full: stop before reading more */
		ch = getc(fp);
		if (ch < 0)
			break;		/* EOF or read error */
		if (ch == '\n') {
			*out = '\0';
			return buf;
		}
		*out++ = (char)ch;
	}

	if (ch < 0 && out == buf)
		return NULL;

	/* Buffer filled mid-line: swallow the remainder up to the newline. */
	if (ch > 0) {
		do {
			ch = getc(fp);
		} while (ch >= 0 && ch != '\n');
	}

	*out = '\0';
	return buf;
}
338 
339 /*
340  * sgetc(const char *inputbuf, int *offsetptr)
341  *
342  * Return next char from nul-terminated input buffer inputbuf,
343  * starting at offset *offsetptr.  Increment *offsetptr.
344  * If next char would be nul ('\0'), return EOF and don't
345  * increment *offsetptr.
346  */
347 
/*
 * Fetch the character at inputbuf[*offsetptr].
 *
 * If it is the terminating nul, return EOF and leave *offsetptr alone.
 * Otherwise advance *offsetptr by one and return the character as a
 * non-negative value.  The byte is read as unsigned char (the same
 * convention getc() uses) so that bytes >= 0x80 are never negative and
 * 0xFF cannot be mistaken for EOF where plain char is signed.
 */
static int sgetc(const char *inputbuf, int *offsetptr)
{
	unsigned char c = (unsigned char)inputbuf[*offsetptr];

	if (c == '\0')
		return EOF;
	*offsetptr += 1;
	return c;
}
359 
360 /*
361  * char *slgets(char *buf, int buflen, const char *inputbuf, int *offsetptr)
362  *
363  * Obtain next line from nul-terminated input buffer 'inputbuf',
364  * starting at offset *offsetptr.  Copy up to first buflen-1
365  * chars of line into output buffer buf, discarding any remainder
366  * of line.  Stop reading at newline, discarding newline.
367  * Nul terminate result and return pointer to output buffer
368  * buf on success, or NULL if nothing more to read.
369  */
370 
/*
 * Copy the next line of the nul-terminated text 'inputbuf', starting at
 * offset *offsetptr, into buf (capacity buflen, including the nul).
 * Characters are pulled with sgetc(), which advances *offsetptr.
 * At most buflen-1 characters are stored; the rest of an over-long line
 * is consumed and dropped.  The newline itself is never stored.
 * Returns buf, or NULL when there is nothing left to read.
 */
static char *slgets(char *buf, int buflen, const char *inputbuf, int *offsetptr)
{
	char *out = buf;
	int ch = -1;

	for (;;) {
		if (--buflen <= 0)
			break;		/* buffer full: stop before reading more */
		ch = sgetc(inputbuf, offsetptr);
		if (ch < 0)
			break;		/* end of input text */
		if (ch == '\n') {
			*out = '\0';
			return buf;
		}
		*out++ = (char)ch;
	}

	if (ch < 0 && out == buf)
		return NULL;

	/* Buffer filled mid-line: swallow the remainder up to the newline. */
	if (ch > 0) {
		do {
			ch = sgetc(inputbuf, offsetptr);
		} while (ch >= 0 && ch != '\n');
	}

	*out = '\0';
	return buf;
}
396 
397 /*
398  * time_t get_mtime(char *path)
399  *
400  * Return modtime of file at location path, else return 0.
401  */
402 
/* Return the st_mtime reported by stat(2) for path, or 0 if stat fails. */
static time_t get_mtime(const char *path)
{
	struct stat sb;

	return (stat(path, &sb) == 0) ? sb.st_mtime : 0;
}
411 
412 /*
413  * int set_mtime(const char *path, time_t mtime)
414  *
415  * Set modtime of file 'path' to 'mtime'.  Return 0 on success,
416  * or -1 on error, setting errno.
417  */
418 
/*
 * Set both the access and the modification time of 'path' to 'mtime'
 * via utime(2), and return utime()'s result.
 */
static int set_mtime(const char *path, time_t mtime)
{
	struct utimbuf stamp;

	stamp.modtime = mtime;
	stamp.actime = mtime;
	return utime(path, &stamp);
}
427 
428 /*
429  * True if two pathnames resolve to same file.
430  * False if either path can not be stat'd,
431  * or if the two paths resolve to a different file.
432  */
433 
/*
 * Return 1 if path1 and path2 stat to the same device and inode.
 * Return 0 if either stat fails or the two identify different files.
 */
static int samefile(const char *path1, const char *path2)
{
	struct stat a, b;

	if (stat(path1, &a) != 0 || stat(path2, &b) != 0)
		return 0;
	if (a.st_dev != b.st_dev)
		return 0;
	return a.st_ino == b.st_ino;
}
444 
/*
 * Compress path p in place and return p.
 *
 * Runs of slashes collapse to one, "." components are dropped, and a
 * trailing slash is removed.  A leading slash is kept.  If nothing
 * remains of a relative path the result is ".".  NULL and the empty
 * string are returned untouched.  ".." components are copied as-is.
 */
static char *pathcomp(char *p)
{
	char *src = p;		/* next character to examine */
	char *dst = p;		/* next position to write */

	if (p == NULL || *p == '\0')
		return p;

	/* An absolute path keeps exactly one leading slash. */
	if (*src == '/')
		*dst++ = *src++;

	for (;;) {
		/* Step over a whole run of separators. */
		if (*src == '/') {
			do {
				src++;
			} while (*src == '/');
		}

		if (*src == '\0')
			break;

		/* A lone "." component contributes nothing. */
		if (src[0] == '.' && (src[1] == '/' || src[1] == '\0')) {
			src++;
			continue;
		}

		/* Copy one component, preceded by a separator if needed. */
		if (dst != p && dst[-1] != '/')
			*dst++ = '/';
		while (*src != '/' && *src != '\0')
			*dst++ = *src++;
	}

	if (dst == p)
		*dst++ = '.';
	*dst = '\0';
	return p;
}
482 
483 /*
484  * pathcat2(buf, buflen, name1, name2)
485  *
486  * Return buf, of length buflen, with name1/name2 stored in it.
487  */
488 
/*
 * Format "name1/name2" into buf (capacity buflen) with snprintf(),
 * then compress the result with pathcomp().  Returns buf.
 */
static char *pathcat2(char *buf, int buflen, const char *name1,
		      const char *name2)
{
	(void)snprintf(buf, buflen, "%s/%s", name1, name2);
	pathcomp(buf);
	return buf;
}
495 
496 /*
497  * pathcat3(buf, buflen, name1, name2, name3)
498  *
499  * Return buf, of length buflen, with name1/name2/name3 stored in it.
500  */
501 
/*
 * Format "name1/name2/name3" into buf (capacity buflen) with snprintf(),
 * then compress the result with pathcomp().  Returns buf.
 */
static char *pathcat3(char *buf, int buflen, const char *name1,
		      const char *name2, const char *name3)
{
	(void)snprintf(buf, buflen, "%s/%s/%s", name1, name2, name3);
	pathcomp(buf);
	return buf;
}
508 
509 /*
510  * fullpath(buf, buflen, name)
511  *
512  * Put full path of cpuset 'name' in buffer 'buf'.  If name
513  * starts with a slash (``/``) character, then this is a path
514  * relative to ``/dev/cpuset``, otherwise it is relative to
515  * the current tasks cpuset.  Return 0 on success, else
516  * -1 on error, setting errno.
517  */
518 
fullpath(char * buf,int buflen,const char * name)519 static int fullpath(char *buf, int buflen, const char *name)
520 {
521 	int len;
522 
523 	/* easy case */
524 	if (*name == '/') {
525 		pathcat2(buf, buflen, cpusetmnt, name);
526 		pathcomp(buf);
527 		return 0;
528 	}
529 
530 	/* hard case */
531 	snprintf(buf, buflen, "%s/", cpusetmnt);
532 	len = strlen(buf);
533 	if (cpuset_getcpusetpath(0, buf + len, buflen - len) == NULL)
534 		return -1;
535 	if (strlen(buf) >= buflen - 1 - strlen(name)) {
536 		errno = E2BIG;
537 		return -1;
538 	}
539 	strcat(buf, "/");
540 	strcat(buf, name);
541 	pathcomp(buf);
542 	return 0;
543 }
544 
545 /*
546  * fullpath2(buf, buflen, name1, name2)
547  *
548  * Like fullpath(), only concatenate two pathname components on end.
549  */
550 
/*
 * Like fullpath(buf, buflen, name1), then append "/name2" and compress
 * the result with pathcomp().
 *
 * Returns 0 on success.  Returns -1 if fullpath() fails, or -1 with
 * errno E2BIG if name2 does not fit in buf.
 */
static int fullpath2(char *buf, int buflen, const char *name1,
		     const char *name2)
{
	size_t used;

	if (fullpath(buf, buflen, name1) < 0)
		return -1;

	/*
	 * Need room for buf + "/" + name2 + nul.  Sum the lengths instead
	 * of subtracting from buflen: the subtraction is done in unsigned
	 * arithmetic and wraps when name2 is longer than the buffer, which
	 * would let the strcat() calls below overrun buf.
	 */
	used = strlen(buf);
	if (buflen <= 0 || used + 1 + strlen(name2) >= (size_t)buflen) {
		errno = E2BIG;
		return -1;
	}
	strcat(buf, "/");
	strcat(buf, name2);
	pathcomp(buf);
	return 0;
}
565 
566 /*
567  * Convert the string length of an ascii hex mask to the number
568  * of bits represented by that mask.
569  *
570  * The cpumask and nodemask values in /proc/self/status are in an
571  * ascii format that uses 9 characters for each 32 bits of mask.
572  */
/* Map a mask string length to a bit count: 32 bits per 9 characters. */
static int s2nbits(const char *s)
{
	size_t nchars = strlen(s);

	return nchars * 32 / 9;
}
577 
update_mask_sizes(void)578 static void update_mask_sizes(void)
579 {
580 	FILE *fp = NULL;
581 	char *buf = NULL;
582 	int fsize;
583 
584 	if ((fp = fopen(mask_size_file, "r")) == NULL)
585 		goto done;
586 	fsize = filesize(fp);
587 	if ((buf = malloc(fsize)) == NULL)
588 		goto done;
589 
590 	/*
591 	 * Beware: mask sizing arithmetic is fussy.
592 	 * The trailing newline left by fgets() is required.
593 	 */
594 	while (fgets(buf, fsize, fp)) {
595 		if (strprefix(buf, cpumask_prefix))
596 			cpumask_sz = s2nbits(buf + strlen(cpumask_prefix));
597 		if (strprefix(buf, nodemask_prefix))
598 			nodemask_sz = s2nbits(buf + strlen(nodemask_prefix));
599 	}
600 done:
601 	free(buf);
602 	if (fp != NULL)
603 		fclose(fp);
604 	if (cpumask_sz == 0)
605 		cpumask_sz = DEFCPUBITS;
606 	if (nodemask_sz == 0)
607 		nodemask_sz = DEFNODEBITS;
608 }
609 
610 /* Allocate a new struct cpuset */
cpuset_alloc(void)611 struct cpuset *cpuset_alloc(void)
612 {
613 	struct cpuset *cp = NULL;
614 	int nbits;
615 
616 	if ((cp = calloc(1, sizeof(struct cpuset))) == NULL)
617 		goto err;
618 
619 	nbits = cpuset_cpus_nbits();
620 	if ((cp->cpus = bitmask_alloc(nbits)) == NULL)
621 		goto err;
622 
623 	nbits = cpuset_mems_nbits();
624 	if ((cp->mems = bitmask_alloc(nbits)) == NULL)
625 		goto err;
626 
627 	return cp;
628 err:
629 	if (cp && cp->cpus)
630 		bitmask_free(cp->cpus);
631 	if (cp && cp->mems)
632 		bitmask_free(cp->mems);
633 	free(cp);
634 	return NULL;
635 }
636 
637 /* Free struct cpuset *cp */
cpuset_free(struct cpuset * cp)638 void cpuset_free(struct cpuset *cp)
639 {
640 	if (!cp)
641 		return;
642 	if (cp->cpus)
643 		bitmask_free(cp->cpus);
644 	if (cp->mems)
645 		bitmask_free(cp->mems);
646 	free(cp);
647 }
648 
649 /* Number of bits in a CPU bitmask on current system */
cpuset_cpus_nbits(void)650 int cpuset_cpus_nbits(void)
651 {
652 	if (cpumask_sz == 0)
653 		update_mask_sizes();
654 	return cpumask_sz;
655 }
656 
657 /* Number of bits in a Memory bitmask on current system */
cpuset_mems_nbits(void)658 int cpuset_mems_nbits(void)
659 {
660 	if (nodemask_sz == 0)
661 		update_mask_sizes();
662 	return nodemask_sz;
663 }
664 
665 /* Set CPUs in cpuset cp to bitmask cpus */
cpuset_setcpus(struct cpuset * cp,const struct bitmask * cpus)666 int cpuset_setcpus(struct cpuset *cp, const struct bitmask *cpus)
667 {
668 	if (cp->cpus)
669 		bitmask_free(cp->cpus);
670 	cp->cpus = bitmask_alloc(bitmask_nbits(cpus));
671 	if (cp->cpus == NULL)
672 		return -1;
673 	bitmask_copy(cp->cpus, cpus);
674 	cp->cpus_valid = 1;
675 	cp->cpus_dirty = 1;
676 	return 0;
677 }
678 
679 /* Set Memory Nodes in cpuset cp to bitmask mems */
cpuset_setmems(struct cpuset * cp,const struct bitmask * mems)680 int cpuset_setmems(struct cpuset *cp, const struct bitmask *mems)
681 {
682 	if (cp->mems)
683 		bitmask_free(cp->mems);
684 	cp->mems = bitmask_alloc(bitmask_nbits(mems));
685 	if (cp->mems == NULL)
686 		return -1;
687 	bitmask_copy(cp->mems, mems);
688 	cp->mems_valid = 1;
689 	cp->mems_dirty = 1;
690 	return 0;
691 }
692 
693 /* Set integer value optname of cpuset cp */
cpuset_set_iopt(struct cpuset * cp,const char * optionname,int value)694 int cpuset_set_iopt(struct cpuset *cp, const char *optionname, int value)
695 {
696 	if (streq(optionname, "cpu_exclusive")) {
697 		cp->cpu_exclusive = ! !value;
698 		cp->cpu_exclusive_valid = 1;
699 		cp->cpu_exclusive_dirty = 1;
700 	} else if (streq(optionname, "mem_exclusive")) {
701 		cp->mem_exclusive = ! !value;
702 		cp->mem_exclusive_valid = 1;
703 		cp->mem_exclusive_dirty = 1;
704 	} else if (streq(optionname, "mem_hardwall")) {
705 		cp->mem_hardwall = ! !value;
706 		cp->mem_hardwall_valid = 1;
707 		cp->mem_hardwall_dirty = 1;
708 	} else if (streq(optionname, "notify_on_release")) {
709 		cp->notify_on_release = ! !value;
710 		cp->notify_on_release_valid = 1;
711 		cp->notify_on_release_dirty = 1;
712 	} else if (streq(optionname, "memory_pressure_enabled")) {
713 		cp->memory_pressure_enabled = ! !value;
714 		cp->memory_pressure_enabled_valid = 1;
715 		cp->memory_pressure_enabled_dirty = 1;
716 	} else if (streq(optionname, "memory_migrate")) {
717 		cp->memory_migrate = ! !value;
718 		cp->memory_migrate_valid = 1;
719 		cp->memory_migrate_dirty = 1;
720 	} else if (streq(optionname, "memory_spread_page")) {
721 		cp->memory_spread_page = ! !value;
722 		cp->memory_spread_page_valid = 1;
723 		cp->memory_spread_page_dirty = 1;
724 	} else if (streq(optionname, "memory_spread_slab")) {
725 		cp->memory_spread_slab = ! !value;
726 		cp->memory_spread_slab_valid = 1;
727 		cp->memory_spread_slab_dirty = 1;
728 	} else if (streq(optionname, "sched_load_balance")) {
729 		cp->sched_load_balance = ! !value;
730 		cp->sched_load_balance_valid = 1;
731 		cp->sched_load_balance_dirty = 1;
732 	} else if (streq(optionname, "sched_relax_domain_level")) {
733 		cp->sched_relax_domain_level = value;
734 		cp->sched_relax_domain_level_valid = 1;
735 		cp->sched_relax_domain_level_dirty = 1;
736 	} else
737 		return -2;	/* optionname not recognized */
738 	return 0;
739 }
740 
741 /* [optional] Set string value optname */
/*
 * String-valued option setter.  No string options exist yet, so every
 * call reports "optionname not recognized" by returning -2.
 */
int cpuset_set_sopt(UNUSED struct cpuset *cp, UNUSED const char *optionname,
		    UNUSED const char *value)
{
	const int unrecognized = -2;

	return unrecognized;
}
747 
748 /* Return handle for reading memory_pressure. */
cpuset_open_memory_pressure(const char * cpusetpath)749 int cpuset_open_memory_pressure(const char *cpusetpath)
750 {
751 	char buf[PATH_MAX];
752 
753 	fullpath2(buf, sizeof(buf), cpusetpath, "memory_pressure");
754 	return open(buf, O_RDONLY);
755 }
756 
757 /* Return current memory_pressure of cpuset. */
cpuset_read_memory_pressure(int han)758 int cpuset_read_memory_pressure(int han)
759 {
760 	char buf[SMALL_BUFSZ];
761 
762 	if (pread(han, buf, sizeof(buf), 0L) < 0)
763 		return -1;
764 	return atoi(buf);
765 }
766 
767 /* Close handle for reading memory pressure. */
/* Close handle 'han'; the result of close(2) is not reported. */
void cpuset_close_memory_pressure(int han)
{
	(void)close(han);
}
772 
773 /*
774  * Resolve cpuset pointer (to that of current task if cp == NULL).
775  *
776  * If cp not NULL, just return it.  If cp is NULL, return pointer
777  * to temporary cpuset for current task, and set *cp_tofree to
778  * pointer to that same temporary cpuset, to be freed later.
779  *
780  * Return NULL and set errno on error.  Errors can occur when
781  * resolving the current tasks cpuset.
782  */
/*
 * Return cp itself when it is non-NULL.  Otherwise allocate a cpuset,
 * fill it in with cpuset_cpusetofpid(..., 0), store it in *cp_tofree
 * for the caller to free, and return it.  Returns NULL if the
 * allocation or the lookup fails; *cp_tofree is then left untouched.
 */
static const struct cpuset *resolve_cp(const struct cpuset *cp,
				       struct cpuset **cp_tofree)
{
	struct cpuset *mine;

	if (cp != NULL)
		return cp;

	mine = cpuset_alloc();
	if (mine == NULL)
		return NULL;

	if (cpuset_cpusetofpid(mine, 0) < 0) {
		cpuset_free(mine);
		return NULL;
	}

	*cp_tofree = mine;
	return mine;
}
805 
806 /* Write CPUs in cpuset cp (current task if cp == NULL) to bitmask cpus */
cpuset_getcpus(const struct cpuset * cp,struct bitmask * cpus)807 int cpuset_getcpus(const struct cpuset *cp, struct bitmask *cpus)
808 {
809 	struct cpuset *cp_tofree = NULL;
810 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
811 
812 	if (!cp1)
813 		goto err;
814 	if (cp1->cpus == NULL) {
815 		errno = EINVAL;
816 		goto err;
817 	}
818 	bitmask_copy(cpus, cp1->cpus);
819 	cpuset_free(cp_tofree);
820 	return 0;
821 err:
822 	cpuset_free(cp_tofree);
823 	return -1;
824 }
825 
826 /* Write Memory Nodes in cp (current task if cp == NULL) to bitmask mems */
cpuset_getmems(const struct cpuset * cp,struct bitmask * mems)827 int cpuset_getmems(const struct cpuset *cp, struct bitmask *mems)
828 {
829 	struct cpuset *cp_tofree = NULL;
830 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
831 
832 	if (!cp1)
833 		goto err;
834 	if (cp1->mems == NULL) {
835 		errno = EINVAL;
836 		goto err;
837 	}
838 	bitmask_copy(mems, cp1->mems);
839 	cpuset_free(cp_tofree);
840 	return 0;
841 err:
842 	cpuset_free(cp_tofree);
843 	return -1;
844 }
845 
846 /* Return number of CPUs in cpuset cp (current task if cp == NULL) */
cpuset_cpus_weight(const struct cpuset * cp)847 int cpuset_cpus_weight(const struct cpuset *cp)
848 {
849 	struct cpuset *cp_tofree = NULL;
850 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
851 	int w = -1;
852 
853 	if (!cp1)
854 		goto err;
855 	if (cp1->cpus == NULL) {
856 		errno = EINVAL;
857 		goto err;
858 	}
859 	w = bitmask_weight(cp1->cpus);
860 	/* fall into ... */
861 err:
862 	cpuset_free(cp_tofree);
863 	return w;
864 }
865 
866 /* Return number of Memory Nodes in cpuset cp (current task if cp == NULL) */
cpuset_mems_weight(const struct cpuset * cp)867 int cpuset_mems_weight(const struct cpuset *cp)
868 {
869 	struct cpuset *cp_tofree = NULL;
870 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
871 	int w = -1;
872 
873 	if (!cp1)
874 		goto err;
875 	if (cp1->mems == NULL) {
876 		errno = EINVAL;
877 		goto err;
878 	}
879 	w = bitmask_weight(cp1->mems);
880 	/* fall into ... */
881 err:
882 	cpuset_free(cp_tofree);
883 	return w;
884 }
885 
886 /* Return integer value of option optname in cp */
cpuset_get_iopt(const struct cpuset * cp,const char * optionname)887 int cpuset_get_iopt(const struct cpuset *cp, const char *optionname)
888 {
889 	if (streq(optionname, "cpu_exclusive"))
890 		return cp->cpu_exclusive;
891 	else if (streq(optionname, "mem_exclusive"))
892 		return cp->mem_exclusive;
893 	else if (streq(optionname, "mem_hardwall"))
894 		return cp->mem_hardwall;
895 	else if (streq(optionname, "notify_on_release"))
896 		return cp->notify_on_release;
897 	else if (streq(optionname, "memory_pressure_enabled"))
898 		return cp->memory_pressure_enabled;
899 	else if (streq(optionname, "memory_migrate"))
900 		return cp->memory_migrate;
901 	else if (streq(optionname, "memory_spread_page"))
902 		return cp->memory_spread_page;
903 	else if (streq(optionname, "memory_spread_slab"))
904 		return cp->memory_spread_slab;
905 	else if (streq(optionname, "sched_load_balance"))
906 		return cp->sched_load_balance;
907 	else if (streq(optionname, "sched_relax_domain_level"))
908 		return cp->sched_relax_domain_level;
909 	else
910 		return -2;	/* optionname not recognized */
911 }
912 
913 /* [optional] Return string value of optname */
/*
 * String-valued option getter.  No string options exist yet, so every
 * call returns NULL.
 */
const char *cpuset_get_sopt(UNUSED const struct cpuset *cp,
			    UNUSED const char *optionname)
{
	const char *unrecognized = NULL;

	return unrecognized;
}
919 
read_flag(const char * filepath,char * flagp)920 static int read_flag(const char *filepath, char *flagp)
921 {
922 	char buf[SMALL_BUFSZ];	/* buffer a "0" or "1" flag line */
923 	int fd = -1;
924 
925 	if ((fd = open(filepath, O_RDONLY)) < 0)
926 		goto err;
927 	if (read(fd, buf, sizeof(buf)) < 1)
928 		goto err;
929 	if (atoi(buf))
930 		*flagp = 1;
931 	else
932 		*flagp = 0;
933 	close(fd);
934 	return 0;
935 err:
936 	if (fd >= 0)
937 		close(fd);
938 	return -1;
939 }
940 
load_flag(const char * path,char * flagp,const char * flag)941 static int load_flag(const char *path, char *flagp, const char *flag)
942 {
943 	char buf[PATH_MAX];
944 
945 	pathcat2(buf, sizeof(buf), path, flag);
946 	return read_flag(buf, flagp);
947 }
948 
read_number(const char * filepath,int * numberp)949 static int read_number(const char *filepath, int *numberp)
950 {
951 	char buf[SMALL_BUFSZ];
952 	int fd = -1;
953 
954 	if ((fd = open(filepath, O_RDONLY)) < 0)
955 		goto err;
956 	if (read(fd, buf, sizeof(buf)) < 1)
957 		goto err;
958 	*numberp = atoi(buf);
959 	close(fd);
960 	return 0;
961 err:
962 	if (fd >= 0)
963 		close(fd);
964 	return -1;
965 }
966 
load_number(const char * path,int * numberp,const char * file)967 static int load_number(const char *path, int *numberp, const char *file)
968 {
969 	char buf[PATH_MAX];
970 
971 	pathcat2(buf, sizeof(buf), path, file);
972 	return read_number(buf, numberp);
973 }
974 
/*
 * Read the first line of 'filepath' and parse it, with
 * bitmask_parselist(), into a newly allocated bitmask of nbits bits.
 * An empty first line yields the bitmask exactly as bitmask_alloc()
 * returned it.  On success any bitmask previously held in *bmpp is
 * freed, *bmpp takes the new one, and 0 is returned.  On any failure
 * -1 is returned and *bmpp is left untouched.
 */
static int read_mask(const char *filepath, struct bitmask **bmpp, int nbits)
{
	struct bitmask *parsed = NULL;
	char *line = NULL;
	FILE *in;
	int linesz;
	int rc = -1;

	in = fopen(filepath, "r");
	if (in == NULL)
		return -1;

	linesz = filesize(in) + 1;	/* + 1 for nul term */
	line = malloc(linesz);
	if (line == NULL || flgets(line, linesz, in) == NULL) {
		fclose(in);
		goto out;
	}
	fclose(in);

	parsed = bitmask_alloc(nbits);
	if (parsed == NULL)
		goto out;
	if (*line != '\0' && bitmask_parselist(line, parsed) < 0)
		goto out;

	/* Success: hand the new mask over to the caller. */
	if (*bmpp != NULL)
		bitmask_free(*bmpp);
	*bmpp = parsed;
	parsed = NULL;
	rc = 0;
out:
	if (line != NULL)
		free(line);
	if (parsed != NULL)
		bitmask_free(parsed);
	return rc;
}
1011 
load_mask(const char * path,struct bitmask ** bmpp,int nbits,const char * mask)1012 static int load_mask(const char *path, struct bitmask **bmpp,
1013 		     int nbits, const char *mask)
1014 {
1015 	char buf[PATH_MAX];
1016 
1017 	pathcat2(buf, sizeof(buf), path, mask);
1018 	return read_mask(buf, bmpp, nbits);
1019 }
1020 
1021 /* Write string to file at given filepath.  Create the file if it is missing; existing contents are not truncated. */
/*
 * Open 'filepath' write-only (creating it with mode 0644 if needed)
 * and write the bytes of 'str' to it, without the nul.
 * Returns 0 on success, -1 if open() fails or write() reports an error.
 */
static int write_string_file(const char *filepath, const char *str)
{
	int fd = open(filepath, O_WRONLY | O_CREAT, 0644);
	int rc;

	if (fd < 0)
		return -1;

	rc = (write(fd, str, strlen(str)) < 0) ? -1 : 0;
	close(fd);
	return rc;
}
1037 
1038 /* Size and allocate buffer.  Write bitmask into it.  Caller must free */
/*
 * Return a malloc'd string holding the bitmask_displaylist() rendering
 * of bmp, or NULL if the allocation fails.  The caller must free it.
 */
static char *sprint_mask_buf(const struct bitmask *bmp)
{
	char probe;
	char *text;
	int need;

	/* A first pass into a 1-byte buffer only measures the length. */
	need = bitmask_displaylist(&probe, 1, bmp) + 1;	/* "+ 1" for nul */

	text = malloc(need);
	if (text != NULL)
		bitmask_displaylist(text, need, bmp);
	return text;
}
1052 
exists_flag(const char * path,const char * flag)1053 static int exists_flag(const char *path, const char *flag)
1054 {
1055 	char buf[PATH_MAX];
1056 	struct stat statbuf;
1057 	int rc;
1058 
1059 	pathcat2(buf, sizeof(buf), path, flag);
1060 	rc = (stat(buf, &statbuf) == 0);
1061 	errno = 0;
1062 	return rc;
1063 }
1064 
store_flag(const char * path,const char * flag,int val)1065 static int store_flag(const char *path, const char *flag, int val)
1066 {
1067 	char buf[PATH_MAX];
1068 
1069 	pathcat2(buf, sizeof(buf), path, flag);
1070 	return write_string_file(buf, val ? "1" : "0");
1071 }
1072 
store_number(const char * path,const char * file,int val)1073 static int store_number(const char *path, const char *file, int val)
1074 {
1075 	char buf[PATH_MAX];
1076 	char data[SMALL_BUFSZ];
1077 
1078 	memset(data, 0, sizeof(data));
1079 	pathcat2(buf, sizeof(buf), path, file);
1080 	snprintf(data, sizeof(data), "%d", val);
1081 	return write_string_file(buf, data);
1082 }
1083 
store_mask(const char * path,const char * mask,const struct bitmask * bmp)1084 static int store_mask(const char *path, const char *mask,
1085 		      const struct bitmask *bmp)
1086 {
1087 	char maskpath[PATH_MAX];
1088 	char *bp = NULL;
1089 	int rc;
1090 
1091 	if (bmp == NULL)
1092 		return 0;
1093 	pathcat2(maskpath, sizeof(maskpath), path, mask);
1094 	if ((bp = sprint_mask_buf(bmp)) == NULL)
1095 		return -1;
1096 	rc = write_string_file(maskpath, bp);
1097 	free(bp);
1098 	return rc;
1099 }
1100 
1101 /*
1102  * Return 1 if 'cpu' is online, else 0 if offline.  Tests the file
1103  * /sys/devices/system/cpu/cpuN/online file for 0 or 1 contents
1104  * were N == cpu number.
1105  */
1106 
cpu_online(unsigned int cpu)1107 char cpu_online(unsigned int cpu)
1108 {
1109 	char online;
1110 	char cpupath[PATH_MAX];
1111 
1112 	(void)snprintf(cpupath, sizeof(cpupath),
1113 		       "/sys/devices/system/cpu/cpu%d/online", cpu);
1114 	if (read_flag(cpupath, &online) < 0)
1115 		return 0;	/* oops - guess that cpu's not there */
1116 	return online;
1117 }
1118 
1119 /*
1120  * The cpunodemap maps each cpu in [0 ... cpuset_cpus_nbits()),
1121  * to the node on which that cpu resides or cpuset_mems_nbits().
1122  *
1123  * To avoid every user having to recalculate this relation
1124  * from various clues in the sysfs file system (below the
1125  * path /sys/devices/system) a copy of this map is kept at
1126  * /var/run/cpunodemap.
1127  *
1128  * The system automatically cleans out files below
1129  * /var/run on each system reboot (see the init script
1130  * /etc/rc.d/boot.d/S*boot.localnet), so we don't have to worry
1131  * about stale data in this file across reboots.  If the file
1132  * is missing, let the first process that needs it, and has
1133  * permission to write in the /var/run directory, rebuild it.
1134  *
1135  * If using this cached data, remember the mtime of the mapfile
1136  * the last time we read it in case something like a hotplug
1137  * event results in the file being removed and rebuilt, so we
1138  * can detect if we're using a stale cache, and need to reload.
1139  *
 * The mtime of this file is set to the time when we did
 * the recalculation of the map, from the clues beneath
 * /sys/devices/system.  This is done so that a program
 * won't see the mapfile it just wrote out (store_map) as being
 * newer than its own in-memory map, and read the same map back
 * in (load_map).
1146  */
1147 
1148 /*
1149  * Hold flockfile(stdin) while using cpunodemap for posix thread safety.
1150  *
1151  * Note on locking and flockfile(FILE *):
1152  *
1153  *  We use flockfile() and funlockfile() instead of directly
1154  *  calling pthread_mutex_lock and pthread_mutex_unlock on
1155  *  a pthread_mutex_t, because this avoids forcing the app
1156  *  to link with libpthread.  The glibc implementation of
1157  *  flockfile/funlockfile will fall back to no-ops if libpthread
1158  *  doesn't happen to be linked.
1159  *
1160  *  Since flockfile already has the moderately convoluted
1161  *  combination of weak and strong symbols required to accomplish
1162  *  this, it is easier to use flockfile() on some handy FILE *
 *  stream as a surrogate for pthread locking than it is to
 *  re-invent that wheel.
1165  *
1166  *  Forcing all apps that use cpusets to link with libpthread
1167  *  would force non-transparent initialization on apps that
1168  *  might not be prepared to handle it.
1169  *
1170  *  The application using libcpuset should never notice this
1171  *  odd use of flockfile(), because we never return to the
1172  *  application from any libcpuset call with any such lock held.
1173  *  We just use this locking for guarding some non-atomic cached
1174  *  data updates and accesses, internal to some libcpuset calls.
1175  *  Also, flockfile() allows recursive nesting, so if the app
1176  *  calls libcpuset holding such a file lock, we won't deadlock
1177  *  if we go to acquire the same lock.  We'll just get the lock
1178  *  and increment its counter while we hold it.
1179  */
1180 
/*
 * In-memory copy of the <cpu, node> map, together with the mapfile
 * modification time it corresponds to.  get_map() compares 'mtime'
 * against the mapfile's mtime to decide whether to rebuild, load or
 * store the map; rebuild_map() sets 'mtime' to the current time.
 */
static struct cpunodemap {
	int *map;		/* map[cpumask_sz]: maps cpu to its node */
	time_t mtime;		/* modtime of mapfile when last read */
} cpunodemap;
1185 
1186 /*
1187  * rebuild_map() - Rebuild cpunodemap[] from scratch.
1188  *
1189  * Situation:
1190  *	Neither our in-memory cpunodemap[] array nor the
1191  *	cache of it in mapfile is current.
1192  * Action:
1193  *	Rebuild it from first principles and the information
1194  *	available below /sys/devices/system.
1195  */
1196 
rebuild_map(void)1197 static void rebuild_map(void)
1198 {
1199 	char buf[PATH_MAX];
1200 	DIR *dir1, *dir2;
1201 	struct dirent *dent1, *dent2;
1202 	int ncpus = cpuset_cpus_nbits();
1203 	int nmems = cpuset_mems_nbits();
1204 	unsigned int cpu, mem;
1205 
1206 	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++)
1207 		cpunodemap.map[cpu] = -1;
1208 	pathcat2(buf, sizeof(buf), sysdevices, "node");
1209 	if ((dir1 = opendir(buf)) == NULL)
1210 		return;
1211 	while ((dent1 = readdir(dir1)) != NULL) {
1212 		if (sscanf(dent1->d_name, "node%u", &mem) < 1)
1213 			continue;
1214 		pathcat3(buf, sizeof(buf), sysdevices, "node", dent1->d_name);
1215 		if ((dir2 = opendir(buf)) == NULL)
1216 			continue;
1217 		while ((dent2 = readdir(dir2)) != NULL) {
1218 			if (sscanf(dent2->d_name, "cpu%u", &cpu) < 1)
1219 				continue;
1220 			if (cpu >= (unsigned int)ncpus
1221 			    || mem >= (unsigned int)nmems)
1222 				continue;
1223 			cpunodemap.map[cpu] = mem;
1224 		}
1225 		closedir(dir2);
1226 	}
1227 	closedir(dir1);
1228 	cpunodemap.mtime = time(0);
1229 }
1230 
1231 /*
1232  * load_map() - Load cpunodemap[] from mapfile.
1233  *
1234  * Situation:
1235  *	The cpunodemap in mapfile is more recent than
1236  *	what we have in the cpunodemap[] array.
1237  * Action:
1238  *	Reload the cpunodemap[] array from the file.
1239  */
1240 
load_map(void)1241 static void load_map(void)
1242 {
1243 	char buf[SMALL_BUFSZ];	/* buffer 1 line of mapfile */
1244 	FILE *mapfp;		/* File stream on mapfile */
1245 	int ncpus = cpuset_cpus_nbits();
1246 	int nmems = cpuset_mems_nbits();
1247 	unsigned int cpu, mem;
1248 
1249 	if ((cpunodemap.map = calloc(ncpus, sizeof(int))) == NULL)
1250 		return;
1251 	cpunodemap.mtime = get_mtime(mapfile);
1252 	if ((mapfp = fopen(mapfile, "r")) == NULL)
1253 		return;
1254 	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++)
1255 		cpunodemap.map[cpu] = nmems;
1256 	while (flgets(buf, sizeof(buf), mapfp) != NULL) {
1257 		if (sscanf(buf, "%u %u", &cpu, &mem) < 2)
1258 			continue;
1259 		if (cpu >= (unsigned int)ncpus || mem >= (unsigned int)nmems)
1260 			continue;
1261 		cpunodemap.map[cpu] = mem;
1262 	}
1263 	fclose(mapfp);
1264 }
1265 
1266 /*
1267  * store_map() - Write cpunodemap[] out to mapfile.
1268  *
1269  * Situation:
1270  *	The cpunodemap in the cpunodemap[] array is
1271  *	more recent than the one in mapfile.
1272  * Action:
1273  *	Write cpunodemap[] out to mapfile.
1274  */
1275 
store_map(void)1276 static void store_map(void)
1277 {
1278 	char buf[PATH_MAX];
1279 	int fd = -1;
1280 	FILE *mapfp = NULL;
1281 	int ncpus = cpuset_cpus_nbits();
1282 	int nmems = cpuset_mems_nbits();
1283 	unsigned int cpu, mem;
1284 
1285 	snprintf(buf, sizeof(buf), "%s.%s", mapfile, "XXXXXX");
1286 	if ((fd = mkstemp(buf)) < 0)
1287 		goto err;
1288 	if ((mapfp = fdopen(fd, "w")) == NULL)
1289 		goto err;
1290 	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) {
1291 		mem = cpunodemap.map[cpu];
1292 		if (mem < (unsigned int)nmems)
1293 			fprintf(mapfp, "%u %u\n", cpu, mem);
1294 	}
1295 	fclose(mapfp);
1296 	set_mtime(buf, cpunodemap.mtime);
1297 	if (rename(buf, mapfile) < 0)
1298 		goto err;
1299 	/* mkstemp() creates mode 0600 - change to world readable */
1300 	(void)chmod(mapfile, 0444);
1301 	return;
1302 err:
1303 	if (mapfp != NULL) {
1304 		fclose(mapfp);
1305 		fd = -1;
1306 	}
1307 	if (fd >= 0)
1308 		close(fd);
1309 	(void)unlink(buf);
1310 }
1311 
1312 /*
1313  * Load and gain thread safe access to the <cpu, node> map.
1314  *
1315  * Return 0 on success with flockfile(stdin) held.
1316  * Each successful get_map() call must be matched with a
1317  * following put_map() call to release the lock.
1318  *
1319  * On error, return -1 with errno set and no lock held.
1320  */
1321 
get_map(void)1322 static int get_map(void)
1323 {
1324 	time_t file_mtime;
1325 
1326 	flockfile(stdin);
1327 
1328 	if (cpunodemap.map == NULL) {
1329 		cpunodemap.map = calloc(cpuset_cpus_nbits(), sizeof(int));
1330 		if (cpunodemap.map == NULL)
1331 			goto err;
1332 	}
1333 
1334 	/* If no one has a good cpunodemap, rebuild from scratch */
1335 	file_mtime = get_mtime(mapfile);
1336 	if (cpunodemap.mtime == 0 && file_mtime == 0)
1337 		rebuild_map();
1338 
1339 	/* If either cpunodemap[] or mapfile newer, update other with it */
1340 	file_mtime = get_mtime(mapfile);
1341 	if (cpunodemap.mtime < file_mtime)
1342 		load_map();
1343 	else if (cpunodemap.mtime > file_mtime)
1344 		store_map();
1345 	return 0;
1346 err:
1347 	funlockfile(stdin);
1348 	return -1;
1349 }
1350 
/* Release the flockfile(stdin) lock held since a successful get_map() */
static void put_map(void)
{
	funlockfile(stdin);
}
1355 
1356 /* Set cpus to those local to Memory Nodes mems */
cpuset_localcpus(const struct bitmask * mems,struct bitmask * cpus)1357 int cpuset_localcpus(const struct bitmask *mems, struct bitmask *cpus)
1358 {
1359 	int ncpus = cpuset_cpus_nbits();
1360 	unsigned int cpu;
1361 
1362 	if (check() < 0)
1363 		return -1;
1364 
1365 	get_map();
1366 	bitmask_clearall(cpus);
1367 	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) {
1368 		if (bitmask_isbitset(mems, cpunodemap.map[cpu]))
1369 			bitmask_setbit(cpus, cpu);
1370 	}
1371 	put_map();
1372 	return 0;
1373 }
1374 
1375 /* Set mems to those local to CPUs cpus */
cpuset_localmems(const struct bitmask * cpus,struct bitmask * mems)1376 int cpuset_localmems(const struct bitmask *cpus, struct bitmask *mems)
1377 {
1378 	int ncpus = cpuset_cpus_nbits();
1379 	unsigned int cpu;
1380 
1381 	if (check() < 0)
1382 		return -1;
1383 
1384 	get_map();
1385 	bitmask_clearall(mems);
1386 	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) {
1387 		if (bitmask_isbitset(cpus, cpu))
1388 			bitmask_setbit(mems, cpunodemap.map[cpu]);
1389 	}
1390 	put_map();
1391 	return 0;
1392 }
1393 
1394 /*
1395  * distmap[]
1396  *
1397  * Array of ints of size cpumask_sz by nodemask_sz.
1398  *
1399  * Element distmap[cpu][mem] is the distance between CPU cpu
1400  * and Memory Node mem.  Distances are weighted to roughly
1401  * approximate the cost of memory references, and scaled so that
1402  * the distance from a CPU to its local Memory Node is ten (10).
1403  *
1404  * The first call to cpuset_cpumemdist() builds this map, from
1405  * whatever means the kernel provides to obtain these distances.
1406  *
1407  * These distances derive from ACPI SLIT table entries, which are
1408  * eight bits in size.
1409  *
1410  * Hold flockfile(stdout) while using distmap for posix thread safety.
1411  */
1412 
typedef unsigned char distmap_entry_t;	/* type of distmap[] entries */

/* NULL until a build_distmap*() routine has allocated and filled it */
static distmap_entry_t *distmap;	/* maps <cpu, mem> to distance */

#define DISTMAP_MAX UCHAR_MAX	/* maximum value in distmap[] */

/*
 * Flat offset of element [i][j] in distmap[].  The expansion refers to
 * 'nmems', so I() may only be used where a variable of that name is in
 * scope.
 */
#define I(i,j) ((i) * nmems + (j))	/* 2-D array index simulation */
1420 
1421 /*
1422  * Parse arch neutral lines from 'distance' files of form:
1423  *
1424  *	46 66 10 20
1425  *
1426  * The lines contain a space separated list of distances, which is parsed
1427  * into array dists[] of each nodes distance from the specified node.
1428  *
1429  * Result is placed in distmap[ncpus][nmems]:
1430  *
1431  *	For each cpu c on node:
1432  *		For each node position n in list of distances:
1433  *			distmap[c][n] = dists[n]
1434  */
1435 
parse_distmap_line(unsigned int node,char * buf)1436 static int parse_distmap_line(unsigned int node, char *buf)
1437 {
1438 	char *p, *q;
1439 	int ncpus = cpuset_cpus_nbits();
1440 	int nmems = cpuset_mems_nbits();
1441 	unsigned int c, n;
1442 	distmap_entry_t *dists = NULL;
1443 	struct bitmask *cpus = NULL, *mems = NULL;
1444 	int ret = -1;
1445 
1446 	p = buf;
1447 	if ((dists = calloc(nmems, sizeof(*dists))) == NULL)
1448 		goto err;
1449 	for (n = 0; n < (unsigned int)nmems; n++)
1450 		dists[n] = DISTMAP_MAX;
1451 
1452 	for (n = 0; n < (unsigned int)nmems && *p; n++, p = q) {
1453 		unsigned int d;
1454 
1455 		if ((p = strpbrk(p, "0123456789")) == NULL)
1456 			break;
1457 		d = strtoul(p, &q, 10);
1458 		if (p == q)
1459 			break;
1460 		if (d < DISTMAP_MAX)
1461 			dists[n] = (distmap_entry_t) d;
1462 	}
1463 
1464 	if ((mems = bitmask_alloc(nmems)) == NULL)
1465 		goto err;
1466 	bitmask_setbit(mems, node);
1467 
1468 	if ((cpus = bitmask_alloc(ncpus)) == NULL)
1469 		goto err;
1470 	cpuset_localcpus(mems, cpus);
1471 
1472 	for (c = bitmask_first(cpus); c < (unsigned int)ncpus;
1473 	     c = bitmask_next(cpus, c + 1))
1474 		for (n = 0; n < (unsigned int)nmems; n++)
1475 			distmap[I(c, n)] = dists[n];
1476 	ret = 0;
1477 	/* fall into ... */
1478 err:
1479 	bitmask_free(mems);
1480 	bitmask_free(cpus);
1481 	free(dists);
1482 	return ret;
1483 }
1484 
/*
 * Read the first line of the distance file at 'path' and feed it to
 * parse_distmap_line() for 'node'.
 *
 * Returns 0 on success, or -1 if the file cannot be opened, sized or
 * read, the line buffer cannot be allocated, or parsing fails.
 */
static int parse_distance_file(unsigned int node, const char *path)
{
	FILE *fp;
	char *buf = NULL;
	int buflen;

	if ((fp = fopen(path, "r")) == NULL)
		goto err;

	/*
	 * NOTE(review): filesize() is taken to return the length of the
	 * file's contents - confirm.  One extra byte is allocated so
	 * there is always room for the terminating nul, and so an empty
	 * file never leads to a zero sized allocation.
	 */
	buflen = filesize(fp);
	if (buflen < 0 || buflen == INT_MAX)
		goto err;
	buflen++;

	if ((buf = malloc(buflen)) == NULL)
		goto err;

	if (flgets(buf, buflen, fp) == NULL)
		goto err;

	if (parse_distmap_line(node, buf) < 0)
		goto err;

	free(buf);
	fclose(fp);
	return 0;
err:
	free(buf);
	if (fp)
		fclose(fp);
	return -1;
}
1514 
build_distmap(void)1515 static void build_distmap(void)
1516 {
1517 	static int tried_before = 0;
1518 	int ncpus = cpuset_cpus_nbits();
1519 	int nmems = cpuset_mems_nbits();
1520 	int c, m;
1521 	DIR *dir = NULL;
1522 	struct dirent *dent;
1523 
1524 	if (tried_before)
1525 		goto err;
1526 	tried_before = 1;
1527 
1528 	if ((distmap = calloc(ncpus * nmems, sizeof(*distmap))) == NULL)
1529 		goto err;
1530 
1531 	for (c = 0; c < ncpus; c++)
1532 		for (m = 0; m < nmems; m++)
1533 			distmap[I(c, m)] = DISTMAP_MAX;
1534 
1535 	if ((dir = opendir(distance_directory)) == NULL)
1536 		goto err;
1537 	while ((dent = readdir(dir)) != NULL) {
1538 		char buf[PATH_MAX];
1539 		unsigned int node;
1540 
1541 		if (sscanf(dent->d_name, "node%u", &node) < 1)
1542 			continue;
1543 		pathcat3(buf, sizeof(buf), distance_directory, dent->d_name,
1544 			 "distance");
1545 		if (parse_distance_file(node, buf) < 0)
1546 			goto err;
1547 	}
1548 	closedir(dir);
1549 	return;
1550 err:
1551 	if (dir)
1552 		closedir(dir);
1553 	free(distmap);
1554 	distmap = NULL;
1555 }
1556 
1557 #ifdef ALTERNATE_SN_DISTMAP
1558 
1559 /*
1560  * Parse SN architecture specific line of form:
1561  *
1562  *	node 3 001c14#1 local asic SHub_1.1, nasid 0x6, dist 66:46:20:10
1563  *
1564  * Second field is node number.  The "dist" field is the colon separated list
1565  * of distances, which is parsed into array dists[] of each nodes distance
1566  * from that node.
1567  *
1568  * Result is placed in distmap[ncpus][nmems]:
1569  *
1570  *	For each cpu c on that node:
1571  *		For each node position n in list of distances:
1572  *			distmap[c][n] = dists[n]
1573  */
1574 
parse_distmap_line_sn(char * buf)1575 static void parse_distmap_line_sn(char *buf)
1576 {
1577 	char *p, *pend, *q;
1578 	int ncpus = cpuset_cpus_nbits();
1579 	int nmems = cpuset_mems_nbits();
1580 	unsigned long c, n, node;
1581 	distmap_entry_t *dists = NULL;
1582 	struct bitmask *cpus = NULL, *mems = NULL;
1583 
1584 	if ((p = strchr(buf, ' ')) == NULL)
1585 		goto err;
1586 	if ((node = strtoul(p, &q, 10)) >= (unsigned int)nmems)
1587 		goto err;
1588 	if ((p = strstr(q, " dist ")) == NULL)
1589 		goto err;
1590 	p += strlen(" dist ");
1591 	if ((pend = strchr(p, ' ')) != NULL)
1592 		*pend = '\0';
1593 	if ((dists = calloc(nmems, sizeof(*dists))) == NULL)
1594 		goto err;
1595 	for (n = 0; n < (unsigned int)nmems; n++)
1596 		dists[n] = DISTMAP_MAX;
1597 
1598 	for (n = 0; n < (unsigned int)nmems && *p; n++, p = q) {
1599 		unsigned long d;
1600 
1601 		if ((p = strpbrk(p, "0123456789")) == NULL)
1602 			break;
1603 		d = strtoul(p, &q, 10);
1604 		if (p == q)
1605 			break;
1606 		if (d < DISTMAP_MAX)
1607 			dists[n] = (distmap_entry_t) d;
1608 	}
1609 
1610 	if ((mems = bitmask_alloc(nmems)) == NULL)
1611 		goto err;
1612 	bitmask_setbit(mems, node);
1613 
1614 	if ((cpus = bitmask_alloc(ncpus)) == NULL)
1615 		goto err;
1616 	cpuset_localcpus(mems, cpus);
1617 
1618 	for (c = bitmask_first(cpus); c < (unsigned int)ncpus;
1619 	     c = bitmask_next(cpus, c + 1))
1620 		for (n = 0; n < (unsigned int)nmems; n++)
1621 			distmap[I(c, n)] = dists[n];
1622 	/* fall into ... */
1623 err:
1624 	bitmask_free(mems);
1625 	bitmask_free(cpus);
1626 	free(dists);
1627 }
1628 
build_distmap_sn(void)1629 static void build_distmap_sn(void)
1630 {
1631 	int ncpus = cpuset_cpus_nbits();
1632 	int nmems = cpuset_mems_nbits();
1633 	int c, m;
1634 	static int tried_before = 0;
1635 	FILE *fp = NULL;
1636 	char *buf = NULL;
1637 	int buflen;
1638 
1639 	if (tried_before)
1640 		goto err;
1641 	tried_before = 1;
1642 
1643 	if ((fp = fopen(sn_topology, "r")) == NULL)
1644 		goto err;
1645 
1646 	if ((distmap = calloc(ncpus * nmems, sizeof(*distmap))) == NULL)
1647 		goto err;
1648 
1649 	for (c = 0; c < ncpus; c++)
1650 		for (m = 0; m < nmems; m++)
1651 			distmap[I(c, m)] = DISTMAP_MAX;
1652 
1653 	buflen = filesize(fp);
1654 	if ((buf = malloc(buflen)) == NULL)
1655 		goto err;
1656 
1657 	while (flgets(buf, buflen, fp) != NULL)
1658 		if (strprefix(buf, sn_top_node_prefix))
1659 			parse_distmap_line_sn(buf);
1660 
1661 	free(buf);
1662 	fclose(fp);
1663 	return;
1664 err:
1665 	free(buf);
1666 	free(distmap);
1667 	distmap = NULL;
1668 	if (fp)
1669 		fclose(fp);
1670 }
1671 
1672 #endif
1673 
1674 /* [optional] Hardware distance from CPU to Memory Node */
cpuset_cpumemdist(int cpu,int mem)1675 unsigned int cpuset_cpumemdist(int cpu, int mem)
1676 {
1677 	int ncpus = cpuset_cpus_nbits();
1678 	int nmems = cpuset_mems_nbits();
1679 	distmap_entry_t r = DISTMAP_MAX;
1680 
1681 	flockfile(stdout);
1682 
1683 	if (check() < 0)
1684 		goto err;
1685 
1686 	if (distmap == NULL)
1687 		build_distmap();
1688 
1689 #ifdef ALTERNATE_SN_DISTMAP
1690 	if (distmap == NULL)
1691 		build_distmap_sn();
1692 #endif
1693 
1694 	if (distmap == NULL)
1695 		goto err;
1696 
1697 	if (cpu < 0 || cpu >= ncpus || mem < 0 || mem >= nmems)
1698 		goto err;
1699 
1700 	r = distmap[I(cpu, mem)];
1701 	/* fall into ... */
1702 err:
1703 	funlockfile(stdout);
1704 	return r;
1705 }
1706 
/*
 * [optional] Return Memory Node closest to cpu
 *
 * Builds a mask holding just 'cpu', asks cpuset_localmems() for the
 * nodes local to it, and returns bitmask_first() of that result.
 * Returns -1 if check() fails or a mask cannot be allocated.
 */
int cpuset_cpu2node(int cpu)
{
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	struct bitmask *one_cpu = NULL;
	struct bitmask *local_mems = NULL;
	int node = -1;

	if (check() < 0)
		goto out;

	one_cpu = bitmask_alloc(ncpus);
	if (one_cpu == NULL)
		goto out;
	bitmask_setbit(one_cpu, cpu);

	local_mems = bitmask_alloc(nmems);
	if (local_mems == NULL)
		goto out;
	cpuset_localmems(one_cpu, local_mems);
	node = bitmask_first(local_mems);
	/* fall into ... */
out:
	bitmask_free(one_cpu);
	bitmask_free(local_mems);
	return node;
}
1732 
apply_cpuset_settings(const char * path,const struct cpuset * cp)1733 static int apply_cpuset_settings(const char *path, const struct cpuset *cp)
1734 {
1735 	if (cp->cpu_exclusive_valid && cp->cpu_exclusive_dirty) {
1736 		if (store_flag(path, "cpu_exclusive", cp->cpu_exclusive) < 0)
1737 			goto err;
1738 	}
1739 
1740 	if (cp->mem_exclusive_valid && cp->mem_exclusive_dirty) {
1741 		if (store_flag(path, "mem_exclusive", cp->mem_exclusive) < 0)
1742 			goto err;
1743 	}
1744 
1745 	if (cp->mem_hardwall_valid && cp->mem_hardwall_dirty) {
1746 		if (store_flag(path, "mem_hardwall", cp->mem_hardwall) < 0)
1747 			goto err;
1748 	}
1749 
1750 	if (cp->notify_on_release_valid && cp->notify_on_release_dirty) {
1751 		if (store_flag(path, "notify_on_release", cp->notify_on_release)
1752 		    < 0)
1753 			goto err;
1754 	}
1755 
1756 	if (cp->memory_migrate_valid &&
1757 	    cp->memory_migrate_dirty && exists_flag(path, "memory_migrate")) {
1758 		if (store_flag(path, "memory_migrate", cp->memory_migrate) < 0)
1759 			goto err;
1760 	}
1761 
1762 	if (cp->memory_pressure_enabled_valid &&
1763 	    cp->memory_pressure_enabled_dirty &&
1764 	    exists_flag(path, "memory_pressure_enabled")) {
1765 		if (store_flag
1766 		    (path, "memory_pressure_enabled",
1767 		     cp->memory_pressure_enabled) < 0)
1768 			goto err;
1769 	}
1770 
1771 	if (cp->memory_spread_page_valid &&
1772 	    cp->memory_spread_page_dirty &&
1773 	    exists_flag(path, "memory_spread_page")) {
1774 		if (store_flag
1775 		    (path, "memory_spread_page", cp->memory_spread_page) < 0)
1776 			goto err;
1777 	}
1778 
1779 	if (cp->memory_spread_slab_valid &&
1780 	    cp->memory_spread_slab_dirty &&
1781 	    exists_flag(path, "memory_spread_slab")) {
1782 		if (store_flag
1783 		    (path, "memory_spread_slab", cp->memory_spread_slab) < 0)
1784 			goto err;
1785 	}
1786 
1787 	if (cp->sched_load_balance_valid &&
1788 	    cp->sched_load_balance_dirty &&
1789 	    exists_flag(path, "sched_load_balance")) {
1790 		if (store_flag
1791 		    (path, "sched_load_balance", cp->sched_load_balance) < 0)
1792 			goto err;
1793 	}
1794 
1795 	if (cp->sched_relax_domain_level_valid &&
1796 	    cp->sched_relax_domain_level_dirty &&
1797 	    exists_flag(path, "sched_relax_domain_level")) {
1798 		if (store_number
1799 		    (path, "sched_relax_domain_level",
1800 		     cp->sched_relax_domain_level) < 0)
1801 			goto err;
1802 	}
1803 
1804 	if (cp->cpus_valid && cp->cpus_dirty) {
1805 		if (store_mask(path, "cpus", cp->cpus) < 0)
1806 			goto err;
1807 	}
1808 
1809 	if (cp->mems_valid && cp->mems_dirty) {
1810 		if (store_mask(path, "mems", cp->mems) < 0)
1811 			goto err;
1812 	}
1813 	return 0;
1814 err:
1815 	return -1;
1816 }
1817 
1818 /*
1819  * get_siblings() - helper routine for cpuset_would_crash_kernel(), below.
1820  *
1821  * Extract max value of any 'siblings' field in /proc/cpuinfo.
1822  * Cache the result - only need to extract once in lifetime of task.
1823  *
1824  * The siblings field is the number of logical CPUs in a physical
1825  * processor package.  It is equal to the product of the number of
1826  * cores in that package, times the number of hyper-threads per core.
1827  * The bug that cpuset_would_crash_kernel() is detecting arises
1828  * when a cpu_exclusive cpuset tries to include just some, not all,
1829  * of the sibling logical CPUs available in a processor package.
1830  *
1831  * In the improbable case that a system has mixed values of siblings
1832  * (some processor packages have more than others, perhaps due to
1833  * partially enabling Hyper-Threading), we take the worse case value,
1834  * the largest siblings value.  This might be overkill.  I don't know
1835  * if this kernel bug considers each processor package's siblings
1836  * separately or not.  But it sure is easier this way ...
1837  *
 * This routine takes about 0.7 msecs on a 4 CPU 2.8 GHz Xeon, from
 * open to close, the first time called.
1840  */
1841 
/*
 * Return the largest 'siblings' value found in /proc/cpuinfo, or 1 if
 * the file has no such line.  The answer is cached in a static, so the
 * file is scanned at most once per successful open.  If /proc/cpuinfo
 * cannot be opened, 4 is returned and nothing is cached.
 */
static int get_siblings(void)
{
	static int siblings;	/* cached answer; 0 until computed */
	char line[32];		/* big enough for one 'siblings' line */
	FILE *cpuinfo;
	int val;

	if (siblings != 0)
		return siblings;

	cpuinfo = fopen("/proc/cpuinfo", "r");
	if (cpuinfo == NULL)
		return 4;	/* wing it - /proc not mounted ? */

	while (flgets(line, sizeof(line), cpuinfo) != NULL) {
		if (sscanf(line, "siblings : %d", &val) >= 1 &&
		    val > siblings)
			siblings = val;
	}
	fclose(cpuinfo);

	if (siblings == 0)
		siblings = 1;	/* old kernel, no siblings, default to 1 */
	return siblings;
}
1866 
1867 /*
1868  * Some 2.6.16 and 2.6.17 kernel versions have a bug in the dynamic
1869  * scheduler domain code invoked for cpu_exclusive cpusets that causes
1870  * the kernel to freeze, requiring a hardware reset.
1871  *
1872  * On kernels built with CONFIG_SCHED_MC enabled, if a 'cpu_exclusive'
1873  * cpuset is defined where that cpusets 'cpus' are not on package
1874  * boundaries then the kernel will freeze, usually as soon as this
1875  * cpuset is created, requiring a hardware reset.
1876  *
1877  * A cpusets 'cpus' are not on package boundaries if the cpuset
1878  * includes a proper non-empty subset (some, but not all) of the
1879  * logical cpus on a processor package.  This requires multiple
1880  * logical CPUs per package, available with either Hyper-Thread or
1881  * Multi-Core support.  Without one of these features, there is only
1882  * one logical CPU per physical package, and it's not possible to
1883  * have a proper, non-empty subset of a set of cardinality one.
1884  *
1885  * SUSE SLES10 kernels, as first released, only enable CONFIG_SCHED_MC
1886  * on i386 and x86_64 arch's.
1887  *
1888  * The objective of this routine cpuset_would_crash_kernel() is to
1889  * determine if a proposed cpuset setting would crash the kernel due
1890  * to this bug, so that the caller can avoid the crash.
1891  *
1892  * Ideally we'd check for exactly these conditions here, but computing
1893  * the package (identified by the 'physical id' field of /proc/cpuinfo)
1894  * of each cpu in a cpuset is more effort than it's worth here.
1895  *
1896  * Also there is no obvious way to identify exactly whether the kernel
1897  * one is executing on has this bug, short of trying it, and seeing
1898  * if the kernel just crashed.
1899  *
1900  * So for now, we look for a simpler set of conditions, that meets
1901  * our immediate need - avoid this crash on SUSE SLES10 systems that
1902  * are susceptible to it.  We look for the kernel version 2.6.16.*,
1903  * which is the base kernel of SUSE SLES10, and for i386 or x86_64
1904  * processors, which had CONFIG_SCHED_MC enabled.
1905  *
1906  * If these simpler conditions are met, we further simplify the check,
1907  * by presuming that the logical CPUs are numbered on processor
1908  * package boundaries.  If each package has S siblings, we assume
1909  * that CPUs numbered N through N + S -1 are on the same package,
1910  * for any CPU N such that N mod S == 0.
1911  *
1912  * Yes, this is a hack, focused on avoiding kernel freezes on
1913  * susceptible SUSE SLES10 systems.
1914  */
1915 
cpuset_would_crash_kernel(const struct cpuset * cp)1916 static int cpuset_would_crash_kernel(const struct cpuset *cp)
1917 {
1918 	static int susceptible_system = -1;
1919 
1920 	if (!cp->cpu_exclusive)
1921 		goto ok;
1922 
1923 	if (susceptible_system == -1) {
1924 		struct utsname u;
1925 		int rel_2_6_16, arch_i386, arch_x86_64;
1926 
1927 		if (uname(&u) < 0)
1928 			goto fail;
1929 		rel_2_6_16 = strprefix(u.release, "2.6.16.");
1930 		arch_i386 = streq(u.machine, "i386");
1931 		arch_x86_64 = streq(u.machine, "x86_64");
1932 		susceptible_system = rel_2_6_16 && (arch_i386 || arch_x86_64);
1933 	}
1934 
1935 	if (susceptible_system) {
1936 		int ncpus = cpuset_cpus_nbits();
1937 		int siblings = get_siblings();
1938 		unsigned int cpu;
1939 
1940 		for (cpu = 0; cpu < (unsigned int)ncpus; cpu += siblings) {
1941 			int s, num_set = 0;
1942 
1943 			for (s = 0; s < siblings; s++) {
1944 				if (bitmask_isbitset(cp->cpus, cpu + s))
1945 					num_set++;
1946 			}
1947 
1948 			/* If none or all siblings set, we're still ok */
1949 			if (num_set == 0 || num_set == siblings)
1950 				continue;
1951 
1952 			/* Found one that would crash kernel.  Fail.  */
1953 			errno = ENXIO;
1954 			goto fail;
1955 		}
1956 	}
1957 	/* If not susceptible, or if all ok, fall into "ok" ... */
1958 ok:
1959 	return 0;		/* would not crash */
1960 fail:
1961 	return 1;		/* would crash */
1962 }
1963 
1964 /* compare two cpuset and mark the dirty variable */
mark_dirty_variable(struct cpuset * cp1,const struct cpuset * cp2)1965 static void mark_dirty_variable(struct cpuset *cp1, const struct cpuset *cp2)
1966 {
1967 	if (cp1->cpu_exclusive_valid &&
1968 	    cp1->cpu_exclusive != cp2->cpu_exclusive)
1969 		cp1->cpu_exclusive_dirty = 1;
1970 
1971 	if (cp1->mem_exclusive_valid &&
1972 	    cp1->mem_exclusive != cp2->mem_exclusive)
1973 		cp1->mem_exclusive_dirty = 1;
1974 
1975 	if (cp1->mem_hardwall_valid && cp1->mem_hardwall != cp2->mem_hardwall)
1976 		cp1->mem_hardwall_dirty = 1;
1977 
1978 	if (cp1->notify_on_release_valid &&
1979 	    cp1->notify_on_release != cp2->notify_on_release)
1980 		cp1->notify_on_release_dirty = 1;
1981 
1982 	if (cp1->memory_migrate_valid &&
1983 	    cp1->memory_migrate != cp2->memory_migrate)
1984 		cp1->memory_migrate_dirty = 1;
1985 
1986 	if (cp1->memory_pressure_enabled_valid &&
1987 	    cp1->memory_pressure_enabled != cp2->memory_pressure_enabled)
1988 		cp1->memory_pressure_enabled_dirty = 1;
1989 
1990 	if (cp1->memory_spread_page_valid &&
1991 	    cp1->memory_spread_page != cp2->memory_spread_page)
1992 		cp1->memory_spread_page_dirty = 1;
1993 
1994 	if (cp1->memory_spread_slab_valid &&
1995 	    cp1->memory_spread_slab != cp2->memory_spread_slab)
1996 		cp1->memory_spread_slab_dirty = 1;
1997 
1998 	if (cp1->sched_load_balance_valid &&
1999 	    cp1->sched_load_balance != cp2->sched_load_balance)
2000 		cp1->sched_load_balance_dirty = 1;
2001 
2002 	if (cp1->sched_relax_domain_level_valid &&
2003 	    cp1->sched_relax_domain_level != cp2->sched_relax_domain_level)
2004 		cp1->sched_relax_domain_level_dirty = 1;
2005 
2006 	if (cp1->cpus_valid && !bitmask_equal(cp1->cpus, cp2->cpus))
2007 		cp1->cpus_dirty = 1;
2008 	if (cp1->mems_valid && !bitmask_equal(cp1->mems, cp2->mems))
2009 		cp1->mems_dirty = 1;
2010 }
2011 
2012 /* Create (if new set) or modify cpuset 'cp' at location 'relpath' */
cr_or_mod(const char * relpath,const struct cpuset * cp,int new)2013 static int cr_or_mod(const char *relpath, const struct cpuset *cp, int new)
2014 {
2015 	char buf[PATH_MAX];
2016 	int do_rmdir_on_err = 0;
2017 	int do_restore_cp_sav_on_err = 0;
2018 	struct cpuset *cp_sav = NULL;
2019 	int sav_errno;
2020 
2021 	if (check() < 0)
2022 		goto err;
2023 
2024 	if (cpuset_would_crash_kernel(cp))
2025 		goto err;
2026 
2027 	fullpath(buf, sizeof(buf), relpath);
2028 
2029 	if (new) {
2030 		if (mkdir(buf, 0755) < 0)
2031 			goto err;
2032 		/* we made it, so we should remove it on error */
2033 		do_rmdir_on_err = 1;
2034 	}
2035 
2036 	if ((cp_sav = cpuset_alloc()) == NULL)
2037 		goto err;
2038 	if (cpuset_query(cp_sav, relpath) < 0)
2039 		goto err;
2040 	/* we have old settings to restore on error */
2041 	do_restore_cp_sav_on_err = 1;
2042 
2043 	/* check which variable need to restore on error */
2044 	mark_dirty_variable(cp_sav, cp);
2045 
2046 	if (apply_cpuset_settings(buf, cp) < 0)
2047 		goto err;
2048 
2049 	cpuset_free(cp_sav);
2050 	return 0;
2051 err:
2052 	sav_errno = errno;
2053 	if (do_restore_cp_sav_on_err)
2054 		(void)apply_cpuset_settings(buf, cp_sav);
2055 	if (cp_sav)
2056 		cpuset_free(cp_sav);
2057 	if (do_rmdir_on_err)
2058 		(void)rmdir(buf);
2059 	errno = sav_errno;
2060 	return -1;
2061 }
2062 
/*
 * Create a new cpuset at 'relpath' with the settings in 'cp'.
 * Returns whatever cr_or_mod() returns (0 on success, -1 on failure).
 */
int cpuset_create(const char *relpath, const struct cpuset *cp)
{
	const int make_new = 1;

	return cr_or_mod(relpath, cp, make_new);
}
2068 
2069 /* Delete cpuset at location 'path' (if empty) */
cpuset_delete(const char * relpath)2070 int cpuset_delete(const char *relpath)
2071 {
2072 	char buf[PATH_MAX];
2073 
2074 	if (check() < 0)
2075 		goto err;
2076 
2077 	fullpath(buf, sizeof(buf), relpath);
2078 	if (rmdir(buf) < 0)
2079 		goto err;
2080 
2081 	return 0;
2082 err:
2083 	return -1;
2084 }
2085 
2086 /* Set cpuset cp to the cpuset at location 'path' */
cpuset_query(struct cpuset * cp,const char * relpath)2087 int cpuset_query(struct cpuset *cp, const char *relpath)
2088 {
2089 	char buf[PATH_MAX];
2090 
2091 	if (check() < 0)
2092 		goto err;
2093 
2094 	fullpath(buf, sizeof(buf), relpath);
2095 
2096 	if (load_flag(buf, &cp->cpu_exclusive, "cpuset.cpu_exclusive") < 0)
2097 		goto err;
2098 	cp->cpu_exclusive_valid = 1;
2099 
2100 	if (load_flag(buf, &cp->mem_exclusive, "cpuset.mem_exclusive") < 0)
2101 		goto err;
2102 	cp->mem_exclusive_valid = 1;
2103 
2104 	if (load_flag(buf, &cp->notify_on_release, "notify_on_release") < 0)
2105 		goto err;
2106 	cp->notify_on_release_valid = 1;
2107 
2108 	if (exists_flag(buf, "cpuset.memory_migrate")) {
2109 		if (load_flag(buf, &cp->memory_migrate, "cpuset.memory_migrate") < 0)
2110 			goto err;
2111 		cp->memory_migrate_valid = 1;
2112 	}
2113 
2114 	if (exists_flag(buf, "cpuset.mem_hardwall")) {
2115 		if (load_flag(buf, &cp->mem_hardwall, "cpuset.mem_hardwall") < 0)
2116 			goto err;
2117 		cp->mem_hardwall_valid = 1;
2118 	}
2119 
2120 	if (exists_flag(buf, "cpuset.memory_pressure_enabled")) {
2121 		if (load_flag
2122 		    (buf, &cp->memory_pressure_enabled,
2123 		     "cpuset.memory_pressure_enabled") < 0)
2124 			goto err;
2125 		cp->memory_pressure_enabled_valid = 1;
2126 	}
2127 
2128 	if (exists_flag(buf, "cpuset.memory_spread_page")) {
2129 		if (load_flag
2130 		    (buf, &cp->memory_spread_page, "cpuset.memory_spread_page") < 0)
2131 			goto err;
2132 		cp->memory_spread_page_valid = 1;
2133 	}
2134 
2135 	if (exists_flag(buf, "cpuset.memory_spread_slab")) {
2136 		if (load_flag
2137 		    (buf, &cp->memory_spread_slab, "cpuset.memory_spread_slab") < 0)
2138 			goto err;
2139 		cp->memory_spread_slab_valid = 1;
2140 	}
2141 
2142 	if (exists_flag(buf, "cpuset.sched_load_balance")) {
2143 		if (load_flag
2144 		    (buf, &cp->sched_load_balance, "cpuset.sched_load_balance") < 0)
2145 			goto err;
2146 		cp->sched_load_balance_valid = 1;
2147 	}
2148 
2149 	if (exists_flag(buf, "cpuset.sched_relax_domain_level")) {
2150 		if (load_number
2151 		    (buf, &cp->sched_relax_domain_level,
2152 		     "cpuset.sched_relax_domain_level") < 0)
2153 			goto err;
2154 		cp->sched_relax_domain_level_valid = 1;
2155 	}
2156 
2157 	if (load_mask(buf, &cp->cpus, cpuset_cpus_nbits(), "cpuset.cpus") < 0)
2158 		goto err;
2159 	cp->cpus_valid = 1;
2160 
2161 	if (load_mask(buf, &cp->mems, cpuset_mems_nbits(), "cpuset.mems") < 0)
2162 		goto err;
2163 	cp->mems_valid = 1;
2164 
2165 	return 0;
2166 err:
2167 	return -1;
2168 }
2169 
/*
 * Change the existing cpuset at 'relpath' to the settings in 'cp'.
 * Returns whatever cr_or_mod() returns (0 on success, -1 on failure).
 */
int cpuset_modify(const char *relpath, const struct cpuset *cp)
{
	const int make_new = 0;

	return cr_or_mod(relpath, cp, make_new);
}
2175 
/*
 * Get the cpuset path of task 'pid' (0 means the calling task) into
 * buf[size], by reading /proc/<pid>/cpuset.
 *
 * Returns buf on success, with the trailing newline removed by chomp().
 * Returns NULL on failure with errno set:
 *  E2BIG  - buf is too small for the /proc file name or for the result
 *  ESRCH  - the per-pid file does not exist (open failed with ENOENT)
 *  ENOSYS - not even /proc/self/cpuset could be opened
 *  other  - as left by check(), open() or read()
 */
char *cpuset_getcpusetpath(pid_t pid, char *buf, size_t size)
{
	int fd;			/* dual use: cpuset file for pid and self */
	int len;		/* snprintf return code */
	ssize_t nread;		/* read return code */
	int e;

	if (check() < 0)
		return NULL;

	/* borrow result buf[] to build cpuset file path */
	if (pid == 0)
		len = snprintf(buf, size, "/proc/self/cpuset");
	else
		len = snprintf(buf, size, "/proc/%d/cpuset", (int)pid);
	if (len < 0)
		return NULL;
	/* compare as size_t so a buffer larger than INT_MAX is not rejected */
	if ((size_t)len >= size) {
		errno = E2BIG;
		return NULL;
	}

	fd = open(buf, O_RDONLY);
	if (fd < 0) {
		e = errno;
		if (e == ENOENT)
			e = ESRCH;
		/* probe our own file to tell "no such task" from "no cpusets" */
		fd = open("/proc/self/cpuset", O_RDONLY);
		if (fd < 0)
			e = ENOSYS;
		else
			close(fd);
		errno = e;
		return NULL;
	}

	nread = read(fd, buf, size);
	e = errno;		/* close() below must not hide a read error */
	close(fd);
	if (nread < 0) {
		errno = e;
		return NULL;
	}
	/* a completely full buffer leaves no room for the terminator */
	if ((size_t)nread >= size) {
		errno = E2BIG;
		return NULL;
	}
	buf[nread] = 0;
	chomp(buf);
	return buf;
}
2218 
2219 /* Get cpuset 'cp' of pid */
cpuset_cpusetofpid(struct cpuset * cp,pid_t pid)2220 int cpuset_cpusetofpid(struct cpuset *cp, pid_t pid)
2221 {
2222 	char buf[PATH_MAX];
2223 
2224 	if (cpuset_getcpusetpath(pid, buf, sizeof(buf)) == NULL)
2225 		return -1;
2226 	if (cpuset_query(cp, buf) < 0)
2227 		return -1;
2228 	return 0;
2229 }
2230 
2231 /* [optional] Return mountpoint of cpuset filesystem */
cpuset_mountpoint(void)2232 const char *cpuset_mountpoint(void)
2233 {
2234 	if (check() < 0) {
2235 		switch (errno) {
2236 		case ENODEV:
2237 			return "[cpuset filesystem not mounted]";
2238 		default:
2239 			return "[cpuset filesystem not supported]";
2240 		}
2241 	}
2242 	return cpusetmnt;
2243 }
2244 
/*
 * Return non-zero if 'path' names a directory.
 * Returns 0 when stat() fails or the path is some other kind of file.
 */
static int isdir(const char *path)
{
	struct stat st;
	int rc = stat(path, &st);

	return (rc < 0) ? 0 : S_ISDIR(st.st_mode);
}
2254 
2255 /*
2256  * [optional] cpuset_collides_exclusive() - True if would collide exclusive.
2257  *
2258  * Return true iff the specified cpuset would overlap with any
2259  * sibling cpusets in either cpus or mems, where either this
2260  * cpuset or the sibling is cpu_exclusive or mem_exclusive.
2261  *
2262  * cpuset_create() fails with errno == EINVAL if the requested cpuset
2263  * would overlap with any sibling, where either one is cpu_exclusive or
2264  * mem_exclusive.  This is a common, and not obvious error.  The
2265  * following routine checks for this particular case, so that code
2266  * creating cpusets can better identify the situation, perhaps to issue
2267  * a more informative error message.
2268  *
2269  * Can also be used to diagnose cpuset_modify failures.  This
2270  * routine ignores any existing cpuset with the same path as the
2271  * given 'cpusetpath', and only looks for exclusive collisions with
2272  * sibling cpusets of that path.
2273  *
2274  * In case of any error, returns (0) -- does not collide.  Presumably
2275  * any actual attempt to create or modify a cpuset will encounter the
2276  * same error, and report it usefully.
2277  *
2278  * This routine is not particularly efficient; most likely code creating or
2279  * modifying a cpuset will want to try the operation first, and then if that
2280  * fails with errno EINVAL, perhaps call this routine to determine if an
2281  * exclusive cpuset collision caused the error.
2282  */
2283 
cpuset_collides_exclusive(const char * cpusetpath,const struct cpuset * cp1)2284 int cpuset_collides_exclusive(const char *cpusetpath, const struct cpuset *cp1)
2285 {
2286 	char parent[PATH_MAX];
2287 	char *p;
2288 	char *pathcopy = NULL;
2289 	char *base;
2290 	DIR *dir = NULL;
2291 	struct dirent *dent;
2292 	struct cpuset *cp2 = NULL;
2293 	struct bitmask *cpus1 = NULL, *cpus2 = NULL;
2294 	struct bitmask *mems1 = NULL, *mems2 = NULL;
2295 	int ret;
2296 
2297 	if (check() < 0)
2298 		goto err;
2299 
2300 	fullpath(parent, sizeof(parent), cpusetpath);
2301 	if (streq(parent, cpusetmnt))
2302 		goto err;	/* only one cpuset root - can't collide */
2303 	pathcopy = strdup(parent);
2304 	p = strrchr(parent, '/');
2305 	if (!p)
2306 		goto err;	/* huh? - impossible - run and hide */
2307 	*p = 0;			/* now parent is dirname of fullpath */
2308 
2309 	p = strrchr(pathcopy, '/');
2310 	base = p + 1;		/* now base is basename of fullpath */
2311 	if (!*base)
2312 		goto err;	/* this is also impossible - run away */
2313 
2314 	if ((dir = opendir(parent)) == NULL)
2315 		goto err;
2316 	if ((cp2 = cpuset_alloc()) == NULL)
2317 		goto err;
2318 	if ((cpus1 = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
2319 		goto err;
2320 	if ((cpus2 = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
2321 		goto err;
2322 	if ((mems1 = bitmask_alloc(cpuset_mems_nbits())) == NULL)
2323 		goto err;
2324 	if ((mems2 = bitmask_alloc(cpuset_mems_nbits())) == NULL)
2325 		goto err;
2326 
2327 	while ((dent = readdir(dir)) != NULL) {
2328 		char child[PATH_MAX];
2329 
2330 		if (streq(dent->d_name, ".") || streq(dent->d_name, ".."))
2331 			continue;
2332 		if (streq(dent->d_name, base))
2333 			continue;
2334 		pathcat2(child, sizeof(child), parent, dent->d_name);
2335 		if (!isdir(child))
2336 			continue;
2337 		if (cpuset_query(cp2, child + strlen(cpusetmnt)) < 0)
2338 			goto err;
2339 		if (cp1->cpu_exclusive || cp2->cpu_exclusive) {
2340 			cpuset_getcpus(cp1, cpus1);
2341 			cpuset_getcpus(cp2, cpus2);
2342 			if (bitmask_intersects(cpus1, cpus2))
2343 				goto collides;
2344 		}
2345 		if (cp1->mem_exclusive || cp2->mem_exclusive) {
2346 			cpuset_getmems(cp1, mems1);
2347 			cpuset_getmems(cp2, mems2);
2348 			if (bitmask_intersects(mems1, mems2))
2349 				goto collides;
2350 		}
2351 	}
2352 err:
2353 	/* error, or did not collide */
2354 	ret = 0;
2355 	goto done;
2356 collides:
2357 	/* collides */
2358 	ret = 1;
2359 	/* fall into ... */
2360 done:
2361 	if (dir)
2362 		closedir(dir);
2363 	cpuset_free(cp2);
2364 	free(pathcopy);
2365 	bitmask_free(cpus1);
2366 	bitmask_free(cpus2);
2367 	bitmask_free(mems1);
2368 	bitmask_free(mems2);
2369 	return ret;
2370 }
2371 
2372 /*
 * [optional] cpuset_nuke() - Remove cpuset any way possible
2374  *
2375  * Remove a cpuset, including killing tasks in it, and
2376  * removing any descendent cpusets and killing their tasks.
2377  *
2378  * Tasks can take a long time (minutes on some configurations)
2379  * to exit.  Loop up to 'seconds' seconds, trying to kill them.
2380  *
2381  * How we do it:
2382  *	1) First, kill all the pids, looping until there are
2383  *	   no more pids in this cpuset or below, or until the
2384  *	   'seconds' timeout limit is exceeded.
2385  *	2) Then depth first recursively rmdir the cpuset directories.
2386  *	3) If by this point the original cpuset is gone, we succeeded.
2387  *
2388  * If the timeout is exceeded, and tasks still exist, fail with
2389  * errno == ETIME.
2390  *
2391  * We sleep a variable amount of time.  After the first attempt to
2392  * kill all the tasks in the cpuset or its descendents, we sleep 1
2393  * second, the next time 2 seconds, increasing 1 second each loop
2394  * up to a max of 10 seconds.  If more loops past 10 are required
2395  * to kill all the tasks, we sleep 10 seconds each subsequent loop.
2396  * In any case, before the last loop, we sleep however many seconds
2397  * remain of the original timeout 'seconds' requested.  The total
2398  * time of all sleeps will be no more than the requested 'seconds'.
2399  *
2400  * If the cpuset started out empty of any tasks, or if the passed in
2401  * 'seconds' was zero, then this routine will return quickly, having
2402  * not slept at all.  Otherwise, this routine will at a minimum send
2403  * a SIGKILL to all the tasks in this cpuset subtree, then sleep one
2404  * second, before looking to see if any tasks remain.  If tasks remain
2405  * in the cpuset subtree, and a longer 'seconds' timeout was requested
2406  * (more than one), it will continue to kill remaining tasks and sleep,
2407  * in a loop, for as long as time and tasks remain.
2408  *
2409  * The signal sent for the kill is hardcoded to SIGKILL (9).  If some
2410  * other signal should be sent first, use a separate code loop,
2411  * perhaps based on cpuset_init_pidlist and cpuset_get_pidlist, to
2412  * scan the task pids in a cpuset.  If SIGKILL should -not- be sent,
2413  * this cpuset_nuke() routine can still be called to recursively
2414  * remove a cpuset subtree, by specifying a timeout of zero 'seconds'.
2415  *
2416  * On success, returns 0 with errno == 0.
2417  *
2418  * On failure, returns -1, with errno possibly one of:
2419  *  EACCES - search permission denied on intervening directory
2420  *  ETIME - timed out - tasks remain after 'seconds' timeout
2421  *  EMFILE - too many open files
2422  *  ENODEV - /dev/cpuset not mounted
2423  *  ENOENT - component of cpuset path doesn't exist
2424  *  ENOMEM - out of memory
2425  *  ENOSYS - kernel doesn't support cpusets
2426  *  ENOTDIR - component of cpuset path is not a directory
2427  *  EPERM - lacked permission to kill a task
2428  *  EPERM - lacked permission to read cpusets or files therein
2429  */
2430 
2431 void cpuset_fts_reverse(struct cpuset_fts_tree *cs_tree);
2432 
cpuset_nuke(const char * relpath,unsigned int seconds)2433 int cpuset_nuke(const char *relpath, unsigned int seconds)
2434 {
2435 	unsigned int secs_left = seconds;	/* total sleep seconds left */
2436 	unsigned int secs_loop = 1;	/* how much sleep next loop */
2437 	unsigned int secs_slept;	/* seconds slept in sleep() */
2438 	struct cpuset_pidlist *pl = NULL;	/* pids in cpuset subtree */
2439 	struct cpuset_fts_tree *cs_tree;
2440 	const struct cpuset_fts_entry *cs_entry;
2441 	int ret, sav_errno = 0;
2442 
2443 	if (check() < 0)
2444 		return -1;
2445 
2446 	if (seconds == 0)
2447 		goto rmdir_cpusets;
2448 
2449 	while (1) {
2450 		int plen, j;
2451 
2452 		if ((pl = cpuset_init_pidlist(relpath, 1)) == NULL) {
2453 			/* missing cpuset is as good as if already nuked */
2454 			if (errno == ENOENT) {
2455 				ret = 0;
2456 				goto no_more_cpuset;
2457 			}
2458 
2459 			/* other problems reading cpuset are bad news */
2460 			sav_errno = errno;
2461 			goto failed;
2462 		}
2463 
2464 		if ((plen = cpuset_pidlist_length(pl)) == 0)
2465 			goto rmdir_cpusets;
2466 
2467 		for (j = 0; j < plen; j++) {
2468 			pid_t pid;
2469 
2470 			if ((pid = cpuset_get_pidlist(pl, j)) > 1) {
2471 				if (kill(pid, SIGKILL) < 0 && errno != ESRCH) {
2472 					sav_errno = errno;
2473 					goto failed;
2474 				}
2475 			}
2476 		}
2477 
2478 		if (secs_left == 0)
2479 			goto took_too_long;
2480 
2481 		cpuset_freepidlist(pl);
2482 		pl = NULL;
2483 
2484 		secs_slept = secs_loop - sleep(secs_loop);
2485 
2486 		/* Ensure forward progress */
2487 		if (secs_slept == 0)
2488 			secs_slept = 1;
2489 
2490 		/* Ensure sane sleep() return (unnecessary?) */
2491 		if (secs_slept > secs_loop)
2492 			secs_slept = secs_loop;
2493 
2494 		secs_left -= secs_slept;
2495 
2496 		if (secs_loop < 10)
2497 			secs_loop++;
2498 
2499 		secs_loop = MIN(secs_left, secs_loop);
2500 	}
2501 
2502 took_too_long:
2503 	sav_errno = ETIME;
2504 	/* fall into ... */
2505 failed:
2506 	cpuset_freepidlist(pl);
2507 	errno = sav_errno;
2508 	return -1;
2509 
2510 rmdir_cpusets:
2511 	/* Let's try removing cpuset(s) now. */
2512 	cpuset_freepidlist(pl);
2513 
2514 	if ((cs_tree = cpuset_fts_open(relpath)) == NULL && errno != ENOENT)
2515 		return -1;
2516 	ret = 0;
2517 	cpuset_fts_reverse(cs_tree);	/* rmdir's must be done bottom up */
2518 	while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL) {
2519 		char buf[PATH_MAX];
2520 
2521 		fullpath(buf, sizeof(buf), cpuset_fts_get_path(cs_entry));
2522 		if (rmdir(buf) < 0 && errno != ENOENT) {
2523 			sav_errno = errno;
2524 			ret = -1;
2525 		}
2526 	}
2527 	cpuset_fts_close(cs_tree);
2528 	/* fall into ... */
2529 no_more_cpuset:
2530 	if (ret == 0)
2531 		errno = 0;
2532 	else
2533 		errno = sav_errno;
2534 	return ret;
2535 }
2536 
2537 /*
2538  * When recursively reading all the tasks files from a subtree,
2539  * chain together the read results, one pidblock per tasks file,
2540  * containing the raw unprocessed ascii as read(2) in.  After
2541  * we gather up this raw data, we then go back to count how
2542  * many pid's there are in total, allocate an array of pid_t
2543  * of that size, and transform the raw ascii data into this
2544  * array of pid_t's.
2545  */
2546 
/* One node of the chain of raw tasks-file contents (one node per file). */
struct pidblock {
	char *buf;		/* raw ascii as read from the file */
	int buflen;		/* number of data bytes in buf[] */
	struct pidblock *next;	/* next node in the chain, or NULL */
};
2552 
2553 /*
2554  * Chain the raw contents of a file onto the pbhead list.
2555  *
2556  * We malloc "+ 1" extra byte for a nul-terminator, so that
2557  * the strtoul() loop in pid_transform() won't scan past
2558  * the end of pb->buf[] and accidentally find more pids.
2559  */
add_pidblock(const char * file,struct pidblock ** ppbhead)2560 static void add_pidblock(const char *file, struct pidblock **ppbhead)
2561 {
2562 	FILE *fp = NULL;
2563 	struct pidblock *pb = NULL;
2564 	int fsz;
2565 
2566 	if ((fp = fopen(file, "r")) == NULL)
2567 		goto err;
2568 	fsz = filesize(fp);
2569 	if (fsz == 0)
2570 		goto err;
2571 	if ((pb = calloc(1, sizeof(*pb))) == NULL)
2572 		goto err;
2573 	pb->buflen = fsz;
2574 	if ((pb->buf = malloc(pb->buflen + 1)) == NULL)
2575 		goto err;
2576 	if (fread(pb->buf, 1, pb->buflen, fp) > 0) {
2577 		pb->buf[pb->buflen] = '\0';
2578 		pb->next = *ppbhead;
2579 		*ppbhead = pb;
2580 	}
2581 	fclose(fp);
2582 	return;
2583 err:
2584 	if (fp)
2585 		fclose(fp);
2586 	free(pb);
2587 }
2588 
read_task_file(const char * relpath,struct pidblock ** ppbhead)2589 static void read_task_file(const char *relpath, struct pidblock **ppbhead)
2590 {
2591 	char buf[PATH_MAX];
2592 
2593 	fullpath2(buf, sizeof(buf), relpath, "tasks");
2594 	add_pidblock(buf, ppbhead);
2595 }
2596 
/* List of task pids, as built by cpuset_init_pidlist(). */
struct cpuset_pidlist {
	pid_t *pids;	/* array of pids */
	int npids;	/* number of entries in pids[] */
};
2601 
/*
 * Count the pids in the first 'buflen' bytes of 'buf'.  Pids come one
 * per line, so this is simply the number of newline characters.
 */
static int pidcount(const char *buf, int buflen)
{
	const char *pos = buf;
	const char *end;
	int lines = 0;

	if (buflen <= 0)
		return 0;

	end = buf + buflen;
	/* hop from one newline to the next until none is left */
	while ((pos = memchr(pos, '\n', (size_t)(end - pos))) != NULL) {
		lines++;
		pos++;
	}
	return lines;
}
2614 
2615 /* Transform one-per-line ascii pids in pb to pid_t entries in pl */
pid_transform(struct pidblock * pb,struct cpuset_pidlist * pl,int n)2616 static int pid_transform(struct pidblock *pb, struct cpuset_pidlist *pl, int n)
2617 {
2618 	char *a, *b;
2619 
2620 	for (a = pb->buf; a < pb->buf + pb->buflen; a = b) {
2621 		pid_t p = strtoul(a, &b, 10);
2622 		if (a == b)
2623 			break;
2624 		pl->pids[n++] = p;
2625 	}
2626 	return n;
2627 }
2628 
free_pidblocks(struct pidblock * pbhead)2629 static void free_pidblocks(struct pidblock *pbhead)
2630 {
2631 	struct pidblock *pb, *nextpb;
2632 
2633 	for (pb = pbhead; pb; pb = nextpb) {
2634 		nextpb = pb->next;
2635 		free(pb->buf);
2636 		free(pb);
2637 	}
2638 }
2639 
/*
 * Numeric comparison routine for qsort() over an array of pid_t.
 *
 * Returns a negative, zero or positive value as *m1 is less than, equal
 * to or greater than *m2.  The values are compared rather than
 * subtracted, so the result is right even where a difference would
 * overflow.
 */
static int numericsort(const void *m1, const void *m2)
{
	const pid_t p1 = *(const pid_t *)m1;
	const pid_t p2 = *(const pid_t *)m2;

	return (p1 > p2) - (p1 < p2);
}
2648 
2649 /* Return list pids in cpuset 'path' */
cpuset_init_pidlist(const char * relpath,int recursiveflag)2650 struct cpuset_pidlist *cpuset_init_pidlist(const char *relpath,
2651 					   int recursiveflag)
2652 {
2653 	struct pidblock *pb = NULL;
2654 	struct cpuset_pidlist *pl = NULL;
2655 	struct pidblock *pbhead = NULL;
2656 	int n;
2657 
2658 	if (check() < 0)
2659 		goto err;
2660 
2661 	if (recursiveflag) {
2662 		struct cpuset_fts_tree *cs_tree;
2663 		const struct cpuset_fts_entry *cs_entry;
2664 
2665 		if ((cs_tree = cpuset_fts_open(relpath)) == NULL)
2666 			goto err;
2667 		while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL) {
2668 			if (cpuset_fts_get_info(cs_entry) != CPUSET_FTS_CPUSET)
2669 				continue;
2670 			read_task_file(cpuset_fts_get_path(cs_entry), &pbhead);
2671 		}
2672 		cpuset_fts_close(cs_tree);
2673 	} else {
2674 		read_task_file(relpath, &pbhead);
2675 	}
2676 
2677 	if ((pl = calloc(1, sizeof(*pl))) == NULL)
2678 		goto err;
2679 	pl->npids = 0;
2680 	for (pb = pbhead; pb; pb = pb->next)
2681 		pl->npids += pidcount(pb->buf, pb->buflen);
2682 	if ((pl->pids = calloc(pl->npids, sizeof(pid_t))) == NULL)
2683 		goto err;
2684 	n = 0;
2685 	for (pb = pbhead; pb; pb = pb->next)
2686 		n = pid_transform(pb, pl, n);
2687 	free_pidblocks(pbhead);
2688 	qsort(pl->pids, pl->npids, sizeof(pid_t), numericsort);
2689 	return pl;
2690 err:
2691 	cpuset_freepidlist(pl);
2692 	free_pidblocks(pbhead);
2693 	return NULL;
2694 }
2695 
2696 /* Return number of elements in pidlist */
cpuset_pidlist_length(const struct cpuset_pidlist * pl)2697 int cpuset_pidlist_length(const struct cpuset_pidlist *pl)
2698 {
2699 	if (pl)
2700 		return pl->npids;
2701 	else
2702 		return 0;
2703 }
2704 
2705 /* Return i'th element of pidlist */
cpuset_get_pidlist(const struct cpuset_pidlist * pl,int i)2706 pid_t cpuset_get_pidlist(const struct cpuset_pidlist * pl, int i)
2707 {
2708 	if (pl && i >= 0 && i < pl->npids)
2709 		return pl->pids[i];
2710 	else
2711 		return (pid_t) - 1;
2712 }
2713 
2714 /* Free pidlist */
cpuset_freepidlist(struct cpuset_pidlist * pl)2715 void cpuset_freepidlist(struct cpuset_pidlist *pl)
2716 {
2717 	if (pl && pl->pids)
2718 		free(pl->pids);
2719 	free(pl);
2720 }
2721 
__cpuset_move(pid_t pid,const char * path)2722 static int __cpuset_move(pid_t pid, const char *path)
2723 {
2724 	char buf[SMALL_BUFSZ];
2725 
2726 	snprintf(buf, sizeof(buf), "%u", pid);
2727 	return write_string_file(path, buf);
2728 }
2729 
2730 /* Move task (pid == 0 for current) to a cpuset */
cpuset_move(pid_t pid,const char * relpath)2731 int cpuset_move(pid_t pid, const char *relpath)
2732 {
2733 	char buf[PATH_MAX];
2734 
2735 	if (check() < 0)
2736 		return -1;
2737 
2738 	if (pid == 0)
2739 		pid = getpid();
2740 
2741 	fullpath2(buf, sizeof(buf), relpath, "tasks");
2742 	return __cpuset_move(pid, buf);
2743 }
2744 
2745 /* Move all tasks in pidlist to a cpuset */
cpuset_move_all(struct cpuset_pidlist * pl,const char * relpath)2746 int cpuset_move_all(struct cpuset_pidlist *pl, const char *relpath)
2747 {
2748 	int i;
2749 	char buf[PATH_MAX];
2750 	int ret;
2751 
2752 	if (check() < 0)
2753 		return -1;
2754 
2755 	fullpath2(buf, sizeof(buf), relpath, "tasks");
2756 
2757 	ret = 0;
2758 	for (i = 0; i < pl->npids; i++)
2759 		if (__cpuset_move(pl->pids[i], buf) < 0)
2760 			ret = -1;
2761 	return ret;
2762 }
2763 
2764 /*
2765  * [optional] cpuset_move_cpuset_tasks() - Move all tasks in a
2766  *                                      cpuset to another cpuset
2767  *
2768  * Move all tasks in cpuset fromrelpath to cpuset torelpath. This may
2769  * race with tasks being added to or forking into fromrelpath. Loop
2770  * repeatedly, reading the tasks file of cpuset fromrelpath and writing
2771  * any task pid's found there to the tasks file of cpuset torelpath,
2772  * up to ten attempts, or until the tasks file of cpuset fromrelpath
2773  * is empty, or until fromrelpath is no longer present.
2774  *
2775  * Returns 0 with errno == 0 if able to empty the tasks file of cpuset
2776  * fromrelpath. Of course it is still possible that some independent
2777  * task could add another task to cpuset fromrelpath at the same time
2778  * that such a successful result is being returned, so there can be
2779  * no guarantee that a successful return means that fromrelpath is
2780  * still empty of tasks.
2781  *
2782  * We are careful to allow for the possibility that the cpuset
2783  * fromrelpath might disappear out from under us, perhaps because it
2784  * has notify_on_release set and gets automatically removed as soon
2785  * as we detach its last task from it.  Consider a missing fromrelpath
2786  * to be a successful move.
2787  *
2788  * If called with fromrelpath and torelpath pathnames that evaluate to
2789  * the same cpuset, then treat that as if cpuset_reattach() was called,
2790  * rebinding each task in this cpuset one time, and return success or
2791  * failure depending on the return of that cpuset_reattach() call.
2792  *
2793  * On failure, returns -1, with errno possibly one of:
2794  *  EACCES - search permission denied on intervening directory
2795  *  ENOTEMPTY - tasks remain after multiple attempts to move them
2796  *  EMFILE - too many open files
2797  *  ENODEV - /dev/cpuset not mounted
2798  *  ENOENT - component of cpuset path doesn't exist
2799  *  ENOMEM - out of memory
2800  *  ENOSYS - kernel doesn't support cpusets
2801  *  ENOTDIR - component of cpuset path is not a directory
2802  *  EPERM - lacked permission to kill a task
2803  *  EPERM - lacked permission to read cpusets or files therein
2804  *
2805  * This is an [optional] function. Use cpuset_function to invoke it.
2806  */
2807 
2808 #define NUMBER_MOVE_TASK_ATTEMPTS 10
2809 
cpuset_move_cpuset_tasks(const char * fromrelpath,const char * torelpath)2810 int cpuset_move_cpuset_tasks(const char *fromrelpath, const char *torelpath)
2811 {
2812 	char fromfullpath[PATH_MAX];
2813 	char tofullpath[PATH_MAX];
2814 	int i;
2815 	struct cpuset_pidlist *pl = NULL;
2816 	int sav_errno;
2817 
2818 	fullpath(fromfullpath, sizeof(fromfullpath), fromrelpath);
2819 	fullpath(tofullpath, sizeof(tofullpath), torelpath);
2820 
2821 	if (samefile(fromfullpath, tofullpath))
2822 		return cpuset_reattach(fromrelpath);
2823 
2824 	for (i = 0; i < NUMBER_MOVE_TASK_ATTEMPTS; i++) {
2825 		int plen, j;
2826 
2827 		if ((pl = cpuset_init_pidlist(fromrelpath, 0)) == NULL) {
2828 			/* missing cpuset is as good as if all moved */
2829 			if (errno == ENOENT)
2830 				goto no_more_cpuset;
2831 
2832 			/* other problems reading cpuset are bad news */
2833 			sav_errno = errno;
2834 			goto failed;
2835 		}
2836 
2837 		if ((plen = cpuset_pidlist_length(pl)) == 0)
2838 			goto no_more_pids;
2839 
2840 		for (j = 0; j < plen; j++) {
2841 			pid_t pid;
2842 
2843 			pid = cpuset_get_pidlist(pl, j);
2844 			if (cpuset_move(pid, torelpath) < 0) {
2845 				/* missing task is as good as if moved */
2846 				if (errno == ESRCH)
2847 					continue;
2848 
2849 				/* other per-task errors are bad news */
2850 				sav_errno = errno;
2851 				goto failed;
2852 			}
2853 		}
2854 
2855 		cpuset_freepidlist(pl);
2856 		pl = NULL;
2857 	}
2858 
2859 	sav_errno = ENOTEMPTY;
2860 	/* fall into ... */
2861 failed:
2862 	cpuset_freepidlist(pl);
2863 	errno = sav_errno;
2864 	return -1;
2865 
2866 no_more_pids:
2867 no_more_cpuset:
2868 	/* Success - all tasks (or entire cpuset ;) gone. */
2869 	cpuset_freepidlist(pl);
2870 	errno = 0;
2871 	return 0;
2872 }
2873 
2874 /* Migrate task (pid == 0 for current) to a cpuset (moves task and memory) */
cpuset_migrate(pid_t pid,const char * relpath)2875 int cpuset_migrate(pid_t pid, const char *relpath)
2876 {
2877 	char buf[PATH_MAX];
2878 	char buf2[PATH_MAX];
2879 	char memory_migrate_flag;
2880 	int r;
2881 
2882 	if (check() < 0)
2883 		return -1;
2884 
2885 	if (pid == 0)
2886 		pid = getpid();
2887 
2888 	fullpath(buf2, sizeof(buf2), relpath);
2889 
2890 	if (load_flag(buf2, &memory_migrate_flag, "memory_migrate") < 0)
2891 		return -1;
2892 	if (store_flag(buf2, "memory_migrate", 1) < 0)
2893 		return -1;
2894 
2895 	fullpath2(buf, sizeof(buf), relpath, "tasks");
2896 
2897 	r = __cpuset_move(pid, buf);
2898 
2899 	store_flag(buf2, "memory_migrate", memory_migrate_flag);
2900 	return r;
2901 }
2902 
2903 /* Migrate all tasks in pidlist to a cpuset (moves task and memory) */
cpuset_migrate_all(struct cpuset_pidlist * pl,const char * relpath)2904 int cpuset_migrate_all(struct cpuset_pidlist *pl, const char *relpath)
2905 {
2906 	int i;
2907 	char buf[PATH_MAX];
2908 	char buf2[PATH_MAX];
2909 	char memory_migrate_flag;
2910 	int ret;
2911 
2912 	if (check() < 0)
2913 		return -1;
2914 
2915 	fullpath(buf2, sizeof(buf2), relpath);
2916 
2917 	if (load_flag(buf2, &memory_migrate_flag, "memory_migrate") < 0)
2918 		return -1;
2919 	if (store_flag(buf2, "memory_migrate", 1) < 0)
2920 		return -1;
2921 
2922 	fullpath2(buf, sizeof(buf), relpath, "tasks");
2923 
2924 	ret = 0;
2925 	for (i = 0; i < pl->npids; i++)
2926 		if (__cpuset_move(pl->pids[i], buf) < 0)
2927 			ret = -1;
2928 
2929 	if (store_flag(buf2, "memory_migrate", memory_migrate_flag) < 0)
2930 		ret = -1;
2931 	return ret;
2932 }
2933 
/*
 * Rebind each task in cpuset 'relpath' by moving it into that same
 * cpuset again.  Returns -1 if the pid list cannot be built, otherwise
 * the result of cpuset_move_all().
 */
int cpuset_reattach(const char *relpath)
{
	struct cpuset_pidlist *tasks = cpuset_init_pidlist(relpath, 0);
	int status;

	if (tasks == NULL)
		return -1;

	status = cpuset_move_all(tasks, relpath);
	cpuset_freepidlist(tasks);
	return status;
}
2946 
2947 /* Map cpuset relative cpu number to system wide cpu number */
cpuset_c_rel_to_sys_cpu(const struct cpuset * cp,int cpu)2948 int cpuset_c_rel_to_sys_cpu(const struct cpuset *cp, int cpu)
2949 {
2950 	struct cpuset *cp_tofree = NULL;
2951 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
2952 	int pos = -1;
2953 
2954 	if (!cp1)
2955 		goto err;
2956 	pos = bitmask_rel_to_abs_pos(cp1->cpus, cpu);
2957 	/* fall into ... */
2958 err:
2959 	cpuset_free(cp_tofree);
2960 	return pos;
2961 }
2962 
2963 /* Map system wide cpu number to cpuset relative cpu number */
cpuset_c_sys_to_rel_cpu(const struct cpuset * cp,int cpu)2964 int cpuset_c_sys_to_rel_cpu(const struct cpuset *cp, int cpu)
2965 {
2966 	struct cpuset *cp_tofree = NULL;
2967 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
2968 	int pos = -1;
2969 
2970 	if (!cp1)
2971 		goto err;
2972 	pos = bitmask_abs_to_rel_pos(cp1->cpus, cpu);
2973 	/* fall into ... */
2974 err:
2975 	cpuset_free(cp_tofree);
2976 	return pos;
2977 }
2978 
2979 /* Map cpuset relative mem number to system wide mem number */
cpuset_c_rel_to_sys_mem(const struct cpuset * cp,int mem)2980 int cpuset_c_rel_to_sys_mem(const struct cpuset *cp, int mem)
2981 {
2982 	struct cpuset *cp_tofree = NULL;
2983 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
2984 	int pos = -1;
2985 
2986 	if (!cp1)
2987 		goto err;
2988 	pos = bitmask_rel_to_abs_pos(cp1->mems, mem);
2989 	/* fall into ... */
2990 err:
2991 	cpuset_free(cp_tofree);
2992 	return pos;
2993 }
2994 
2995 /* Map system wide mem number to cpuset relative mem number */
cpuset_c_sys_to_rel_mem(const struct cpuset * cp,int mem)2996 int cpuset_c_sys_to_rel_mem(const struct cpuset *cp, int mem)
2997 {
2998 	struct cpuset *cp_tofree = NULL;
2999 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
3000 	int pos = -1;
3001 
3002 	if (!cp1)
3003 		goto err;
3004 	pos = bitmask_abs_to_rel_pos(cp1->mems, mem);
3005 	/* fall into ... */
3006 err:
3007 	cpuset_free(cp_tofree);
3008 	return pos;
3009 }
3010 
/*
 * Map pid's cpuset relative cpu number to system wide cpu number.
 * Returns -1 if a cpuset cannot be allocated or pid's cpuset cannot be
 * loaded; otherwise the result of cpuset_c_rel_to_sys_cpu().
 */
int cpuset_p_rel_to_sys_cpu(pid_t pid, int cpu)
{
	struct cpuset *task_cs = cpuset_alloc();
	int result = -1;

	if (task_cs != NULL && cpuset_cpusetofpid(task_cs, pid) >= 0)
		result = cpuset_c_rel_to_sys_cpu(task_cs, cpu);

	cpuset_free(task_cs);
	return result;
}
3026 
/*
 * Map system wide cpu number to pid's cpuset relative cpu number.
 * Returns -1 if a cpuset cannot be allocated or pid's cpuset cannot be
 * loaded; otherwise the result of cpuset_c_sys_to_rel_cpu().
 */
int cpuset_p_sys_to_rel_cpu(pid_t pid, int cpu)
{
	struct cpuset *task_cs = cpuset_alloc();
	int result = -1;

	if (task_cs != NULL && cpuset_cpusetofpid(task_cs, pid) >= 0)
		result = cpuset_c_sys_to_rel_cpu(task_cs, cpu);

	cpuset_free(task_cs);
	return result;
}
3042 
/*
 * Map pid's cpuset relative mem number to system wide mem number.
 * Returns -1 if a cpuset cannot be allocated or pid's cpuset cannot be
 * loaded; otherwise the result of cpuset_c_rel_to_sys_mem().
 */
int cpuset_p_rel_to_sys_mem(pid_t pid, int mem)
{
	struct cpuset *task_cs = cpuset_alloc();
	int result = -1;

	if (task_cs != NULL && cpuset_cpusetofpid(task_cs, pid) >= 0)
		result = cpuset_c_rel_to_sys_mem(task_cs, mem);

	cpuset_free(task_cs);
	return result;
}
3058 
/*
 * Map system wide mem number to pid's cpuset relative mem number.
 *
 * Looks up the cpuset of task 'pid' and converts 'mem' into a position
 * relative to it.  Returns -1 if the cpuset cannot be allocated or
 * looked up.
 */
int cpuset_p_sys_to_rel_mem(pid_t pid, int mem)
{
	int relmem = -1;
	struct cpuset *pidset = cpuset_alloc();

	if (pidset != NULL && cpuset_cpusetofpid(pidset, pid) >= 0)
		relmem = cpuset_c_sys_to_rel_mem(pidset, mem);

	cpuset_free(pidset);
	return relmem;
}
3074 
3075 /*
3076  * Override glibc's calls for get/set affinity - they have
3077  * something using cpu_set_t that will die when NR_CPUS > 1024.
3078  * Go directly to the 'real' system calls.  Also override calls
3079  * for get_mempolicy and set_mempolicy.  None of these
3080  * calls are yet (July 2004) guaranteed to be in all glibc versions
3081  * that we care about.
3082  */
3083 
/* Invoke the sched_setaffinity system call directly via tst_syscall() */
static int sched_setaffinity(pid_t pid, unsigned len, unsigned long *mask)
{
	int rc = tst_syscall(__NR_sched_setaffinity, pid, len, mask);

	return rc;
}
3088 
/* Invoke the get_mempolicy system call directly via tst_syscall() */
static int get_mempolicy(int *policy, unsigned long *nmask,
			 unsigned long maxnode, void *addr, int flags)
{
	int rc = tst_syscall(__NR_get_mempolicy, policy, nmask, maxnode,
			     addr, flags);

	return rc;
}
3095 
/* Invoke the set_mempolicy system call directly via tst_syscall() */
static int set_mempolicy(int mode, unsigned long *nmask, unsigned long maxnode)
{
	int rc = tst_syscall(__NR_set_mempolicy, mode, nmask, maxnode);

	return rc;
}
3100 
/* Snapshot of a task's placement, as filled in by cpuset_get_placement() */
struct cpuset_placement {
	struct bitmask *cpus;	/* copy of the cpuset's cpus mask */
	struct bitmask *mems;	/* copy of the cpuset's mems mask */
	char *path;		/* strdup'd cpuset path of the task */
};
3106 
3107 /* Allocate and fill in a placement struct - cpatures current placement */
cpuset_get_placement(pid_t pid)3108 struct cpuset_placement *cpuset_get_placement(pid_t pid)
3109 {
3110 	struct cpuset_placement *plc;
3111 	struct cpuset *cp = NULL;
3112 	char buf[PATH_MAX];
3113 	int nbits;
3114 
3115 	if ((plc = calloc(1, sizeof(*plc))) == NULL)
3116 		goto err;
3117 
3118 	nbits = cpuset_cpus_nbits();
3119 	if ((plc->cpus = bitmask_alloc(nbits)) == NULL)
3120 		goto err;
3121 
3122 	nbits = cpuset_mems_nbits();
3123 	if ((plc->mems = bitmask_alloc(nbits)) == NULL)
3124 		goto err;
3125 
3126 	if ((cp = cpuset_alloc()) == NULL)
3127 		goto err;
3128 	if (cpuset_getcpusetpath(pid, buf, sizeof(buf)) == NULL)
3129 		goto err;
3130 	if (cpuset_query(cp, buf) < 0)
3131 		goto err;
3132 
3133 	bitmask_copy(plc->cpus, cp->cpus);
3134 	bitmask_copy(plc->mems, cp->mems);
3135 	plc->path = strdup(buf);
3136 
3137 	cpuset_free(cp);
3138 	return plc;
3139 err:
3140 	cpuset_free(cp);
3141 	cpuset_free_placement(plc);
3142 	return NULL;
3143 }
3144 
3145 /* Compare two placement structs - use to detect changes in placement */
cpuset_equal_placement(const struct cpuset_placement * plc1,const struct cpuset_placement * plc2)3146 int cpuset_equal_placement(const struct cpuset_placement *plc1,
3147 			   const struct cpuset_placement *plc2)
3148 {
3149 	return bitmask_equal(plc1->cpus, plc2->cpus) &&
3150 	    bitmask_equal(plc1->mems, plc2->mems) &&
3151 	    streq(plc1->path, plc2->path);
3152 }
3153 
3154 /* Free a placement struct */
cpuset_free_placement(struct cpuset_placement * plc)3155 void cpuset_free_placement(struct cpuset_placement *plc)
3156 {
3157 	if (!plc)
3158 		return;
3159 	bitmask_free(plc->cpus);
3160 	bitmask_free(plc->mems);
3161 	free(plc->path);
3162 	free(plc);
3163 }
3164 
3165 /*
3166  * A cpuset_fts_open() call constructs a linked list of entries
3167  * called a "cpuset_fts_tree", with one entry per cpuset below
3168  * the specified path.  The cpuset_fts_read() routine returns the
3169  * next entry on this list.  The various cpuset_fts_get_*() calls
3170  * return attributes of the specified entry.  The cpuset_fts_close()
3171  * call frees the linked list and all associated data.  All cpuset
3172  * entries and attributes for the cpuset_fts_tree returned from a
3173  * given cpuset_fts_open() call remain allocated and unchanged until
3174  * that cpuset_fts_tree is closed by a cpuset_fts_close() call.  Any
3175  * subsequent changes to the cpuset filesystem will go unnoticed
3176  * (not affect open cpuset_fts_tree's.)
3177  */
3178 
struct cpuset_fts_entry;
void cpuset_fts_rewind(struct cpuset_fts_tree *cs_tree);

/* Handle returned by cpuset_fts_open() */
struct cpuset_fts_tree {
	struct cpuset_fts_entry *head;	/* first entry of the linked list */
	struct cpuset_fts_entry *next;	/* entry the next cpuset_fts_read() returns */
};
3186 
/* One cpuset (directory) found by cpuset_fts_open() */
struct cpuset_fts_entry {
	struct cpuset_fts_entry *next;	/* following entry in the list, or NULL */
	struct cpuset *cpuset;		/* queried cpuset; NULL if not reached */
	struct stat *stat;		/* stat(2) of the directory; NULL if unreadable */
	char *path;			/* path relative to the cpuset mount point */
	int info;			/* one of the CPUSET_FTS_* codes */
	int err;			/* error number recorded with 'info' */
};
3195 
3196 /* Open a handle on a cpuset hierarchy.  All the real work is done here. */
cpuset_fts_open(const char * cpusetpath)3197 struct cpuset_fts_tree *cpuset_fts_open(const char *cpusetpath)
3198 {
3199 	FTS *fts = NULL;
3200 	FTSENT *ftsent;
3201 	char *path_argv[2];
3202 	char buf[PATH_MAX];
3203 	struct cpuset_fts_tree *cs_tree = NULL;
3204 	struct cpuset_fts_entry *ep;	/* the latest new list entry */
3205 	struct cpuset_fts_entry **pnlep;	/* ptr to next list entry ptr */
3206 	char *relpath;
3207 	int fts_flags;
3208 
3209 	fullpath(buf, sizeof(buf), cpusetpath);
3210 	path_argv[0] = buf;
3211 	path_argv[1] = NULL;
3212 
3213 	fts_flags = FTS_PHYSICAL | FTS_NOCHDIR | FTS_NOSTAT | FTS_XDEV;
3214 	fts = fts_open(path_argv, fts_flags, NULL);
3215 	if (fts == NULL)
3216 		goto err;
3217 
3218 	cs_tree = malloc(sizeof(*cs_tree));
3219 	if (cs_tree == NULL)
3220 		goto err;
3221 	pnlep = &cs_tree->head;
3222 	*pnlep = NULL;
3223 
3224 	while ((ftsent = fts_read(fts)) != NULL) {
3225 		if (ftsent->fts_info != FTS_D && ftsent->fts_info != FTS_DNR)
3226 			continue;
3227 
3228 		/* ftsent is a directory (perhaps unreadable) ==> cpuset */
3229 		ep = calloc(1, sizeof(*ep));
3230 		if (ep == NULL)
3231 			goto err;
3232 		*pnlep = ep;
3233 		pnlep = &ep->next;
3234 
3235 		/* Set entry's path, and if DNR, error */
3236 		relpath = ftsent->fts_path + strlen(cpusetmnt);
3237 		if (strlen(relpath) == 0)
3238 			relpath = "/";
3239 		ep->path = strdup(relpath);
3240 		if (ep->path == NULL)
3241 			goto err;
3242 		if (ftsent->fts_info == FTS_DNR) {
3243 			ep->info = CPUSET_FTS_ERR_DNR;
3244 			ep->err = ftsent->fts_errno;
3245 			continue;
3246 		}
3247 
3248 		/* ftsent is a -readable- cpuset: set entry's stat, etc */
3249 		ep->stat = calloc(1, sizeof(struct stat));
3250 		if (ep->stat == NULL)
3251 			goto err;
3252 		if (stat(ftsent->fts_path, ep->stat) < 0) {
3253 			ep->info = CPUSET_FTS_ERR_STAT;
3254 			ep->err = ftsent->fts_errno;
3255 			continue;
3256 		}
3257 
3258 		ep->cpuset = calloc(1, sizeof(struct cpuset));
3259 		if (ep->cpuset == NULL)
3260 			goto err;
3261 		if (cpuset_query(ep->cpuset, relpath) < 0) {
3262 			ep->info = CPUSET_FTS_ERR_CPUSET;
3263 			ep->err = errno;
3264 			continue;
3265 		}
3266 		ep->info = CPUSET_FTS_CPUSET;
3267 	}
3268 
3269 	(void)fts_close(fts);
3270 	cpuset_fts_rewind(cs_tree);
3271 	return cs_tree;
3272 
3273 err:
3274 	if (cs_tree)
3275 		cpuset_fts_close(cs_tree);
3276 	if (fts)
3277 		(void)fts_close(fts);
3278 	return NULL;
3279 }
3280 
3281 /* Return pointer to next cpuset entry in hierarchy */
cpuset_fts_read(struct cpuset_fts_tree * cs_tree)3282 const struct cpuset_fts_entry *cpuset_fts_read(struct cpuset_fts_tree *cs_tree)
3283 {
3284 	const struct cpuset_fts_entry *cs_entry = cs_tree->next;
3285 	if (cs_tree->next != NULL)	/* seek to next entry */
3286 		cs_tree->next = cs_tree->next->next;
3287 	return cs_entry;
3288 }
3289 
3290 /* Reverse list of cpusets, in place.  Simulates pre-order/post-order flip. */
cpuset_fts_reverse(struct cpuset_fts_tree * cs_tree)3291 void cpuset_fts_reverse(struct cpuset_fts_tree *cs_tree)
3292 {
3293 	struct cpuset_fts_entry *cs1, *cs2, *cs3;
3294 
3295 	/*
3296 	 * At each step, cs1 < cs2 < cs3 and the cs2->next pointer
3297 	 * is redirected from cs3 to cs1.
3298 	 */
3299 
3300 	cs1 = cs2 = NULL;
3301 	cs3 = cs_tree->head;
3302 	while (cs3) {
3303 		cs1 = cs2;
3304 		cs2 = cs3;
3305 		cs3 = cs3->next;
3306 		cs2->next = cs1;
3307 	}
3308 	cs_tree->head = cs2;
3309 	cpuset_fts_rewind(cs_tree);
3310 }
3311 
3312 /* Rewind cpuset list to beginning */
cpuset_fts_rewind(struct cpuset_fts_tree * cs_tree)3313 void cpuset_fts_rewind(struct cpuset_fts_tree *cs_tree)
3314 {
3315 	cs_tree->next = cs_tree->head;
3316 }
3317 
3318 /* Return pointer to nul-terminated cpuset path of entry in hierarchy */
cpuset_fts_get_path(const struct cpuset_fts_entry * cs_entry)3319 const char *cpuset_fts_get_path(const struct cpuset_fts_entry *cs_entry)
3320 {
3321 	return cs_entry->path;
3322 }
3323 
3324 /* Return pointer to stat(2) structure of a cpuset entry's directory */
cpuset_fts_get_stat(const struct cpuset_fts_entry * cs_entry)3325 const struct stat *cpuset_fts_get_stat(const struct cpuset_fts_entry *cs_entry)
3326 {
3327 	return cs_entry->stat;
3328 }
3329 
3330 /* Return pointer to cpuset structure of a cpuset entry */
cpuset_fts_get_cpuset(const struct cpuset_fts_entry * cs_entry)3331 const struct cpuset *cpuset_fts_get_cpuset(const struct cpuset_fts_entry
3332 					   *cs_entry)
3333 {
3334 	return cs_entry->cpuset;
3335 }
3336 
3337 /* Return value of errno (0 if no error) on attempted cpuset operations */
cpuset_fts_get_errno(const struct cpuset_fts_entry * cs_entry)3338 int cpuset_fts_get_errno(const struct cpuset_fts_entry *cs_entry)
3339 {
3340 	return cs_entry->err;
3341 }
3342 
3343 /* Return operation identity causing error */
cpuset_fts_get_info(const struct cpuset_fts_entry * cs_entry)3344 int cpuset_fts_get_info(const struct cpuset_fts_entry *cs_entry)
3345 {
3346 	return cs_entry->info;
3347 }
3348 
3349 /* Close a cpuset hierarchy handle (free's all associated memory) */
cpuset_fts_close(struct cpuset_fts_tree * cs_tree)3350 void cpuset_fts_close(struct cpuset_fts_tree *cs_tree)
3351 {
3352 	struct cpuset_fts_entry *cs_entry = cs_tree->head;
3353 
3354 	while (cs_entry) {
3355 		struct cpuset_fts_entry *ep = cs_entry;
3356 
3357 		cs_entry = cs_entry->next;
3358 		free(ep->path);
3359 		free(ep->stat);
3360 		cpuset_free(ep->cpuset);
3361 		free(ep);
3362 	}
3363 	free(cs_tree);
3364 }
3365 
/*
 * Bind current task to cpu (uses sched_setaffinity(2)).
 *
 * Builds a mask with only bit 'cpu' set and installs it as the calling
 * task's affinity.  Returns -1 if the mask cannot be allocated,
 * otherwise the result of the affinity call.
 */
int cpuset_cpubind(int cpu)
{
	struct bitmask *only = bitmask_alloc(cpuset_cpus_nbits());
	int rc;

	if (only == NULL)
		return -1;

	bitmask_setbit(only, cpu);
	rc = sched_setaffinity(0, bitmask_nbytes(only), bitmask_mask(only));
	bitmask_free(only);
	return rc;
}
3379 
3380 /*
3381  * int cpuset_latestcpu(pid_t pid)
3382  *
3383  * Return most recent CPU on which task pid executed.  If pid == 0,
3384  * examine current task.
3385  *
3386  * The last used CPU is visible for a given pid as field #39 (starting
3387  * with #1) in the file /proc/pid/stat.  Currently this file has 41
3388  * fields, in which case this is the 3rd to the last field.
3389  *
3390  * Unfortunately field #2 is a command name and might have embedded
3391  * whitespace.  So we can't just count white space separated fields.
3392  * Fortunately, this command name is surrounded by parentheses, as
3393  * for example "(sh)", and that closing parenthesis is the last ')'
3394  * character in the line.  No remaining fields can have embedded
3395  * whitespace or parentheses.  So instead of looking for the 39th
3396  * white space separated field, we can look for the 37th white space
3397  * separated field past the last ')' character on the line.
3398  */
3399 
/*
 * Return most recent CPU on which task pid executed (pid 0 means the
 * calling task).
 *
 * Reads /proc/<pid>/stat and parses the 37th whitespace separated field
 * following the last ')' on the line.  Returns the CPU number, or -1 if
 * the file cannot be opened or read, or if the field cannot be parsed
 * (errno is set to EINVAL in the parse-failure case).
 */
int cpuset_latestcpu(pid_t pid)
{
	char buf[PATH_MAX];
	char *bp;
	ssize_t nread;
	int saved_errno;
	int fd;
	int cpu = -1;

	if (pid == 0)
		snprintf(buf, sizeof(buf), "/proc/self/stat");
	else
		snprintf(buf, sizeof(buf), "/proc/%d/stat", (int)pid);

	if ((fd = open(buf, O_RDONLY)) < 0)
		return -1;

	/* Leave room for the terminator: read() does not add one */
	nread = read(fd, buf, sizeof(buf) - 1);
	saved_errno = errno;
	close(fd);
	if (nread < 1) {
		errno = saved_errno;
		return -1;
	}
	buf[nread] = '\0';

	bp = strrchr(buf, ')');
	if (bp)
		sscanf(bp + 1,
		       "%*s %*u %*u %*u %*u %*u %*u %*u "
		       "%*u %*u %*u %*u %*u %*u %*u %*u %*u %*u "
		       "%*u %*u %*u %*u %*u %*u %*u %*u %*u %*u "
		       "%*u %*u %*u %*u %*u %*u %*u %*u %d",	/* 37th field past ')' */
		       &cpu);
	if (cpu < 0)
		errno = EINVAL;
	return cpu;
}
3431 
3432 /* Bind current task to memory (uses set_mempolicy(2)) */
cpuset_membind(int mem)3433 int cpuset_membind(int mem)
3434 {
3435 	struct bitmask *bmp;
3436 	int r;
3437 
3438 	if ((bmp = bitmask_alloc(cpuset_mems_nbits())) == NULL)
3439 		return -1;
3440 	bitmask_setbit(bmp, mem);
3441 	r = set_mempolicy(MPOL_BIND, bitmask_mask(bmp), bitmask_nbits(bmp) + 1);
3442 	bitmask_free(bmp);
3443 	return r;
3444 }
3445 
3446 /* [optional] Return Memory Node holding page at specified addr */
cpuset_addr2node(void * addr)3447 int cpuset_addr2node(void *addr)
3448 {
3449 	int node = -1;
3450 
3451 	if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR)) {
3452 		/* I realize this seems redundant, but I _want_ to make sure
3453 		 * that this value is -1. */
3454 		node = -1;
3455 	}
3456 	return node;
3457 }
3458 
3459 /*
3460  * Transform cpuset into Text Format Representation in buffer 'buf',
3461  * of length 'buflen', nul-terminated if space allows.  Return number
3462  * of characters that would have been written, if enough space had
3463  * been available, in the same way that snprintf() does.
3464  */
3465 
3466 /* Export cpuset settings to a regular file */
cpuset_export(const struct cpuset * cp,char * buf,int buflen)3467 int cpuset_export(const struct cpuset *cp, char *buf, int buflen)
3468 {
3469 	char *tmp = NULL;
3470 	int n = 0;
3471 
3472 	if (cp->cpu_exclusive)
3473 		n += snprintf(buf + n, MAX(buflen - n, 0), "cpu_exclusive\n");
3474 
3475 	if (cp->mem_exclusive)
3476 		n += snprintf(buf + n, MAX(buflen - n, 0), "mem_exclusive\n");
3477 
3478 	if (cp->notify_on_release)
3479 		n += snprintf(buf + n, MAX(buflen - n, 0),
3480 			      "notify_on_release\n");
3481 
3482 	if (cp->memory_pressure_enabled)
3483 		n += snprintf(buf + n, MAX(buflen - n, 0),
3484 			      "memory_pressure_enabled\n");
3485 
3486 	if (cp->memory_migrate)
3487 		n += snprintf(buf + n, MAX(buflen - n, 0), "memory_migrate\n");
3488 
3489 	if (cp->memory_spread_page)
3490 		n += snprintf(buf + n, MAX(buflen - n, 0),
3491 			      "memory_spread_page\n");
3492 
3493 	if (cp->memory_spread_slab)
3494 		n += snprintf(buf + n, MAX(buflen - n, 0),
3495 			      "memory_spread_slab\n");
3496 
3497 	if ((tmp = sprint_mask_buf(cp->cpus)) == NULL)
3498 		return -1;
3499 	n += snprintf(buf + n, MAX(buflen - n, 0), "cpus %s\n", tmp);
3500 	free(tmp);
3501 	tmp = NULL;
3502 
3503 	if ((tmp = sprint_mask_buf(cp->mems)) == NULL)
3504 		return -1;
3505 	n += snprintf(buf + n, MAX(buflen - n, 0), "mems %s\n", tmp);
3506 	free(tmp);
3507 	tmp = NULL;
3508 
3509 	return n;
3510 }
3511 
/*
 * Parse list 'arg' into 'bmp' with bitmask_parselist().
 * Returns 0 on success; on failure returns -1 and, if 'emsg' is given,
 * writes a message of at most 'elen' bytes into it.
 */
static int import_list(UNUSED const char *tok, const char *arg,
		       struct bitmask *bmp, char *emsg, int elen)
{
	if (bitmask_parselist(arg, bmp) >= 0)
		return 0;

	if (emsg != NULL)
		snprintf(emsg, elen, "Invalid list format: %s", arg);
	return -1;
}
3522 
/* Convert the nul-terminated string 's' to lower case, in place */
static void stolower(char *s)
{
	char *p;

	/* tolower() needs an unsigned char value, not a possibly negative char */
	for (p = s; *p != '\0'; p++)
		*p = tolower((unsigned char)*p);
}
3531 
3532 /* Import cpuset settings from a regular file */
cpuset_import(struct cpuset * cp,const char * buf,int * elinenum,char * emsg,int elen)3533 int cpuset_import(struct cpuset *cp, const char *buf, int *elinenum,
3534 		  char *emsg, int elen)
3535 {
3536 	char *linebuf = NULL;
3537 	int linebuflen;
3538 	int linenum = 0;
3539 	int offset = 0;
3540 
3541 	linebuflen = strlen(buf) + 1;
3542 	if ((linebuf = malloc(linebuflen)) == NULL) {
3543 		if (emsg)
3544 			snprintf(emsg, elen, "Insufficient memory");
3545 		goto err;
3546 	}
3547 
3548 	while (slgets(linebuf, linebuflen, buf, &offset)) {
3549 		char *tok, *arg;
3550 		char *ptr;	/* for strtok_r */
3551 
3552 		linenum++;
3553 		if ((tok = strchr(linebuf, '#')) != NULL)
3554 			*tok = 0;
3555 		if ((tok = strtok_r(linebuf, " \t", &ptr)) == NULL)
3556 			continue;
3557 		stolower(tok);
3558 
3559 		arg = strtok_r(0, " \t", &ptr);
3560 
3561 		if (streq(tok, "cpu_exclusive")) {
3562 			cp->cpu_exclusive = 1;
3563 			goto eol;
3564 		}
3565 		if (streq(tok, "mem_exclusive")) {
3566 			cp->mem_exclusive = 1;
3567 			goto eol;
3568 		}
3569 		if (streq(tok, "notify_on_release")) {
3570 			cp->notify_on_release = 1;
3571 			goto eol;
3572 		}
3573 		if (streq(tok, "memory_pressure_enabled")) {
3574 			cp->memory_pressure_enabled = 1;
3575 			goto eol;
3576 		}
3577 		if (streq(tok, "memory_migrate")) {
3578 			cp->memory_migrate = 1;
3579 			goto eol;
3580 		}
3581 		if (streq(tok, "memory_spread_page")) {
3582 			cp->memory_spread_page = 1;
3583 			goto eol;
3584 		}
3585 		if (streq(tok, "memory_spread_slab")) {
3586 			cp->memory_spread_slab = 1;
3587 			goto eol;
3588 		}
3589 		if (streq(tok, "cpu") || streq(tok, "cpus")) {
3590 			if (import_list(tok, arg, cp->cpus, emsg, elen) < 0)
3591 				goto err;
3592 			goto eol;
3593 		}
3594 		if (streq(tok, "mem") || streq(tok, "mems")) {
3595 			if (import_list(tok, arg, cp->mems, emsg, elen) < 0)
3596 				goto err;
3597 			goto eol;
3598 		}
3599 		if (emsg)
3600 			snprintf(emsg, elen, "Unrecognized token: '%s'", tok);
3601 		goto err;
3602 eol:
3603 		if ((tok = strtok_r(0, " \t", &ptr)) != NULL) {
3604 			if (emsg)
3605 				snprintf(emsg, elen, "Surplus token: '%s'",
3606 					 tok);
3607 			goto err;
3608 		}
3609 		continue;
3610 	}
3611 
3612 	free(linebuf);
3613 
3614 	if (bitmask_isallclear(cp->cpus) && !bitmask_isallclear(cp->mems))
3615 		cpuset_localcpus(cp->mems, cp->cpus);
3616 	else if (!bitmask_isallclear(cp->cpus) && bitmask_isallclear(cp->mems))
3617 		cpuset_localmems(cp->cpus, cp->mems);
3618 
3619 	/*
3620 	 * All cpuset attributes are determined in an import.
3621 	 * Those that aren't explicitly specified are presumed
3622 	 * to be unchanged (zero, if it's a freshly allocated
3623 	 * struct cpuset.)
3624 	 */
3625 
3626 	cp->cpus_valid = 1;
3627 	cp->mems_valid = 1;
3628 	cp->cpu_exclusive_valid = 1;
3629 	cp->mem_exclusive_valid = 1;
3630 	cp->notify_on_release_valid = 1;
3631 	cp->memory_migrate_valid = 1;
3632 	cp->memory_pressure_enabled_valid = 1;
3633 	cp->memory_spread_page_valid = 1;
3634 	cp->memory_spread_slab_valid = 1;
3635 
3636 	return 0;
3637 err:
3638 	if (elinenum)
3639 		*elinenum = linenum;
3640 	free(linebuf);
3641 	return -1;
3642 }
3643 
/*
 * Pin current task CPU (and memory).
 *
 * 'relcpu' is a CPU number relative to the current task's cpuset.  The
 * task is first unpinned, then bound to the matching system CPU.  The
 * whole sequence is repeated until the task's placement is the same
 * before and after, so that a concurrent change of placement cannot
 * leave a stale binding behind.
 *
 * Returns 0 on success, -1 if check() fails, the placement cannot be
 * sampled, or the last unpin/bind attempt failed.
 */
int cpuset_pin(int relcpu)
{
	struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
	int cpu, r;

	if (check() < 0)
		return -1;

	do {
		cpuset_free_placement(plc1);
		plc1 = cpuset_get_placement(0);
		if (plc1 == NULL) {
			/* Cannot compare placements without a snapshot */
			r = -1;
			break;
		}

		r = 0;
		if (cpuset_unpin() < 0)
			r = -1;
		cpu = cpuset_p_rel_to_sys_cpu(0, relcpu);
		/* Do not try to bind to a CPU number that failed to map */
		if (cpu < 0 || cpuset_cpubind(cpu) < 0)
			r = -1;

		cpuset_free_placement(plc2);
		plc2 = cpuset_get_placement(0);
		if (plc2 == NULL) {
			r = -1;
			break;
		}
	} while (!cpuset_equal_placement(plc1, plc2));

	cpuset_free_placement(plc1);
	cpuset_free_placement(plc2);
	return r;
}
3672 
/*
 * Return number CPUs in current tasks cpuset.
 *
 * The count is re-taken until the task's placement is the same before
 * and after the measurement.  Returns -1 if check() fails or the
 * placement cannot be sampled.
 */
int cpuset_size(void)
{
	struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
	int r;

	if (check() < 0)
		return -1;

	do {
		cpuset_free_placement(plc1);
		plc1 = cpuset_get_placement(0);
		if (plc1 == NULL) {
			/* Cannot compare placements without a snapshot */
			r = -1;
			break;
		}

		r = cpuset_cpus_weight(0);

		cpuset_free_placement(plc2);
		plc2 = cpuset_get_placement(0);
		if (plc2 == NULL) {
			r = -1;
			break;
		}
	} while (!cpuset_equal_placement(plc1, plc2));

	cpuset_free_placement(plc1);
	cpuset_free_placement(plc2);
	return r;
}
3696 
/*
 * Return relative CPU number, within current cpuset, last executed on.
 *
 * The lookup is repeated until the task's placement is the same before
 * and after it.  Returns -1 if check() fails or the placement cannot be
 * sampled; otherwise the result of mapping cpuset_latestcpu(0) into the
 * current cpuset.
 */
int cpuset_where(void)
{
	struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
	int r;

	if (check() < 0)
		return -1;

	do {
		cpuset_free_placement(plc1);
		plc1 = cpuset_get_placement(0);
		if (plc1 == NULL) {
			/* Cannot compare placements without a snapshot */
			r = -1;
			break;
		}

		r = cpuset_p_sys_to_rel_cpu(0, cpuset_latestcpu(0));

		cpuset_free_placement(plc2);
		plc2 = cpuset_get_placement(0);
		if (plc2 == NULL) {
			r = -1;
			break;
		}
	} while (!cpuset_equal_placement(plc1, plc2));

	cpuset_free_placement(plc1);
	cpuset_free_placement(plc2);
	return r;
}
3720 
3721 /* Undo cpuset_pin - let current task have the run of all CPUs in its cpuset */
cpuset_unpin(void)3722 int cpuset_unpin(void)
3723 {
3724 	struct bitmask *cpus = NULL, *mems = NULL;
3725 	int r = -1;
3726 
3727 	if (check() < 0)
3728 		goto err;
3729 
3730 	/*
3731 	 * Don't need cpuset_*_placement() guard against concurrent
3732 	 * cpuset migration, because none of the following depends
3733 	 * on the tasks cpuset placement.
3734 	 */
3735 
3736 	if ((cpus = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
3737 		goto err;
3738 	bitmask_setall(cpus);
3739 	if (sched_setaffinity(0, bitmask_nbytes(cpus), bitmask_mask(cpus)) < 0)
3740 		goto err;
3741 
3742 	if ((mems = bitmask_alloc(cpuset_mems_nbits())) == NULL)
3743 		goto err;
3744 	if (set_mempolicy(MPOL_DEFAULT, bitmask_mask(mems),
3745 			  bitmask_nbits(mems) + 1) < 0)
3746 		goto err;
3747 	r = 0;
3748 	/* fall into ... */
3749 err:
3750 	bitmask_free(cpus);
3751 	bitmask_free(mems);
3752 	return r;
3753 
3754 }
3755 
3756 struct cpuset_function_list {
3757 	const char *fname;
3758 	void *func;
3759 } flist[] = {
3760 	{
3761 	"cpuset_version", cpuset_version}, {
3762 	"cpuset_alloc", cpuset_alloc}, {
3763 	"cpuset_free", cpuset_free}, {
3764 	"cpuset_cpus_nbits", cpuset_cpus_nbits}, {
3765 	"cpuset_mems_nbits", cpuset_mems_nbits}, {
3766 	"cpuset_setcpus", cpuset_setcpus}, {
3767 	"cpuset_setmems", cpuset_setmems}, {
3768 	"cpuset_set_iopt", cpuset_set_iopt}, {
3769 	"cpuset_set_sopt", cpuset_set_sopt}, {
3770 	"cpuset_getcpus", cpuset_getcpus}, {
3771 	"cpuset_getmems", cpuset_getmems}, {
3772 	"cpuset_cpus_weight", cpuset_cpus_weight}, {
3773 	"cpuset_mems_weight", cpuset_mems_weight}, {
3774 	"cpuset_get_iopt", cpuset_get_iopt}, {
3775 	"cpuset_get_sopt", cpuset_get_sopt}, {
3776 	"cpuset_localcpus", cpuset_localcpus}, {
3777 	"cpuset_localmems", cpuset_localmems}, {
3778 	"cpuset_cpumemdist", cpuset_cpumemdist}, {
3779 	"cpuset_cpu2node", cpuset_cpu2node}, {
3780 	"cpuset_addr2node", cpuset_addr2node}, {
3781 	"cpuset_create", cpuset_create}, {
3782 	"cpuset_delete", cpuset_delete}, {
3783 	"cpuset_query", cpuset_query}, {
3784 	"cpuset_modify", cpuset_modify}, {
3785 	"cpuset_getcpusetpath", cpuset_getcpusetpath}, {
3786 	"cpuset_cpusetofpid", cpuset_cpusetofpid}, {
3787 	"cpuset_mountpoint", cpuset_mountpoint}, {
3788 	"cpuset_collides_exclusive", cpuset_collides_exclusive}, {
3789 	"cpuset_nuke", cpuset_nuke}, {
3790 	"cpuset_init_pidlist", cpuset_init_pidlist}, {
3791 	"cpuset_pidlist_length", cpuset_pidlist_length}, {
3792 	"cpuset_get_pidlist", cpuset_get_pidlist}, {
3793 	"cpuset_freepidlist", cpuset_freepidlist}, {
3794 	"cpuset_move", cpuset_move}, {
3795 	"cpuset_move_all", cpuset_move_all}, {
3796 	"cpuset_move_cpuset_tasks", cpuset_move_cpuset_tasks}, {
3797 	"cpuset_migrate", cpuset_migrate}, {
3798 	"cpuset_migrate_all", cpuset_migrate_all}, {
3799 	"cpuset_reattach", cpuset_reattach}, {
3800 	"cpuset_open_memory_pressure", cpuset_open_memory_pressure}, {
3801 	"cpuset_read_memory_pressure", cpuset_read_memory_pressure}, {
3802 	"cpuset_close_memory_pressure", cpuset_close_memory_pressure}, {
3803 	"cpuset_c_rel_to_sys_cpu", cpuset_c_rel_to_sys_cpu}, {
3804 	"cpuset_c_sys_to_rel_cpu", cpuset_c_sys_to_rel_cpu}, {
3805 	"cpuset_c_rel_to_sys_mem", cpuset_c_rel_to_sys_mem}, {
3806 	"cpuset_c_sys_to_rel_mem", cpuset_c_sys_to_rel_mem}, {
3807 	"cpuset_p_rel_to_sys_cpu", cpuset_p_rel_to_sys_cpu}, {
3808 	"cpuset_p_sys_to_rel_cpu", cpuset_p_sys_to_rel_cpu}, {
3809 	"cpuset_p_rel_to_sys_mem", cpuset_p_rel_to_sys_mem}, {
3810 	"cpuset_p_sys_to_rel_mem", cpuset_p_sys_to_rel_mem}, {
3811 	"cpuset_get_placement", cpuset_get_placement}, {
3812 	"cpuset_equal_placement", cpuset_equal_placement}, {
3813 	"cpuset_free_placement", cpuset_free_placement}, {
3814 	"cpuset_fts_open", cpuset_fts_open}, {
3815 	"cpuset_fts_read", cpuset_fts_read}, {
3816 	"cpuset_fts_reverse", cpuset_fts_reverse}, {
3817 	"cpuset_fts_rewind", cpuset_fts_rewind}, {
3818 	"cpuset_fts_get_path", cpuset_fts_get_path}, {
3819 	"cpuset_fts_get_stat", cpuset_fts_get_stat}, {
3820 	"cpuset_fts_get_cpuset", cpuset_fts_get_cpuset}, {
3821 	"cpuset_fts_get_errno", cpuset_fts_get_errno}, {
3822 	"cpuset_fts_get_info", cpuset_fts_get_info}, {
3823 	"cpuset_fts_close", cpuset_fts_close}, {
3824 	"cpuset_cpubind", cpuset_cpubind}, {
3825 	"cpuset_latestcpu", cpuset_latestcpu}, {
3826 	"cpuset_membind", cpuset_membind}, {
3827 	"cpuset_export", cpuset_export}, {
3828 	"cpuset_import", cpuset_import}, {
3829 	"cpuset_function", cpuset_function}, {
3830 	"cpuset_pin", cpuset_pin}, {
3831 	"cpuset_size", cpuset_size}, {
3832 	"cpuset_where", cpuset_where}, {
3833 "cpuset_unpin", cpuset_unpin},};
3834 
3835 /* Return pointer to a libcpuset.so function, or NULL */
cpuset_function(const char * function_name)3836 void *cpuset_function(const char *function_name)
3837 {
3838 	unsigned int i;
3839 
3840 	for (i = 0; i < sizeof(flist) / sizeof(flist[0]); i++)
3841 		if (streq(function_name, flist[i].fname))
3842 			return flist[i].func;
3843 	return NULL;
3844 }
3845 
/* Fortran interface to basic cpuset routines */

/* cpuset_pin() taking its argument by reference */
int cpuset_pin_(int *ptr_relcpu)
{
	int relcpu = *ptr_relcpu;

	return cpuset_pin(relcpu);
}
3851 
/* Trailing-underscore alias that forwards to cpuset_size() */
int cpuset_size_(void)
{
	int ncpus = cpuset_size();

	return ncpus;
}
3856 
/* Trailing-underscore alias that forwards to cpuset_where() */
int cpuset_where_(void)
{
	int relcpu = cpuset_where();

	return relcpu;
}
3861 
/* Trailing-underscore alias that forwards to cpuset_unpin() */
int cpuset_unpin_(void)
{
	int rc = cpuset_unpin();

	return rc;
}
3866 
3867 #endif /* HAVE_LINUX_MEMPOLICY_H */
3868