• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * cpuset user library implementation.
3  *
4  * Copyright (c) 2006-2007 Silicon Graphics, Inc. All rights reserved.
5  *
6  * Paul Jackson <pj@sgi.com>
7  */
8 
9 /*
10  *  This program is free software; you can redistribute it and/or modify
11  *  it under the terms of the GNU Lesser General Public License as published by
12  *  the Free Software Foundation; either version 2.1 of the License, or
13  *  (at your option) any later version.
14  *
15  *  This program is distributed in the hope that it will be useful,
16  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
17  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  *  GNU Lesser General Public License for more details.
19  *
20  *  You should have received a copy of the GNU Lesser General Public License
21  *  along with this program; if not, write to the Free Software
22  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
23  */
24 
25 #define _XOPEN_SOURCE 500	/* need to see pread() */
26 #define _BSD_SOURCE 1		/* need to see syscall() */
27 #include <unistd.h>
28 
29 #include <ctype.h>
30 #include <dirent.h>
31 #include <errno.h>
32 #include <fcntl.h>
33 #include <fts.h>
34 #include <limits.h>
35 #include <signal.h>
36 #include <stdint.h>
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <sys/stat.h>
41 #include <sys/syscall.h>
42 #include <sys/types.h>
43 #include <time.h>
44 #include <utime.h>
45 #include <sys/utsname.h>	/* for cpuset_would_crash_kernel() */
46 
47 #include "bitmask.h"
48 #include "cpuset.h"
49 #include "common.h"
50 #include "test.h"
51 #include "lapi/syscalls.h"
52 #include "config.h"
53 
54 #if HAVE_LINUX_MEMPOLICY_H
55 #include <linux/mempolicy.h>
56 
57 /* Bump version, and update Change History, when libcpuset API changes */
58 #define CPUSET_VERSION 3
59 
60 /*
61  * For a history of what changed in each version, see the "Change
62  * History" section, at the end of the libcpuset master document.
63  */
64 
/* Return the libcpuset API version number (CPUSET_VERSION, currently 3). */
int cpuset_version(void)
{
	return CPUSET_VERSION;
}
69 
/*
 * In-memory representation of one cpuset's attributes.  The cpus and
 * mems bitmasks are heap allocated (see cpuset_alloc()/cpuset_free());
 * the remaining fields mirror the per-cpuset control files found
 * under the cpuset mount point.
 */
struct cpuset {
	struct bitmask *cpus;	/* CPUs allowed to tasks in this cpuset */
	struct bitmask *mems;	/* Memory Nodes allowed to those tasks */
	char cpu_exclusive;
	char mem_exclusive;
	char mem_hardwall;
	char notify_on_release;
	char memory_migrate;
	char memory_pressure_enabled;
	char memory_spread_page;
	char memory_spread_slab;
	char sched_load_balance;
	int sched_relax_domain_level;	/* an integer level, not a boolean */

	/*
	 * Each field 'x' above gets an 'x_valid' field below.
	 * The apply_cpuset_settings() will only set those fields whose
	 * corresponding *_valid flags are set.  The cpuset_alloc()
	 * routine clears these flags as part of the clear in calloc(),
	 * and the various cpuset_set*() routines set these flags when
	 * setting the corresponding value.
	 *
	 * The purpose of these valid fields is to ensure that when
	 * we create a new cpuset, we don't accidentally overwrite
	 * some non-zero kernel default, such as an inherited
	 * memory_spread_* flag, just because the user application
	 * code didn't override the default zero settings resulting
	 * from the calloc() call in cpuset_alloc().
	 *
	 * The choice of 'char' for the type of the flags above,
	 * but a bitfield for the flags below, is somewhat capricious.
	 */
	unsigned cpus_valid:1;
	unsigned mems_valid:1;
	unsigned cpu_exclusive_valid:1;
	unsigned mem_exclusive_valid:1;
	unsigned mem_hardwall_valid:1;
	unsigned notify_on_release_valid:1;
	unsigned memory_migrate_valid:1;
	unsigned memory_pressure_enabled_valid:1;
	unsigned memory_spread_page_valid:1;
	unsigned memory_spread_slab_valid:1;
	unsigned sched_load_balance_valid:1;
	unsigned sched_relax_domain_level_valid:1;

	/*
	 * if the relative variable was modified, use following flags
	 * to put a mark
	 */
	unsigned cpus_dirty:1;
	unsigned mems_dirty:1;
	unsigned cpu_exclusive_dirty:1;
	unsigned mem_exclusive_dirty:1;
	unsigned mem_hardwall_dirty:1;
	unsigned notify_on_release_dirty:1;
	unsigned memory_migrate_dirty:1;
	unsigned memory_pressure_enabled_dirty:1;
	unsigned memory_spread_page_dirty:1;
	unsigned memory_spread_slab_dirty:1;
	unsigned sched_load_balance_dirty:1;
	unsigned sched_relax_domain_level_dirty:1;
};
132 
133 /* Presumed cpuset file system mount point */
134 static const char *cpusetmnt = "/dev/cpuset";
135 
136 /* Stashed copy of cpunodemap[], mapping each cpu to its node. */
137 static const char *mapfile = "/var/run/cpunodemap";
138 
139 /* The primary source for the cpunodemap[] is available below here. */
140 static const char *sysdevices = "/sys/devices/system";
141 
142 /* small buffer size - for reading boolean flags or map file (1 or 2 ints) */
143 #define SMALL_BUFSZ 16
144 
145 /*
146  * The 'mask_size_file' is used to ferret out the kernel cpumask_t
147  * and nodemask_t sizes.  The lines in this file that begin with the
148  * strings 'cpumask_prefix' and 'nodemask_prefix' display a cpumask
149  * and nodemask string, respectively.  The lengths of these strings
150  * reflect the kernel's internal cpumask_t and nodemask_t sizes,
151  * which sizes are needed to correctly call the sched_setaffinity
152  * and set_mempolicy system calls, and to size user level
153  * bitmasks to match the kernels.
154  */
155 
156 static const char *mask_size_file = "/proc/self/status";
157 static const char *cpumask_prefix = "Cpus_allowed:\t";
158 static const char *nodemask_prefix = "Mems_allowed:\t";
159 
160 /*
161  * Sizes of kernel cpumask_t and nodemask_t bitmaps, in bits.
162  *
163  * The first time we need these, we parse the Cpus_allowed and
164  * Mems_allowed lines from mask_size_file ("/proc/self/status").
165  */
166 
167 static int cpumask_sz;
168 static int nodemask_sz;
169 
170 /*
171  * These defaults only kick in if we fail to size the kernel
172  * cpumask and nodemask by reading the Cpus_allowed and
173  * Mems_allowed fields from the /proc/self/status file.
174  */
175 
176 #define DEFCPUBITS (512)
177 #define DEFNODEBITS (DEFCPUBITS/2)
178 
179 /*
180  * Arch-neutral API for obtaining NUMA distances between CPUs
181  * and Memory Nodes, via the files:
182  *	/sys/devices/system/node/nodeN/distance
183  * which have lines such as:
184  *	46 66 10 20
185  * which say that for cpu on node N (from the path above), the
186  * distance to nodes 0, 1, 2, and 3 are 46, 66, 10, and 20,
187  * respectively.
188  */
189 
190 static const char *distance_directory = "/sys/devices/system/node";
191 
192 /*
193  * Someday, we should disable, then later discard, the SN code
194  * marked ALTERNATE_SN_DISTMAP.
195  */
196 
197 #define ALTERNATE_SN_DISTMAP 1
198 #ifdef ALTERNATE_SN_DISTMAP
199 
200 /*
201  * Alternative SN (SGI ia64) architecture specific API for obtaining
202  * NUMA distances between CPUs and Memory Nodes is via the file
203  * /proc/sgi_sn/sn_topology, which has lines such as:
204  *
205  *   node 2 001c14#0 local asic SHub_1.1, nasid 0x4, dist 46:66:10:20
206  *
207  * which says that for each CPU on node 2, the distance to nodes
208  * 0, 1, 2 and 3 are 46, 66, 10 and 20, respectively.
209  *
210  * This file has other lines as well, which start with other
211  * keywords than "node".  Ignore these other lines.
212  */
213 
214 static const char *sn_topology = "/proc/sgi_sn/sn_topology";
215 static const char *sn_top_node_prefix = "node ";
216 
217 #endif
218 
/*
 * Check that cpusets supported, /dev/cpuset mounted.
 * If ok, return 0.
 * If not, return -1 and set errno:
 *	ENOSYS - kernel doesn't support cpusets
 *	ENODEV - /dev/cpuset not mounted
 */

static enum {
	check_notdone,
	check_enosys,
	check_enodev,
	check_ok
} check_state = check_notdone;

static int check()
{
	struct stat statbuf;

	/* Probe once and cache the outcome in check_state. */
	if (check_state == check_notdone) {
		if (stat("/proc/self/cpuset", &statbuf) < 0)
			check_state = check_enosys;
		else if (stat("/dev/cpuset/tasks", &statbuf) < 0)
			check_state = check_enodev;
		else
			check_state = check_ok;
	}

	if (check_state == check_enosys) {
		errno = ENOSYS;
		return -1;
	}
	if (check_state == check_enodev) {
		errno = ENODEV;
		return -1;
	}
	return 0;
}
264 
/*
 * Strip any trailing newline and carriage-return characters from
 * string s, in place.
 *
 * Indexing from the string length avoids the original's
 * "s + strlen(s) - 1" pointer, which for an empty string points
 * one before the start of the buffer - undefined behavior.
 */
static void chomp(char *s)
{
	size_t n = strlen(s);

	while (n > 0 && (s[n - 1] == '\n' || s[n - 1] == '\r'))
		s[--n] = '\0';
}
276 
/*
 * Determine number of bytes in a seekable open file, without
 * assuming that stat(2) on that file has a useful size.
 * Has side effect of leaving the file rewound to the beginning.
 */
static int filesize(FILE * fp)
{
	int nbytes;

	rewind(fp);
	for (nbytes = 0; fgetc(fp) != EOF; nbytes++)
		continue;
	rewind(fp);
	return nbytes;
}
291 
/* True (non-zero) if strings s1 and s2 are identical. */
static int streq(const char *s1, const char *s2)
{
	return !strcmp(s1, s2);
}
297 
/* True (non-zero) if string 'pre' is a prefix of string 's'. */
static int strprefix(const char *s, const char *pre)
{
	size_t prelen = strlen(pre);

	return strncmp(s, pre, prelen) == 0;
}
303 
/*
 * char *flgets(char *buf, int buflen, FILE *fp)
 *
 * Obtain one line from input file fp.  Copy up to first
 * buflen-1 chars of line into buffer buf, discarding any remainder
 * of line.  Stop reading at newline, discarding newline.
 * Nul terminate result and return pointer to buffer buf
 * on success, or NULL if nothing more to read or failure.
 */

static char *flgets(char *buf, int buflen, FILE * fp)
{
	int c = -1;
	char *bp;

	bp = buf;
	/* Copy chars until the buffer fills, a newline, or EOF. */
	while ((--buflen > 0) && ((c = getc(fp)) >= 0)) {
		if (c == '\n')
			goto newline;
		*bp++ = c;
	}
	/* EOF before anything was copied: nothing more to read. */
	if ((c < 0) && (bp == buf))
		return NULL;

	/* Buffer filled mid-line (last char read was an ordinary
	 * char, not newline/EOF): discard the rest of the line,
	 * including its newline. */
	if (c > 0) {
		while ((c = getc(fp)) >= 0) {
			if (c == '\n')
				break;
		}
	}

newline:
	*bp++ = '\0';
	return buf;
}
339 
/*
 * sgetc(const char *inputbuf, int *offsetptr)
 *
 * Return next char from nul-terminated input buffer inputbuf,
 * starting at offset *offsetptr.  Increment *offsetptr.
 * If next char would be nul ('\0'), return EOF and don't
 * increment *offsetptr.
 */

static int sgetc(const char *inputbuf, int *offsetptr)
{
	char ch = inputbuf[*offsetptr];

	if (ch == '\0')
		return EOF;
	*offsetptr += 1;
	return ch;
}
360 
/*
 * char *slgets(char *buf, int buflen, const char *inputbuf, int *offsetptr)
 *
 * Obtain next line from nul-terminated input buffer 'inputbuf',
 * starting at offset *offsetptr.  Copy up to first buflen-1
 * chars of line into output buffer buf, discarding any remainder
 * of line.  Stop reading at newline, discarding newline.
 * Nul terminate result and return pointer to output buffer
 * buf on success, or NULL if nothing more to read.
 */

static char *slgets(char *buf, int buflen, const char *inputbuf, int *offsetptr)
{
	int c = -1;
	char *bp;

	bp = buf;
	/* Copy chars until the output buffer fills, a newline, or
	 * end of input (sgetc() returns EOF). */
	while ((--buflen > 0) && ((c = sgetc(inputbuf, offsetptr)) >= 0)) {
		if (c == '\n')
			goto newline;
		*bp++ = c;
	}
	/* End of input before anything was copied: no more lines. */
	if ((c < 0) && (bp == buf))
		return NULL;

	/* Output buffer filled mid-line: discard the remainder of
	 * the line, including its newline. */
	if (c > 0) {
		while ((c = sgetc(inputbuf, offsetptr)) >= 0) {
			if (c == '\n')
				break;
		}
	}

newline:
	*bp++ = '\0';
	return buf;
}
397 
/*
 * time_t get_mtime(const char *path)
 *
 * Return modification time of the file at 'path', or 0 if it
 * cannot be stat'd.
 */

static time_t get_mtime(const char *path)
{
	struct stat statbuf;

	return (stat(path, &statbuf) == 0) ? statbuf.st_mtime : 0;
}
412 
/*
 * int set_mtime(const char *path, time_t mtime)
 *
 * Set both access and modification time of file 'path' to 'mtime'.
 * Return 0 on success, or -1 on error, setting errno.
 */

static int set_mtime(const char *path, time_t mtime)
{
	struct utimbuf times = {
		.actime = mtime,
		.modtime = mtime,
	};

	return utime(path, &times);
}
428 
/*
 * True if two pathnames resolve to same file (same device and inode).
 * False if either path can not be stat'd,
 * or if the two paths resolve to a different file.
 */

static int samefile(const char *path1, const char *path2)
{
	struct stat sb1, sb2;

	if (stat(path1, &sb1) != 0 || stat(path2, &sb2) != 0)
		return 0;
	return sb1.st_ino == sb2.st_ino && sb1.st_dev == sb2.st_dev;
}
445 
/*
 * In place path compression.  Remove duplicate slashes, "."
 * components, and trailing slashes.  A single leading slash is
 * preserved.  If nothing remains, the result is ".".  Returns p.
 * (".." components are deliberately left alone.)
 */
static char *pathcomp(char *p)
{
	char *src = p;
	char *dst = p;

	if (!p || !*p)
		return p;
	/* keep exactly one leading slash for absolute paths */
	if (*src == '/')
		*dst++ = *src++;
	for (;;) {
		/* skip any run of slashes */
		while (*src == '/')
			src++;
		if (!*src) {
			if (dst == p)
				*dst++ = '.';	/* nothing left: "." */
			*dst = '\0';
			return p;
		}
		/* drop a lone "." component */
		if (src[0] == '.' && (src[1] == '/' || src[1] == '\0')) {
			src++;
			continue;
		}
		/* separate components with a single slash */
		if (dst != p && dst[-1] != '/')
			*dst++ = '/';
		/* copy one component verbatim */
		while (*src && *src != '/')
			*dst++ = *src++;
	}
}
483 
/*
 * pathcat2(buf, buflen, name1, name2)
 *
 * Fill buf (of size buflen) with "name1/name2", compressed by
 * pathcomp(), and return buf.
 */

static char *pathcat2(char *buf, int buflen, const char *name1,
		      const char *name2)
{
	snprintf(buf, buflen, "%s/%s", name1, name2);
	return pathcomp(buf);
}
496 
/*
 * pathcat3(buf, buflen, name1, name2, name3)
 *
 * Fill buf (of size buflen) with "name1/name2/name3", compressed
 * by pathcomp(), and return buf.
 */

static char *pathcat3(char *buf, int buflen, const char *name1,
		      const char *name2, const char *name3)
{
	snprintf(buf, buflen, "%s/%s/%s", name1, name2, name3);
	return pathcomp(buf);
}
509 
510 /*
511  * fullpath(buf, buflen, name)
512  *
513  * Put full path of cpuset 'name' in buffer 'buf'.  If name
514  * starts with a slash (``/``) character, then this a path
515  * relative to ``/dev/cpuset``, otherwise it is relative to
516  * the current tasks cpuset.  Return 0 on success, else
517  * -1 on error, setting errno.
518  */
519 
fullpath(char * buf,int buflen,const char * name)520 static int fullpath(char *buf, int buflen, const char *name)
521 {
522 	int len;
523 
524 	/* easy case */
525 	if (*name == '/') {
526 		pathcat2(buf, buflen, cpusetmnt, name);
527 		pathcomp(buf);
528 		return 0;
529 	}
530 
531 	/* hard case */
532 	snprintf(buf, buflen, "%s/", cpusetmnt);
533 	len = strlen(buf);
534 	if (cpuset_getcpusetpath(0, buf + len, buflen - len) == NULL)
535 		return -1;
536 	if (strlen(buf) >= buflen - 1 - strlen(name)) {
537 		errno = E2BIG;
538 		return -1;
539 	}
540 	strcat(buf, "/");
541 	strcat(buf, name);
542 	pathcomp(buf);
543 	return 0;
544 }
545 
/*
 * fullpath2(buf, buflen, name1, name2)
 *
 * Like fullpath(), only concatenate two pathname components on end.
 * Return 0 on success, -1 on error setting errno (E2BIG if the
 * result won't fit in buf).
 */

static int fullpath2(char *buf, int buflen, const char *name1,
		     const char *name2)
{
	if (fullpath(buf, buflen, name1) < 0)
		return -1;
	/*
	 * Additive form avoids the size_t underflow of
	 * "strlen(buf) >= buflen - 1 - strlen(name2)" when name2 is
	 * longer than the buffer.
	 */
	if (strlen(buf) + strlen(name2) + 1 >= (size_t)buflen) {
		errno = E2BIG;
		return -1;
	}
	strcat(buf, "/");
	strcat(buf, name2);
	pathcomp(buf);
	return 0;
}
566 
/*
 * Convert the string length of an ascii hex mask to the number
 * of bits represented by that mask.
 *
 * The cpumask and nodemask values in /proc/self/status are in an
 * ascii format that uses 9 characters for each 32 bits of mask.
 */
static int s2nbits(const char *s)
{
	size_t nchars = strlen(s);

	return nchars * 32 / 9;
}
578 
/*
 * Size the kernel's cpumask_t and nodemask_t by parsing the
 * Cpus_allowed and Mems_allowed lines of /proc/self/status
 * (mask_size_file), storing the results in the file statics
 * cpumask_sz and nodemask_sz.  Falls back to DEFCPUBITS and
 * DEFNODEBITS if the file can't be read or the lines are missing.
 */
static void update_mask_sizes()
{
	FILE *fp = NULL;
	char *buf = NULL;
	int fsize;

	if ((fp = fopen(mask_size_file, "r")) == NULL)
		goto done;
	fsize = filesize(fp);
	if ((buf = malloc(fsize)) == NULL)
		goto done;

	/*
	 * Beware: mask sizing arithmetic is fussy.
	 * The trailing newline left by fgets() is required.
	 */
	while (fgets(buf, fsize, fp)) {
		if (strprefix(buf, cpumask_prefix))
			cpumask_sz = s2nbits(buf + strlen(cpumask_prefix));
		if (strprefix(buf, nodemask_prefix))
			nodemask_sz = s2nbits(buf + strlen(nodemask_prefix));
	}
done:
	free(buf);
	if (fp != NULL)
		fclose(fp);
	/* On any failure above, fall back to compiled-in defaults. */
	if (cpumask_sz == 0)
		cpumask_sz = DEFCPUBITS;
	if (nodemask_sz == 0)
		nodemask_sz = DEFNODEBITS;
}
610 
611 /* Allocate a new struct cpuset */
cpuset_alloc()612 struct cpuset *cpuset_alloc()
613 {
614 	struct cpuset *cp = NULL;
615 	int nbits;
616 
617 	if ((cp = calloc(1, sizeof(struct cpuset))) == NULL)
618 		goto err;
619 
620 	nbits = cpuset_cpus_nbits();
621 	if ((cp->cpus = bitmask_alloc(nbits)) == NULL)
622 		goto err;
623 
624 	nbits = cpuset_mems_nbits();
625 	if ((cp->mems = bitmask_alloc(nbits)) == NULL)
626 		goto err;
627 
628 	return cp;
629 err:
630 	if (cp && cp->cpus)
631 		bitmask_free(cp->cpus);
632 	if (cp && cp->mems)
633 		bitmask_free(cp->mems);
634 	free(cp);
635 	return NULL;
636 }
637 
638 /* Free struct cpuset *cp */
cpuset_free(struct cpuset * cp)639 void cpuset_free(struct cpuset *cp)
640 {
641 	if (!cp)
642 		return;
643 	if (cp->cpus)
644 		bitmask_free(cp->cpus);
645 	if (cp->mems)
646 		bitmask_free(cp->mems);
647 	free(cp);
648 }
649 
/* Number of bits in a CPU bitmask on current system */
/* Lazily sized on first call via update_mask_sizes(). */
int cpuset_cpus_nbits()
{
	if (cpumask_sz == 0)
		update_mask_sizes();
	return cpumask_sz;
}
657 
/* Number of bits in a Memory bitmask on current system */
/* Lazily sized on first call via update_mask_sizes(). */
int cpuset_mems_nbits()
{
	if (nodemask_sz == 0)
		update_mask_sizes();
	return nodemask_sz;
}
665 
666 /* Set CPUs in cpuset cp to bitmask cpus */
cpuset_setcpus(struct cpuset * cp,const struct bitmask * cpus)667 int cpuset_setcpus(struct cpuset *cp, const struct bitmask *cpus)
668 {
669 	if (cp->cpus)
670 		bitmask_free(cp->cpus);
671 	cp->cpus = bitmask_alloc(bitmask_nbits(cpus));
672 	if (cp->cpus == NULL)
673 		return -1;
674 	bitmask_copy(cp->cpus, cpus);
675 	cp->cpus_valid = 1;
676 	cp->cpus_dirty = 1;
677 	return 0;
678 }
679 
680 /* Set Memory Nodes in cpuset cp to bitmask mems */
cpuset_setmems(struct cpuset * cp,const struct bitmask * mems)681 int cpuset_setmems(struct cpuset *cp, const struct bitmask *mems)
682 {
683 	if (cp->mems)
684 		bitmask_free(cp->mems);
685 	cp->mems = bitmask_alloc(bitmask_nbits(mems));
686 	if (cp->mems == NULL)
687 		return -1;
688 	bitmask_copy(cp->mems, mems);
689 	cp->mems_valid = 1;
690 	cp->mems_dirty = 1;
691 	return 0;
692 }
693 
694 /* Set integer value optname of cpuset cp */
cpuset_set_iopt(struct cpuset * cp,const char * optionname,int value)695 int cpuset_set_iopt(struct cpuset *cp, const char *optionname, int value)
696 {
697 	if (streq(optionname, "cpu_exclusive")) {
698 		cp->cpu_exclusive = ! !value;
699 		cp->cpu_exclusive_valid = 1;
700 		cp->cpu_exclusive_dirty = 1;
701 	} else if (streq(optionname, "mem_exclusive")) {
702 		cp->mem_exclusive = ! !value;
703 		cp->mem_exclusive_valid = 1;
704 		cp->mem_exclusive_dirty = 1;
705 	} else if (streq(optionname, "mem_hardwall")) {
706 		cp->mem_hardwall = ! !value;
707 		cp->mem_hardwall_valid = 1;
708 		cp->mem_hardwall_dirty = 1;
709 	} else if (streq(optionname, "notify_on_release")) {
710 		cp->notify_on_release = ! !value;
711 		cp->notify_on_release_valid = 1;
712 		cp->notify_on_release_dirty = 1;
713 	} else if (streq(optionname, "memory_pressure_enabled")) {
714 		cp->memory_pressure_enabled = ! !value;
715 		cp->memory_pressure_enabled_valid = 1;
716 		cp->memory_pressure_enabled_dirty = 1;
717 	} else if (streq(optionname, "memory_migrate")) {
718 		cp->memory_migrate = ! !value;
719 		cp->memory_migrate_valid = 1;
720 		cp->memory_migrate_dirty = 1;
721 	} else if (streq(optionname, "memory_spread_page")) {
722 		cp->memory_spread_page = ! !value;
723 		cp->memory_spread_page_valid = 1;
724 		cp->memory_spread_page_dirty = 1;
725 	} else if (streq(optionname, "memory_spread_slab")) {
726 		cp->memory_spread_slab = ! !value;
727 		cp->memory_spread_slab_valid = 1;
728 		cp->memory_spread_slab_dirty = 1;
729 	} else if (streq(optionname, "sched_load_balance")) {
730 		cp->sched_load_balance = ! !value;
731 		cp->sched_load_balance_valid = 1;
732 		cp->sched_load_balance_dirty = 1;
733 	} else if (streq(optionname, "sched_relax_domain_level")) {
734 		cp->sched_relax_domain_level = value;
735 		cp->sched_relax_domain_level_valid = 1;
736 		cp->sched_relax_domain_level_dirty = 1;
737 	} else
738 		return -2;	/* optionname not recognized */
739 	return 0;
740 }
741 
/* [optional] Set string value optname */
/*
 * Placeholder: no string-valued options exist yet, so every name is
 * reported unrecognized (-2), matching cpuset_set_iopt()'s convention.
 */
int cpuset_set_sopt(UNUSED struct cpuset *cp, UNUSED const char *optionname,
		    UNUSED const char *value)
{
	return -2;		/* For now, all string options unrecognized */
}
748 
/*
 * Return open file descriptor for reading the memory_pressure file
 * of the named cpuset, or -1 on error (setting errno).
 *
 * The fullpath2() result is now checked: on failure (e.g. E2BIG
 * path overflow) the original went on to open() an unspecified
 * buffer instead of failing cleanly.
 */
int cpuset_open_memory_pressure(const char *cpusetpath)
{
	char buf[PATH_MAX];

	if (fullpath2(buf, sizeof(buf), cpusetpath, "memory_pressure") < 0)
		return -1;
	return open(buf, O_RDONLY);
}
757 
/*
 * Return current memory_pressure of the cpuset behind open handle
 * 'han', or -1 on read error.
 *
 * Reads at most sizeof(buf)-1 bytes and nul-terminates before
 * atoi(): the original passed the full, possibly unterminated
 * buffer to atoi(), which may read out of bounds.
 */
int cpuset_read_memory_pressure(int han)
{
	char buf[SMALL_BUFSZ];
	ssize_t nread;

	nread = pread(han, buf, sizeof(buf) - 1, 0L);
	if (nread < 0)
		return -1;
	buf[nread] = '\0';
	return atoi(buf);
}
767 
/* Close handle for reading memory pressure. */
/* Any close() error is ignored; the handle is simply released. */
void cpuset_close_memory_pressure(int han)
{
	close(han);
}
773 
774 /*
775  * Resolve cpuset pointer (to that of current task if cp == NULL).
776  *
777  * If cp not NULL, just return it.  If cp is NULL, return pointer
778  * to temporary cpuset for current task, and set *cp_tofree to
779  * pointer to that same temporary cpuset, to be freed later.
780  *
781  * Return NULL and set errno on error.  Errors can occur when
782  * resolving the current tasks cpuset.
783  */
resolve_cp(const struct cpuset * cp,struct cpuset ** cp_tofree)784 static const struct cpuset *resolve_cp(const struct cpuset *cp,
785 				       struct cpuset **cp_tofree)
786 {
787 	const struct cpuset *rcp;
788 
789 	if (cp) {
790 		rcp = cp;
791 	} else {
792 		struct cpuset *cp1 = cpuset_alloc();
793 		if (cp1 == NULL)
794 			goto err;
795 		if (cpuset_cpusetofpid(cp1, 0) < 0) {
796 			cpuset_free(cp1);
797 			goto err;
798 		}
799 		*cp_tofree = cp1;
800 		rcp = cp1;
801 	}
802 	return rcp;
803 err:
804 	return NULL;
805 }
806 
807 /* Write CPUs in cpuset cp (current task if cp == NULL) to bitmask cpus */
cpuset_getcpus(const struct cpuset * cp,struct bitmask * cpus)808 int cpuset_getcpus(const struct cpuset *cp, struct bitmask *cpus)
809 {
810 	struct cpuset *cp_tofree = NULL;
811 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
812 
813 	if (!cp1)
814 		goto err;
815 	if (cp1->cpus == NULL) {
816 		errno = EINVAL;
817 		goto err;
818 	}
819 	bitmask_copy(cpus, cp1->cpus);
820 	cpuset_free(cp_tofree);
821 	return 0;
822 err:
823 	cpuset_free(cp_tofree);
824 	return -1;
825 }
826 
827 /* Write Memory Nodes in cp (current task if cp == NULL) to bitmask mems */
cpuset_getmems(const struct cpuset * cp,struct bitmask * mems)828 int cpuset_getmems(const struct cpuset *cp, struct bitmask *mems)
829 {
830 	struct cpuset *cp_tofree = NULL;
831 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
832 
833 	if (!cp1)
834 		goto err;
835 	if (cp1->mems == NULL) {
836 		errno = EINVAL;
837 		goto err;
838 	}
839 	bitmask_copy(mems, cp1->mems);
840 	cpuset_free(cp_tofree);
841 	return 0;
842 err:
843 	cpuset_free(cp_tofree);
844 	return -1;
845 }
846 
847 /* Return number of CPUs in cpuset cp (current task if cp == NULL) */
cpuset_cpus_weight(const struct cpuset * cp)848 int cpuset_cpus_weight(const struct cpuset *cp)
849 {
850 	struct cpuset *cp_tofree = NULL;
851 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
852 	int w = -1;
853 
854 	if (!cp1)
855 		goto err;
856 	if (cp1->cpus == NULL) {
857 		errno = EINVAL;
858 		goto err;
859 	}
860 	w = bitmask_weight(cp1->cpus);
861 	/* fall into ... */
862 err:
863 	cpuset_free(cp_tofree);
864 	return w;
865 }
866 
867 /* Return number of Memory Nodes in cpuset cp (current task if cp == NULL) */
cpuset_mems_weight(const struct cpuset * cp)868 int cpuset_mems_weight(const struct cpuset *cp)
869 {
870 	struct cpuset *cp_tofree = NULL;
871 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
872 	int w = -1;
873 
874 	if (!cp1)
875 		goto err;
876 	if (cp1->mems == NULL) {
877 		errno = EINVAL;
878 		goto err;
879 	}
880 	w = bitmask_weight(cp1->mems);
881 	/* fall into ... */
882 err:
883 	cpuset_free(cp_tofree);
884 	return w;
885 }
886 
887 /* Return integer value of option optname in cp */
cpuset_get_iopt(const struct cpuset * cp,const char * optionname)888 int cpuset_get_iopt(const struct cpuset *cp, const char *optionname)
889 {
890 	if (streq(optionname, "cpu_exclusive"))
891 		return cp->cpu_exclusive;
892 	else if (streq(optionname, "mem_exclusive"))
893 		return cp->mem_exclusive;
894 	else if (streq(optionname, "mem_hardwall"))
895 		return cp->mem_hardwall;
896 	else if (streq(optionname, "notify_on_release"))
897 		return cp->notify_on_release;
898 	else if (streq(optionname, "memory_pressure_enabled"))
899 		return cp->memory_pressure_enabled;
900 	else if (streq(optionname, "memory_migrate"))
901 		return cp->memory_migrate;
902 	else if (streq(optionname, "memory_spread_page"))
903 		return cp->memory_spread_page;
904 	else if (streq(optionname, "memory_spread_slab"))
905 		return cp->memory_spread_slab;
906 	else if (streq(optionname, "sched_load_balance"))
907 		return cp->sched_load_balance;
908 	else if (streq(optionname, "sched_relax_domain_level"))
909 		return cp->sched_relax_domain_level;
910 	else
911 		return -2;	/* optionname not recognized */
912 }
913 
/* [optional] Return string value of optname */
/*
 * Placeholder: no string-valued options exist yet, so NULL is
 * returned for every name (mirrors cpuset_set_sopt()).
 */
const char *cpuset_get_sopt(UNUSED const struct cpuset *cp,
			    UNUSED const char *optionname)
{
	return NULL;		/* For now, all string options unrecognized */
}
920 
/*
 * Read a boolean ("0"/"1") flag file at 'filepath' into *flagp.
 * Return 0 on success, -1 on error.
 *
 * Reads at most sizeof(buf)-1 bytes and nul-terminates before
 * atoi(): the original could hand atoi() an unterminated buffer,
 * reading out of bounds.
 */
static int read_flag(const char *filepath, char *flagp)
{
	char buf[SMALL_BUFSZ];	/* buffer a "0" or "1" flag line */
	int fd;
	ssize_t nread;

	if ((fd = open(filepath, O_RDONLY)) < 0)
		return -1;
	nread = read(fd, buf, sizeof(buf) - 1);
	close(fd);
	if (nread < 1)
		return -1;
	buf[nread] = '\0';
	*flagp = atoi(buf) ? 1 : 0;
	return 0;
}
941 
load_flag(const char * path,char * flagp,const char * flag)942 static int load_flag(const char *path, char *flagp, const char *flag)
943 {
944 	char buf[PATH_MAX];
945 
946 	pathcat2(buf, sizeof(buf), path, flag);
947 	return read_flag(buf, flagp);
948 }
949 
/*
 * Read a decimal integer from the file at 'filepath' into *numberp.
 * Return 0 on success, -1 on error.
 *
 * Reads at most sizeof(buf)-1 bytes and nul-terminates before
 * atoi(): the original could hand atoi() an unterminated buffer,
 * reading out of bounds.
 */
static int read_number(const char *filepath, int *numberp)
{
	char buf[SMALL_BUFSZ];
	int fd;
	ssize_t nread;

	if ((fd = open(filepath, O_RDONLY)) < 0)
		return -1;
	nread = read(fd, buf, sizeof(buf) - 1);
	close(fd);
	if (nread < 1)
		return -1;
	buf[nread] = '\0';
	*numberp = atoi(buf);
	return 0;
}
967 
load_number(const char * path,int * numberp,const char * file)968 static int load_number(const char *path, int *numberp, const char *file)
969 {
970 	char buf[PATH_MAX];
971 
972 	pathcat2(buf, sizeof(buf), path, file);
973 	return read_number(buf, numberp);
974 }
975 
/*
 * Read a bitmask list line (e.g. "0-3,8") from the file at
 * 'filepath' into a freshly allocated bitmask of 'nbits' bits,
 * storing it in *bmpp (freeing any mask previously stored there).
 * Return 0 on success, -1 on failure (leaving *bmpp untouched).
 */
static int read_mask(const char *filepath, struct bitmask **bmpp, int nbits)
{
	FILE *fp = NULL;
	char *buf = NULL;
	int buflen;
	struct bitmask *bmp = NULL;

	if ((fp = fopen(filepath, "r")) == NULL)
		goto err;
	buflen = filesize(fp) + 1;	/* + 1 for nul term */
	if ((buf = malloc(buflen)) == NULL)
		goto err;
	if (flgets(buf, buflen, fp) == NULL)
		goto err;
	fclose(fp);
	fp = NULL;

	if ((bmp = bitmask_alloc(nbits)) == NULL)
		goto err;
	/* An empty line means an empty mask: skip parsing. */
	if (*buf && bitmask_parselist(buf, bmp) < 0)
		goto err;
	/* Replace the caller's old mask only after parsing succeeds. */
	if (*bmpp)
		bitmask_free(*bmpp);
	*bmpp = bmp;
	free(buf);
	buf = NULL;
	return 0;
err:
	if (buf != NULL)
		free(buf);
	if (fp != NULL)
		fclose(fp);
	if (bmp != NULL)
		bitmask_free(bmp);
	return -1;
}
1012 
load_mask(const char * path,struct bitmask ** bmpp,int nbits,const char * mask)1013 static int load_mask(const char *path, struct bitmask **bmpp,
1014 		     int nbits, const char *mask)
1015 {
1016 	char buf[PATH_MAX];
1017 
1018 	pathcat2(buf, sizeof(buf), path, mask);
1019 	return read_mask(buf, bmpp, nbits);
1020 }
1021 
/*
 * Write string to file at given filepath.  Create or truncate file.
 * Return 0 on success, -1 on error.
 *
 * O_TRUNC added so that a shorter value fully replaces a longer one
 * in a regular file, matching the stated "truncate" behavior; the
 * loop handles partial writes (write() may transfer fewer bytes
 * than requested).
 */
static int write_string_file(const char *filepath, const char *str)
{
	int fd;
	size_t len = strlen(str);
	size_t done = 0;

	fd = open(filepath, O_WRONLY | O_CREAT | O_TRUNC, 0644);
	if (fd < 0)
		return -1;
	while (done < len) {
		ssize_t n = write(fd, str + done, len - done);

		if (n <= 0) {
			close(fd);
			return -1;
		}
		done += (size_t)n;
	}
	close(fd);
	return 0;
}
1038 
/* Size and allocate buffer.  Write bitmask list into it.  Caller
 * must free.  Returns NULL on allocation failure. */
static char *sprint_mask_buf(const struct bitmask *bmp)
{
	char probe;
	int needed;
	char *listbuf;

	/* Probe with a 1-byte buffer just to learn the needed length. */
	needed = bitmask_displaylist(&probe, 1, bmp) + 1;	/* + 1 for nul */
	listbuf = malloc(needed);
	if (listbuf == NULL)
		return NULL;
	bitmask_displaylist(listbuf, needed, bmp);
	return listbuf;
}
1053 
/* True if file 'flag' exists within cpuset directory 'path'.
 * Clears errno either way (a missing flag file is not an error). */
static int exists_flag(const char *path, const char *flag)
{
	char filepath[PATH_MAX];
	struct stat statbuf;
	int found;

	pathcat2(filepath, sizeof(filepath), path, flag);
	found = (stat(filepath, &statbuf) == 0);
	errno = 0;
	return found;
}
1065 
store_flag(const char * path,const char * flag,int val)1066 static int store_flag(const char *path, const char *flag, int val)
1067 {
1068 	char buf[PATH_MAX];
1069 
1070 	pathcat2(buf, sizeof(buf), path, flag);
1071 	return write_string_file(buf, val ? "1" : "0");
1072 }
1073 
store_number(const char * path,const char * file,int val)1074 static int store_number(const char *path, const char *file, int val)
1075 {
1076 	char buf[PATH_MAX];
1077 	char data[SMALL_BUFSZ];
1078 
1079 	memset(data, 0, sizeof(data));
1080 	pathcat2(buf, sizeof(buf), path, file);
1081 	snprintf(data, sizeof(data), "%d", val);
1082 	return write_string_file(buf, data);
1083 }
1084 
store_mask(const char * path,const char * mask,const struct bitmask * bmp)1085 static int store_mask(const char *path, const char *mask,
1086 		      const struct bitmask *bmp)
1087 {
1088 	char maskpath[PATH_MAX];
1089 	char *bp = NULL;
1090 	int rc;
1091 
1092 	if (bmp == NULL)
1093 		return 0;
1094 	pathcat2(maskpath, sizeof(maskpath), path, mask);
1095 	if ((bp = sprint_mask_buf(bmp)) == NULL)
1096 		return -1;
1097 	rc = write_string_file(maskpath, bp);
1098 	free(bp);
1099 	return rc;
1100 }
1101 
1102 /*
1103  * Return 1 if 'cpu' is online, else 0 if offline.  Tests the file
1104  * /sys/devices/system/cpu/cpuN/online file for 0 or 1 contents
1105  * where N is the cpu number.
1106  */
1107 
cpu_online(unsigned int cpu)1108 char cpu_online(unsigned int cpu)
1109 {
1110 	char online;
1111 	char cpupath[PATH_MAX];
1112 
1113 	(void)snprintf(cpupath, sizeof(cpupath),
1114 		       "/sys/devices/system/cpu/cpu%d/online", cpu);
1115 	if (read_flag(cpupath, &online) < 0)
1116 		return 0;	/* oops - guess that cpu's not there */
1117 	return online;
1118 }
1119 
1120 /*
1121  * The cpunodemap maps each cpu in [0 ... cpuset_cpus_nbits()),
1122  * to the node on which that cpu resides or cpuset_mems_nbits().
1123  *
1124  * To avoid every user having to recalculate this relation
1125  * from various clues in the sysfs file system (below the
1126  * path /sys/devices/system) a copy of this map is kept at
1127  * /var/run/cpunodemap.
1128  *
1129  * The system automatically cleans out files below
1130  * /var/run on each system reboot (see the init script
1131  * /etc/rc.d/boot.d/S*boot.localnet), so we don't have to worry
1132  * about stale data in this file across reboots.  If the file
1133  * is missing, let the first process that needs it, and has
1134  * permission to write in the /var/run directory, rebuild it.
1135  *
1136  * If using this cached data, remember the mtime of the mapfile
1137  * the last time we read it in case something like a hotplug
1138  * event results in the file being removed and rebuilt, so we
1139  * can detect if we're using a stale cache, and need to reload.
1140  *
1141  * The mtime of this file is set to the time when we did
1142  * the recalculation of the map, from the clues beneath
1143  * /sys/devices/system.  This is done so that a program
1144  * won't see the mapfile it just wrote as being newer than what
1145  * it just wrote out (store_map) and read the same map back in
1146  * (load_file).
1147  */
1148 
1149 /*
1150  * Hold flockfile(stdin) while using cpunodemap for posix thread safety.
1151  *
1152  * Note on locking and flockfile(FILE *):
1153  *
1154  *  We use flockfile() and funlockfile() instead of directly
1155  *  calling pthread_mutex_lock and pthread_mutex_unlock on
1156  *  a pthread_mutex_t, because this avoids forcing the app
1157  *  to link with libpthread.  The glibc implementation of
1158  *  flockfile/funlockfile will fall back to no-ops if libpthread
1159  *  doesn't happen to be linked.
1160  *
1161  *  Since flockfile already has the moderately convoluted
1162  *  combination of weak and strong symbols required to accomplish
1163  *  this, it is easier to use flockfile() on some handy FILE *
1164  *  stream as a surrogate for pthread locking than it is to so
1165  *  re-invent that wheel.
1166  *
1167  *  Forcing all apps that use cpusets to link with libpthread
1168  *  would force non-transparent initialization on apps that
1169  *  might not be prepared to handle it.
1170  *
1171  *  The application using libcpuset should never notice this
1172  *  odd use of flockfile(), because we never return to the
1173  *  application from any libcpuset call with any such lock held.
1174  *  We just use this locking for guarding some non-atomic cached
1175  *  data updates and accesses, internal to some libcpuset calls.
1176  *  Also, flockfile() allows recursive nesting, so if the app
1177  *  calls libcpuset holding such a file lock, we won't deadlock
1178  *  if we go to acquire the same lock.  We'll just get the lock
1179  *  and increment its counter while we hold it.
1180  */
1181 
/* Cached <cpu, node> map plus the mapfile modtime it was loaded from. */
static struct cpunodemap {
	int *map;		/* map[cpumask_sz]: maps cpu to its node */
	time_t mtime;		/* modtime of mapfile when last read */
} cpunodemap;
1186 
1187 /*
1188  * rebuild_map() - Rebuild cpunodemap[] from scratch.
1189  *
1190  * Situation:
1191  *	Neither our in-memory cpunodemap[] array nor the
1192  *	cache of it in mapfile is current.
1193  * Action:
1194  *	Rebuild it from first principles and the information
1195  *	available below /sys/devices/system.
1196  */
1197 
rebuild_map()1198 static void rebuild_map()
1199 {
1200 	char buf[PATH_MAX];
1201 	DIR *dir1, *dir2;
1202 	struct dirent *dent1, *dent2;
1203 	int ncpus = cpuset_cpus_nbits();
1204 	int nmems = cpuset_mems_nbits();
1205 	unsigned int cpu, mem;
1206 
1207 	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++)
1208 		cpunodemap.map[cpu] = -1;
1209 	pathcat2(buf, sizeof(buf), sysdevices, "node");
1210 	if ((dir1 = opendir(buf)) == NULL)
1211 		return;
1212 	while ((dent1 = readdir(dir1)) != NULL) {
1213 		if (sscanf(dent1->d_name, "node%u", &mem) < 1)
1214 			continue;
1215 		pathcat3(buf, sizeof(buf), sysdevices, "node", dent1->d_name);
1216 		if ((dir2 = opendir(buf)) == NULL)
1217 			continue;
1218 		while ((dent2 = readdir(dir2)) != NULL) {
1219 			if (sscanf(dent2->d_name, "cpu%u", &cpu) < 1)
1220 				continue;
1221 			if (cpu >= (unsigned int)ncpus
1222 			    || mem >= (unsigned int)nmems)
1223 				continue;
1224 			cpunodemap.map[cpu] = mem;
1225 		}
1226 		closedir(dir2);
1227 	}
1228 	closedir(dir1);
1229 	cpunodemap.mtime = time(0);
1230 }
1231 
1232 /*
1233  * load_map() - Load cpunodemap[] from mapfile.
1234  *
1235  * Situation:
1236  *	The cpunodemap in mapfile is more recent than
1237  *	what we have in the cpunodemap[] array.
1238  * Action:
1239  *	Reload the cpunodemap[] array from the file.
1240  */
1241 
load_map()1242 static void load_map()
1243 {
1244 	char buf[SMALL_BUFSZ];	/* buffer 1 line of mapfile */
1245 	FILE *mapfp;		/* File stream on mapfile */
1246 	int ncpus = cpuset_cpus_nbits();
1247 	int nmems = cpuset_mems_nbits();
1248 	unsigned int cpu, mem;
1249 
1250 	if ((cpunodemap.map = calloc(ncpus, sizeof(int))) == NULL)
1251 		return;
1252 	cpunodemap.mtime = get_mtime(mapfile);
1253 	if ((mapfp = fopen(mapfile, "r")) == NULL)
1254 		return;
1255 	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++)
1256 		cpunodemap.map[cpu] = nmems;
1257 	while (flgets(buf, sizeof(buf), mapfp) != NULL) {
1258 		if (sscanf(buf, "%u %u", &cpu, &mem) < 2)
1259 			continue;
1260 		if (cpu >= (unsigned int)ncpus || mem >= (unsigned int)nmems)
1261 			continue;
1262 		cpunodemap.map[cpu] = mem;
1263 	}
1264 	fclose(mapfp);
1265 }
1266 
1267 /*
1268  * store_map() - Write cpunodemap[] out to mapfile.
1269  *
1270  * Situation:
1271  *	The cpunodemap in the cpunodemap[] array is
1272  *	more recent than the one in mapfile.
1273  * Action:
1274  *	Write cpunodemap[] out to mapfile.
1275  */
1276 
static void store_map()
{
	char buf[PATH_MAX];	/* pathname of the mkstemp() temporary */
	int fd = -1;
	FILE *mapfp = NULL;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	unsigned int cpu, mem;

	/* Write a temp file, then rename() it over mapfile, so readers
	 * never see a partially written map. */
	snprintf(buf, sizeof(buf), "%s.%s", mapfile, "XXXXXX");
	if ((fd = mkstemp(buf)) < 0)
		goto err;
	if ((mapfp = fdopen(fd, "w")) == NULL)
		goto err;
	/* One "cpu mem" pair per line; cpus with no known node omitted */
	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) {
		mem = cpunodemap.map[cpu];
		if (mem < (unsigned int)nmems)
			fprintf(mapfp, "%u %u\n", cpu, mem);
	}
	fclose(mapfp);
	/* Force the file mtime to the map's own timestamp (see the
	 * cpunodemap block comment) before publishing it */
	set_mtime(buf, cpunodemap.mtime);
	if (rename(buf, mapfile) < 0)
		goto err;
	/* mkstemp() creates mode 0600 - change to world readable */
	(void)chmod(mapfile, 0444);
	return;
err:
	if (mapfp != NULL) {
		fclose(mapfp);	/* also closes the underlying fd */
		fd = -1;
	}
	if (fd >= 0)
		close(fd);
	(void)unlink(buf);
}
1312 
1313 /*
1314  * Load and gain thread safe access to the <cpu, node> map.
1315  *
1316  * Return 0 on success with flockfile(stdin) held.
1317  * Each successful get_map() call must be matched with a
1318  * following put_map() call to release the lock.
1319  *
1320  * On error, return -1 with errno set and no lock held.
1321  */
1322 
static int get_map()
{
	time_t file_mtime;

	/* The stdin file lock serves as the cpunodemap mutex (see the
	 * "Note on locking" block comment above) */
	flockfile(stdin);

	if (cpunodemap.map == NULL) {
		cpunodemap.map = calloc(cpuset_cpus_nbits(), sizeof(int));
		if (cpunodemap.map == NULL)
			goto err;
	}

	/* If no one has a good cpunodemap, rebuild from scratch */
	file_mtime = get_mtime(mapfile);
	if (cpunodemap.mtime == 0 && file_mtime == 0)
		rebuild_map();

	/* If either cpunodemap[] or mapfile newer, update other with it */
	file_mtime = get_mtime(mapfile);
	if (cpunodemap.mtime < file_mtime)
		load_map();
	else if (cpunodemap.mtime > file_mtime)
		store_map();
	return 0;
err:
	/* Failure: release the lock; caller must NOT call put_map() */
	funlockfile(stdin);
	return -1;
}
1351 
/* Release the <cpu, node> map lock acquired by a successful get_map(). */
static void put_map()
{
	funlockfile(stdin);
}
1356 
1357 /* Set cpus to those local to Memory Nodes mems */
cpuset_localcpus(const struct bitmask * mems,struct bitmask * cpus)1358 int cpuset_localcpus(const struct bitmask *mems, struct bitmask *cpus)
1359 {
1360 	int ncpus = cpuset_cpus_nbits();
1361 	unsigned int cpu;
1362 
1363 	if (check() < 0)
1364 		return -1;
1365 
1366 	get_map();
1367 	bitmask_clearall(cpus);
1368 	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) {
1369 		if (bitmask_isbitset(mems, cpunodemap.map[cpu]))
1370 			bitmask_setbit(cpus, cpu);
1371 	}
1372 	put_map();
1373 	return 0;
1374 }
1375 
1376 /* Set mems to those local to CPUs cpus */
cpuset_localmems(const struct bitmask * cpus,struct bitmask * mems)1377 int cpuset_localmems(const struct bitmask *cpus, struct bitmask *mems)
1378 {
1379 	int ncpus = cpuset_cpus_nbits();
1380 	unsigned int cpu;
1381 
1382 	if (check() < 0)
1383 		return -1;
1384 
1385 	get_map();
1386 	bitmask_clearall(mems);
1387 	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) {
1388 		if (bitmask_isbitset(cpus, cpu))
1389 			bitmask_setbit(mems, cpunodemap.map[cpu]);
1390 	}
1391 	put_map();
1392 	return 0;
1393 }
1394 
1395 /*
1396  * distmap[]
1397  *
1398  * Array of ints of size cpumask_sz by nodemask_sz.
1399  *
1400  * Element distmap[cpu][mem] is the distance between CPU cpu
1401  * and Memory Node mem.  Distances are weighted to roughly
1402  * approximate the cost of memory references, and scaled so that
1403  * the distance from a CPU to its local Memory Node is ten (10).
1404  *
1405  * The first call to cpuset_cpumemdist() builds this map, from
1406  * whatever means the kernel provides to obtain these distances.
1407  *
1408  * These distances derive from ACPI SLIT table entries, which are
1409  * eight bits in size.
1410  *
1411  * Hold flockfile(stdout) while using distmap for posix thread safety.
1412  */
1413 
typedef unsigned char distmap_entry_t;	/* type of distmap[] entries */

static distmap_entry_t *distmap;	/* maps <cpu, mem> to distance */

#define DISTMAP_MAX UCHAR_MAX	/* maximum value in distmap[] */

/* 2-D array index simulation - relies on a local 'nmems' at each use site */
#define I(i,j) ((i) * nmems + (j))
1421 
1422 /*
1423  * Parse arch neutral lines from 'distance' files of form:
1424  *
1425  *	46 66 10 20
1426  *
1427  * The lines contain a space separated list of distances, which is parsed
1428  * into array dists[] of each node's distance from the specified node.
1429  *
1430  * Result is placed in distmap[ncpus][nmems]:
1431  *
1432  *	For each cpu c on node:
1433  *		For each node position n in list of distances:
1434  *			distmap[c][n] = dists[n]
1435  */
1436 
parse_distmap_line(unsigned int node,char * buf)1437 static int parse_distmap_line(unsigned int node, char *buf)
1438 {
1439 	char *p, *q;
1440 	int ncpus = cpuset_cpus_nbits();
1441 	int nmems = cpuset_mems_nbits();
1442 	unsigned int c, n;
1443 	distmap_entry_t *dists = NULL;
1444 	struct bitmask *cpus = NULL, *mems = NULL;
1445 	int ret = -1;
1446 
1447 	p = buf;
1448 	if ((dists = calloc(nmems, sizeof(*dists))) == NULL)
1449 		goto err;
1450 	for (n = 0; n < (unsigned int)nmems; n++)
1451 		dists[n] = DISTMAP_MAX;
1452 
1453 	for (n = 0; n < (unsigned int)nmems && *p; n++, p = q) {
1454 		unsigned int d;
1455 
1456 		if ((p = strpbrk(p, "0123456789")) == NULL)
1457 			break;
1458 		d = strtoul(p, &q, 10);
1459 		if (p == q)
1460 			break;
1461 		if (d < DISTMAP_MAX)
1462 			dists[n] = (distmap_entry_t) d;
1463 	}
1464 
1465 	if ((mems = bitmask_alloc(nmems)) == NULL)
1466 		goto err;
1467 	bitmask_setbit(mems, node);
1468 
1469 	if ((cpus = bitmask_alloc(ncpus)) == NULL)
1470 		goto err;
1471 	cpuset_localcpus(mems, cpus);
1472 
1473 	for (c = bitmask_first(cpus); c < (unsigned int)ncpus;
1474 	     c = bitmask_next(cpus, c + 1))
1475 		for (n = 0; n < (unsigned int)nmems; n++)
1476 			distmap[I(c, n)] = dists[n];
1477 	ret = 0;
1478 	/* fall into ... */
1479 err:
1480 	bitmask_free(mems);
1481 	bitmask_free(cpus);
1482 	free(dists);
1483 	return ret;
1484 }
1485 
/*
 * Read the first line of the 'distance' file at 'path' and feed it to
 * parse_distmap_line() for memory node 'node'.  Returns 0 on success,
 * -1 on any open/alloc/read/parse failure.
 */
static int parse_distance_file(unsigned int node, const char *path)
{
	FILE *fp = NULL;
	char *line = NULL;
	int len;
	int rc = -1;

	fp = fopen(path, "r");
	if (fp == NULL)
		goto out;

	len = filesize(fp);
	line = malloc(len);
	if (line == NULL)
		goto out;

	if (flgets(line, len, fp) == NULL)
		goto out;

	if (parse_distmap_line(node, line) < 0)
		goto out;

	rc = 0;
out:
	free(line);
	if (fp)
		fclose(fp);
	return rc;
}
1515 
build_distmap()1516 static void build_distmap()
1517 {
1518 	static int tried_before = 0;
1519 	int ncpus = cpuset_cpus_nbits();
1520 	int nmems = cpuset_mems_nbits();
1521 	int c, m;
1522 	DIR *dir = NULL;
1523 	struct dirent *dent;
1524 
1525 	if (tried_before)
1526 		goto err;
1527 	tried_before = 1;
1528 
1529 	if ((distmap = calloc(ncpus * nmems, sizeof(*distmap))) == NULL)
1530 		goto err;
1531 
1532 	for (c = 0; c < ncpus; c++)
1533 		for (m = 0; m < nmems; m++)
1534 			distmap[I(c, m)] = DISTMAP_MAX;
1535 
1536 	if ((dir = opendir(distance_directory)) == NULL)
1537 		goto err;
1538 	while ((dent = readdir(dir)) != NULL) {
1539 		char buf[PATH_MAX];
1540 		unsigned int node;
1541 
1542 		if (sscanf(dent->d_name, "node%u", &node) < 1)
1543 			continue;
1544 		pathcat3(buf, sizeof(buf), distance_directory, dent->d_name,
1545 			 "distance");
1546 		if (parse_distance_file(node, buf) < 0)
1547 			goto err;
1548 	}
1549 	closedir(dir);
1550 	return;
1551 err:
1552 	if (dir)
1553 		closedir(dir);
1554 	free(distmap);
1555 	distmap = NULL;
1556 }
1557 
1558 #ifdef ALTERNATE_SN_DISTMAP
1559 
1560 /*
1561  * Parse SN architecture specific line of form:
1562  *
1563  *	node 3 001c14#1 local asic SHub_1.1, nasid 0x6, dist 66:46:20:10
1564  *
1565  * Second field is node number.  The "dist" field is the colon separated list
1566  * of distances, which is parsed into array dists[] of each node's distance
1567  * from that node.
1568  *
1569  * Result is placed in distmap[ncpus][nmems]:
1570  *
1571  *	For each cpu c on that node:
1572  *		For each node position n in list of distances:
1573  *			distmap[c][n] = dists[n]
1574  */
1575 
static void parse_distmap_line_sn(char *buf)
{
	char *p, *pend, *q;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	unsigned long c, n, node;
	distmap_entry_t *dists = NULL;
	struct bitmask *cpus = NULL, *mems = NULL;

	/* Second field ("node <N>") is the node number */
	if ((p = strchr(buf, ' ')) == NULL)
		goto err;
	if ((node = strtoul(p, &q, 10)) >= (unsigned int)nmems)
		goto err;
	/* Seek the colon separated distance list following " dist " */
	if ((p = strstr(q, " dist ")) == NULL)
		goto err;
	p += strlen(" dist ");
	/* Terminate the distance list at the next space, if any */
	if ((pend = strchr(p, ' ')) != NULL)
		*pend = '\0';
	if ((dists = calloc(nmems, sizeof(*dists))) == NULL)
		goto err;
	/* Default every node position to "unknown/far" */
	for (n = 0; n < (unsigned int)nmems; n++)
		dists[n] = DISTMAP_MAX;

	/* One decimal distance per node position; stop at end of list */
	for (n = 0; n < (unsigned int)nmems && *p; n++, p = q) {
		unsigned long d;

		if ((p = strpbrk(p, "0123456789")) == NULL)
			break;
		d = strtoul(p, &q, 10);
		if (p == q)
			break;
		if (d < DISTMAP_MAX)
			dists[n] = (distmap_entry_t) d;
	}

	if ((mems = bitmask_alloc(nmems)) == NULL)
		goto err;
	bitmask_setbit(mems, node);

	/* Copy this node's distance row to every cpu local to the node */
	if ((cpus = bitmask_alloc(ncpus)) == NULL)
		goto err;
	cpuset_localcpus(mems, cpus);

	for (c = bitmask_first(cpus); c < (unsigned int)ncpus;
	     c = bitmask_next(cpus, c + 1))
		for (n = 0; n < (unsigned int)nmems; n++)
			distmap[I(c, n)] = dists[n];
	/* fall into ... */
err:
	bitmask_free(mems);
	bitmask_free(cpus);
	free(dists);
}
1629 
/*
 * Build distmap[] from the SN (Altix) topology file instead of the
 * arch-neutral per-node 'distance' files.  See the comment above
 * parse_distmap_line_sn() for the line format.  Only attempted once;
 * on any failure distmap is left NULL.
 */
static void build_distmap_sn()
{
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	int c, m;
	static int tried_before = 0;
	FILE *fp = NULL;
	char *buf = NULL;
	int buflen;

	if (tried_before)
		goto err;
	tried_before = 1;

	if ((fp = fopen(sn_topology, "r")) == NULL)
		goto err;

	if ((distmap = calloc(ncpus * nmems, sizeof(*distmap))) == NULL)
		goto err;

	/* Start with every <cpu, mem> pair "infinitely" distant */
	for (c = 0; c < ncpus; c++)
		for (m = 0; m < nmems; m++)
			distmap[I(c, m)] = DISTMAP_MAX;

	buflen = filesize(fp);
	if ((buf = malloc(buflen)) == NULL)
		goto err;

	/* Parse each "node ..." line of the SN topology file */
	while (flgets(buf, buflen, fp) != NULL)
		if (strprefix(buf, sn_top_node_prefix))
			parse_distmap_line_sn(buf);

	free(buf);
	fclose(fp);
	return;
err:
	free(buf);
	free(distmap);
	distmap = NULL;
	if (fp)
		fclose(fp);
}
1672 
1673 #endif
1674 
1675 /* [optional] Hardware distance from CPU to Memory Node */
cpuset_cpumemdist(int cpu,int mem)1676 unsigned int cpuset_cpumemdist(int cpu, int mem)
1677 {
1678 	int ncpus = cpuset_cpus_nbits();
1679 	int nmems = cpuset_mems_nbits();
1680 	distmap_entry_t r = DISTMAP_MAX;
1681 
1682 	flockfile(stdout);
1683 
1684 	if (check() < 0)
1685 		goto err;
1686 
1687 	if (distmap == NULL)
1688 		build_distmap();
1689 
1690 #ifdef ALTERNATE_SN_DISTMAP
1691 	if (distmap == NULL)
1692 		build_distmap_sn();
1693 #endif
1694 
1695 	if (distmap == NULL)
1696 		goto err;
1697 
1698 	if (cpu < 0 || cpu >= ncpus || mem < 0 || mem >= nmems)
1699 		goto err;
1700 
1701 	r = distmap[I(cpu, mem)];
1702 	/* fall into ... */
1703 err:
1704 	funlockfile(stdout);
1705 	return r;
1706 }
1707 
1708 /* [optional] Return Memory Node closest to cpu */
cpuset_cpu2node(int cpu)1709 int cpuset_cpu2node(int cpu)
1710 {
1711 	int ncpus = cpuset_cpus_nbits();
1712 	int nmems = cpuset_mems_nbits();
1713 	struct bitmask *cpus = NULL, *mems = NULL;
1714 	int r = -1;
1715 
1716 	if (check() < 0)
1717 		goto err;
1718 
1719 	if ((cpus = bitmask_alloc(ncpus)) == NULL)
1720 		goto err;
1721 	bitmask_setbit(cpus, cpu);
1722 
1723 	if ((mems = bitmask_alloc(nmems)) == NULL)
1724 		goto err;
1725 	cpuset_localmems(cpus, mems);
1726 	r = bitmask_first(mems);
1727 	/* fall into ... */
1728 err:
1729 	bitmask_free(cpus);
1730 	bitmask_free(mems);
1731 	return r;
1732 }
1733 
/*
 * Push each valid and dirty attribute of 'cp' down to the cpuset
 * directory at 'path'.  Optional kernel features are written only if
 * their control file exists.  Returns 0 on success, -1 on the first
 * failing write (remaining attributes are then left unwritten).
 *
 * NOTE(review): the flag/number files are written before the 'cpus'
 * and 'mems' masks at the end - preserve this order.
 */
static int apply_cpuset_settings(const char *path, const struct cpuset *cp)
{
	if (cp->cpu_exclusive_valid && cp->cpu_exclusive_dirty) {
		if (store_flag(path, "cpu_exclusive", cp->cpu_exclusive) < 0)
			goto err;
	}

	if (cp->mem_exclusive_valid && cp->mem_exclusive_dirty) {
		if (store_flag(path, "mem_exclusive", cp->mem_exclusive) < 0)
			goto err;
	}

	if (cp->mem_hardwall_valid && cp->mem_hardwall_dirty) {
		if (store_flag(path, "mem_hardwall", cp->mem_hardwall) < 0)
			goto err;
	}

	if (cp->notify_on_release_valid && cp->notify_on_release_dirty) {
		if (store_flag(path, "notify_on_release", cp->notify_on_release)
		    < 0)
			goto err;
	}

	/* Optional feature: write only if the kernel provides the file */
	if (cp->memory_migrate_valid &&
	    cp->memory_migrate_dirty && exists_flag(path, "memory_migrate")) {
		if (store_flag(path, "memory_migrate", cp->memory_migrate) < 0)
			goto err;
	}

	if (cp->memory_pressure_enabled_valid &&
	    cp->memory_pressure_enabled_dirty &&
	    exists_flag(path, "memory_pressure_enabled")) {
		if (store_flag
		    (path, "memory_pressure_enabled",
		     cp->memory_pressure_enabled) < 0)
			goto err;
	}

	if (cp->memory_spread_page_valid &&
	    cp->memory_spread_page_dirty &&
	    exists_flag(path, "memory_spread_page")) {
		if (store_flag
		    (path, "memory_spread_page", cp->memory_spread_page) < 0)
			goto err;
	}

	if (cp->memory_spread_slab_valid &&
	    cp->memory_spread_slab_dirty &&
	    exists_flag(path, "memory_spread_slab")) {
		if (store_flag
		    (path, "memory_spread_slab", cp->memory_spread_slab) < 0)
			goto err;
	}

	if (cp->sched_load_balance_valid &&
	    cp->sched_load_balance_dirty &&
	    exists_flag(path, "sched_load_balance")) {
		if (store_flag
		    (path, "sched_load_balance", cp->sched_load_balance) < 0)
			goto err;
	}

	/* sched_relax_domain_level is a small integer, not a flag */
	if (cp->sched_relax_domain_level_valid &&
	    cp->sched_relax_domain_level_dirty &&
	    exists_flag(path, "sched_relax_domain_level")) {
		if (store_number
		    (path, "sched_relax_domain_level",
		     cp->sched_relax_domain_level) < 0)
			goto err;
	}

	/* Masks last: flags above may affect whether these writes succeed */
	if (cp->cpus_valid && cp->cpus_dirty) {
		if (store_mask(path, "cpus", cp->cpus) < 0)
			goto err;
	}

	if (cp->mems_valid && cp->mems_dirty) {
		if (store_mask(path, "mems", cp->mems) < 0)
			goto err;
	}
	return 0;
err:
	return -1;
}
1818 
1819 /*
1820  * get_siblings() - helper routine for cpuset_would_crash_kernel(), below.
1821  *
1822  * Extract max value of any 'siblings' field in /proc/cpuinfo.
1823  * Cache the result - only need to extract once in lifetime of task.
1824  *
1825  * The siblings field is the number of logical CPUs in a physical
1826  * processor package.  It is equal to the product of the number of
1827  * cores in that package, times the number of hyper-threads per core.
1828  * The bug that cpuset_would_crash_kernel() is detecting arises
1829  * when a cpu_exclusive cpuset tries to include just some, not all,
1830  * of the sibling logical CPUs available in a processor package.
1831  *
1832  * In the improbable case that a system has mixed values of siblings
1833  * (some processor packages have more than others, perhaps due to
1834  * partially enabling Hyper-Threading), we take the worst-case value,
1835  * the largest siblings value.  This might be overkill.  I don't know
1836  * if this kernel bug considers each processor package's siblings
1837  * separately or not.  But it sure is easier this way ...
1838  *
1839  * This routine takes about 0.7 msecs on a 4 CPU 2.8 MHz Xeon, from
1840  * open to close, the first time called.
1841  */
1842 
static int get_siblings()
{
	static int siblings;	/* cached result from the first call */
	FILE *fp;
	char line[32];		/* big enough for one 'siblings' line */

	if (siblings)
		return siblings;

	fp = fopen("/proc/cpuinfo", "r");
	if (fp == NULL)
		return 4;	/* wing it - /proc not mounted ? */
	/* Keep the maximum 'siblings' value seen across all processors */
	while (flgets(line, sizeof(line), fp) != NULL) {
		int s;

		if (sscanf(line, "siblings : %d", &s) == 1 && s > siblings)
			siblings = s;
	}
	fclose(fp);
	if (siblings == 0)
		siblings = 1;	/* old kernel, no siblings, default to 1 */
	return siblings;
}
1867 
1868 /*
1869  * Some 2.6.16 and 2.6.17 kernel versions have a bug in the dynamic
1870  * scheduler domain code invoked for cpu_exclusive cpusets that causes
1871  * the kernel to freeze, requiring a hardware reset.
1872  *
1873  * On kernels built with CONFIG_SCHED_MC enabled, if a 'cpu_exclusive'
1874  * cpuset is defined where that cpusets 'cpus' are not on package
1875  * boundaries then the kernel will freeze, usually as soon as this
1876  * cpuset is created, requiring a hardware reset.
1877  *
1878  * A cpusets 'cpus' are not on package boundaries if the cpuset
1879  * includes a proper non-empty subset (some, but not all) of the
1880  * logical cpus on a processor package.  This requires multiple
1881  * logical CPUs per package, available with either Hyper-Thread or
1882  * Multi-Core support.  Without one of these features, there is only
1883  * one logical CPU per physical package, and it's not possible to
1884  * have a proper, non-empty subset of a set of cardinality one.
1885  *
1886  * SUSE SLES10 kernels, as first released, only enable CONFIG_SCHED_MC
1887  * on i386 and x86_64 arch's.
1888  *
1889  * The objective of this routine cpuset_would_crash_kernel() is to
1890  * determine if a proposed cpuset setting would crash the kernel due
1891  * to this bug, so that the caller can avoid the crash.
1892  *
1893  * Ideally we'd check for exactly these conditions here, but computing
1894  * the package (identified by the 'physical id' field of /proc/cpuinfo)
1895  * of each cpu in a cpuset is more effort than it's worth here.
1896  *
1897  * Also there is no obvious way to identify exactly whether the kernel
1898  * one is executing on has this bug, short of trying it, and seeing
1899  * if the kernel just crashed.
1900  *
1901  * So for now, we look for a simpler set of conditions, that meets
1902  * our immediate need - avoid this crash on SUSE SLES10 systems that
1903  * are susceptible to it.  We look for the kernel version 2.6.16.*,
1904  * which is the base kernel of SUSE SLES10, and for i386 or x86_64
1905  * processors, which had CONFIG_SCHED_MC enabled.
1906  *
1907  * If these simpler conditions are met, we further simplify the check,
1908  * by presuming that the logical CPUs are numbered on processor
1909  * package boundaries.  If each package has S siblings, we assume
1910  * that CPUs numbered N through N + S -1 are on the same package,
1911  * for any CPU N such that N mod S == 0.
1912  *
1913  * Yes, this is a hack, focused on avoiding kernel freezes on
1914  * susceptible SUSE SLES10 systems.
1915  */
1916 
cpuset_would_crash_kernel(const struct cpuset * cp)1917 static int cpuset_would_crash_kernel(const struct cpuset *cp)
1918 {
1919 	static int susceptible_system = -1;
1920 
1921 	if (!cp->cpu_exclusive)
1922 		goto ok;
1923 
1924 	if (susceptible_system == -1) {
1925 		struct utsname u;
1926 		int rel_2_6_16, arch_i386, arch_x86_64;
1927 
1928 		if (uname(&u) < 0)
1929 			goto fail;
1930 		rel_2_6_16 = strprefix(u.release, "2.6.16.");
1931 		arch_i386 = streq(u.machine, "i386");
1932 		arch_x86_64 = streq(u.machine, "x86_64");
1933 		susceptible_system = rel_2_6_16 && (arch_i386 || arch_x86_64);
1934 	}
1935 
1936 	if (susceptible_system) {
1937 		int ncpus = cpuset_cpus_nbits();
1938 		int siblings = get_siblings();
1939 		unsigned int cpu;
1940 
1941 		for (cpu = 0; cpu < (unsigned int)ncpus; cpu += siblings) {
1942 			int s, num_set = 0;
1943 
1944 			for (s = 0; s < siblings; s++) {
1945 				if (bitmask_isbitset(cp->cpus, cpu + s))
1946 					num_set++;
1947 			}
1948 
1949 			/* If none or all siblings set, we're still ok */
1950 			if (num_set == 0 || num_set == siblings)
1951 				continue;
1952 
1953 			/* Found one that would crash kernel.  Fail.  */
1954 			errno = ENXIO;
1955 			goto fail;
1956 		}
1957 	}
1958 	/* If not susceptible, or if all ok, fall into "ok" ... */
1959 ok:
1960 	return 0;		/* would not crash */
1961 fail:
1962 	return 1;		/* would crash */
1963 }
1964 
1965 /* compare two cpuset and mark the dirty variable */
mark_dirty_variable(struct cpuset * cp1,const struct cpuset * cp2)1966 static void mark_dirty_variable(struct cpuset *cp1, const struct cpuset *cp2)
1967 {
1968 	if (cp1->cpu_exclusive_valid &&
1969 	    cp1->cpu_exclusive != cp2->cpu_exclusive)
1970 		cp1->cpu_exclusive_dirty = 1;
1971 
1972 	if (cp1->mem_exclusive_valid &&
1973 	    cp1->mem_exclusive != cp2->mem_exclusive)
1974 		cp1->mem_exclusive_dirty = 1;
1975 
1976 	if (cp1->mem_hardwall_valid && cp1->mem_hardwall != cp2->mem_hardwall)
1977 		cp1->mem_hardwall_dirty = 1;
1978 
1979 	if (cp1->notify_on_release_valid &&
1980 	    cp1->notify_on_release != cp2->notify_on_release)
1981 		cp1->notify_on_release_dirty = 1;
1982 
1983 	if (cp1->memory_migrate_valid &&
1984 	    cp1->memory_migrate != cp2->memory_migrate)
1985 		cp1->memory_migrate_dirty = 1;
1986 
1987 	if (cp1->memory_pressure_enabled_valid &&
1988 	    cp1->memory_pressure_enabled != cp2->memory_pressure_enabled)
1989 		cp1->memory_pressure_enabled_dirty = 1;
1990 
1991 	if (cp1->memory_spread_page_valid &&
1992 	    cp1->memory_spread_page != cp2->memory_spread_page)
1993 		cp1->memory_spread_page_dirty = 1;
1994 
1995 	if (cp1->memory_spread_slab_valid &&
1996 	    cp1->memory_spread_slab != cp2->memory_spread_slab)
1997 		cp1->memory_spread_slab_dirty = 1;
1998 
1999 	if (cp1->sched_load_balance_valid &&
2000 	    cp1->sched_load_balance != cp2->sched_load_balance)
2001 		cp1->sched_load_balance_dirty = 1;
2002 
2003 	if (cp1->sched_relax_domain_level_valid &&
2004 	    cp1->sched_relax_domain_level != cp2->sched_relax_domain_level)
2005 		cp1->sched_relax_domain_level_dirty = 1;
2006 
2007 	if (cp1->cpus_valid && !bitmask_equal(cp1->cpus, cp2->cpus))
2008 		cp1->cpus_dirty = 1;
2009 	if (cp1->mems_valid && !bitmask_equal(cp1->mems, cp2->mems))
2010 		cp1->mems_dirty = 1;
2011 }
2012 
2013 /* Create (if new set) or modify cpuset 'cp' at location 'relpath' */
static int cr_or_mod(const char *relpath, const struct cpuset *cp, int new)
{
	char buf[PATH_MAX];
	int do_rmdir_on_err = 0;	/* we created the dir - remove on error */
	int do_restore_cp_sav_on_err = 0;	/* old settings saved - restore on error */
	struct cpuset *cp_sav = NULL;
	int sav_errno;

	if (check() < 0)
		goto err;

	/* Refuse settings known to freeze susceptible SLES10 kernels */
	if (cpuset_would_crash_kernel(cp))
		goto err;

	fullpath(buf, sizeof(buf), relpath);

	if (new) {
		if (mkdir(buf, 0755) < 0)
			goto err;
		/* we made it, so we should remove it on error */
		do_rmdir_on_err = 1;
	}

	/* Snapshot current settings so a partial apply can be undone */
	if ((cp_sav = cpuset_alloc()) == NULL)
		goto err;
	if (cpuset_query(cp_sav, relpath) < 0)
		goto err;
	/* we have old settings to restore on error */
	do_restore_cp_sav_on_err = 1;

	/* check which variables need to be restored on error */
	mark_dirty_variable(cp_sav, cp);

	if (apply_cpuset_settings(buf, cp) < 0)
		goto err;

	cpuset_free(cp_sav);
	return 0;
err:
	sav_errno = errno;	/* cleanup below must not clobber errno */
	if (do_restore_cp_sav_on_err)
		(void)apply_cpuset_settings(buf, cp_sav);
	if (cp_sav)
		cpuset_free(cp_sav);
	if (do_rmdir_on_err)
		(void)rmdir(buf);
	errno = sav_errno;
	return -1;
}
2063 
/* Create a new cpuset with settings 'cp' at location 'relpath' */
int cpuset_create(const char *relpath, const struct cpuset *cp)
{
	const int create_new = 1;	/* ask cr_or_mod to mkdir the cpuset */

	return cr_or_mod(relpath, cp, create_new);
}
2069 
2070 /* Delete cpuset at location 'path' (if empty) */
cpuset_delete(const char * relpath)2071 int cpuset_delete(const char *relpath)
2072 {
2073 	char buf[PATH_MAX];
2074 
2075 	if (check() < 0)
2076 		goto err;
2077 
2078 	fullpath(buf, sizeof(buf), relpath);
2079 	if (rmdir(buf) < 0)
2080 		goto err;
2081 
2082 	return 0;
2083 err:
2084 	return -1;
2085 }
2086 
/*
 * Set cpuset cp to the cpuset at location 'path'.
 *
 * Loads each per-cpuset control file and fills in the matching field
 * of 'cp', setting its *_valid flag as it goes.  Control files that
 * only exist on some kernel versions are probed with exists_flag()
 * first and silently skipped when absent.  Returns 0 on success, -1
 * on failure (errno set by the failing loader).
 */
int cpuset_query(struct cpuset *cp, const char *relpath)
{
	char buf[PATH_MAX];

	if (check() < 0)
		goto err;

	fullpath(buf, sizeof(buf), relpath);

	/* these flags exist on every kernel with cpuset support */
	if (load_flag(buf, &cp->cpu_exclusive, "cpu_exclusive") < 0)
		goto err;
	cp->cpu_exclusive_valid = 1;

	if (load_flag(buf, &cp->mem_exclusive, "mem_exclusive") < 0)
		goto err;
	cp->mem_exclusive_valid = 1;

	if (load_flag(buf, &cp->notify_on_release, "notify_on_release") < 0)
		goto err;
	cp->notify_on_release_valid = 1;

	/* the remaining flags are optional, depending on kernel version */
	if (exists_flag(buf, "memory_migrate")) {
		if (load_flag(buf, &cp->memory_migrate, "memory_migrate") < 0)
			goto err;
		cp->memory_migrate_valid = 1;
	}

	if (exists_flag(buf, "mem_hardwall")) {
		if (load_flag(buf, &cp->mem_hardwall, "mem_hardwall") < 0)
			goto err;
		cp->mem_hardwall_valid = 1;
	}

	if (exists_flag(buf, "memory_pressure_enabled")) {
		if (load_flag
		    (buf, &cp->memory_pressure_enabled,
		     "memory_pressure_enabled") < 0)
			goto err;
		cp->memory_pressure_enabled_valid = 1;
	}

	if (exists_flag(buf, "memory_spread_page")) {
		if (load_flag
		    (buf, &cp->memory_spread_page, "memory_spread_page") < 0)
			goto err;
		cp->memory_spread_page_valid = 1;
	}

	if (exists_flag(buf, "memory_spread_slab")) {
		if (load_flag
		    (buf, &cp->memory_spread_slab, "memory_spread_slab") < 0)
			goto err;
		cp->memory_spread_slab_valid = 1;
	}

	if (exists_flag(buf, "sched_load_balance")) {
		if (load_flag
		    (buf, &cp->sched_load_balance, "sched_load_balance") < 0)
			goto err;
		cp->sched_load_balance_valid = 1;
	}

	if (exists_flag(buf, "sched_relax_domain_level")) {
		if (load_number
		    (buf, &cp->sched_relax_domain_level,
		     "sched_relax_domain_level") < 0)
			goto err;
		cp->sched_relax_domain_level_valid = 1;
	}

	/* cpus and mems masks are always present */
	if (load_mask(buf, &cp->cpus, cpuset_cpus_nbits(), "cpus") < 0)
		goto err;
	cp->cpus_valid = 1;

	if (load_mask(buf, &cp->mems, cpuset_mems_nbits(), "mems") < 0)
		goto err;
	cp->mems_valid = 1;

	return 0;
err:
	return -1;
}
2170 
/* Modify the existing cpuset at location 'relpath' to the values of 'cp' */
int cpuset_modify(const char *relpath, const struct cpuset *cp)
{
	const int create_new = 0;	/* cpuset already exists - no mkdir */

	return cr_or_mod(relpath, cp, create_new);
}
2176 
/*
 * Get cpuset path of pid into buf.
 *
 * Reads /proc/<pid>/cpuset (or /proc/self/cpuset when pid == 0) into
 * the caller-supplied buf of the given size.  Returns buf on success.
 * On failure returns NULL with errno set: E2BIG if the path or result
 * does not fit in buf, ESRCH if there is no such pid, ENOSYS if the
 * kernel lacks cpuset support (no /proc/self/cpuset at all).
 */
char *cpuset_getcpusetpath(pid_t pid, char *buf, size_t size)
{
	int fd;			/* dual use: cpuset file for pid and self */
	int rc;			/* dual use: snprintf and read return codes */

	if (check() < 0)
		return NULL;

	/* borrow result buf[] to build cpuset file path */
	if (pid == 0)
		rc = snprintf(buf, size, "/proc/self/cpuset");
	else
		rc = snprintf(buf, size, "/proc/%d/cpuset", pid);
	if (rc >= (int)size) {
		errno = E2BIG;
		return NULL;
	}
	if ((fd = open(buf, O_RDONLY)) < 0) {
		int e = errno;
		/* a missing per-pid file means "no such process" */
		if (e == ENOENT)
			e = ESRCH;
		/* distinguish "no cpusets at all" from "no such pid" */
		if ((fd = open("/proc/self/cpuset", O_RDONLY)) < 0)
			e = ENOSYS;
		else
			close(fd);
		errno = e;
		return NULL;
	}
	rc = read(fd, buf, size);
	close(fd);
	if (rc < 0)
		return NULL;
	/* a completely full buffer means the path may be truncated */
	if (rc >= (int)size) {
		errno = E2BIG;
		return NULL;
	}
	buf[rc] = 0;
	chomp(buf);		/* strip the trailing newline */
	return buf;

}
2219 
2220 /* Get cpuset 'cp' of pid */
cpuset_cpusetofpid(struct cpuset * cp,pid_t pid)2221 int cpuset_cpusetofpid(struct cpuset *cp, pid_t pid)
2222 {
2223 	char buf[PATH_MAX];
2224 
2225 	if (cpuset_getcpusetpath(pid, buf, sizeof(buf)) == NULL)
2226 		return -1;
2227 	if (cpuset_query(cp, buf) < 0)
2228 		return -1;
2229 	return 0;
2230 }
2231 
2232 /* [optional] Return mountpoint of cpuset filesystem */
cpuset_mountpoint()2233 const char *cpuset_mountpoint()
2234 {
2235 	if (check() < 0) {
2236 		switch (errno) {
2237 		case ENODEV:
2238 			return "[cpuset filesystem not mounted]";
2239 		default:
2240 			return "[cpuset filesystem not supported]";
2241 		}
2242 	}
2243 	return cpusetmnt;
2244 }
2245 
/* Return true (1) iff 'path' names an existing directory. */
static int isdir(const char *path)
{
	struct stat sb;

	if (stat(path, &sb) != 0)
		return 0;	/* can't stat it: treat as "not a directory" */
	return S_ISDIR(sb.st_mode) ? 1 : 0;
}
2255 
2256 /*
2257  * [optional] cpuset_collides_exclusive() - True if would collide exclusive.
2258  *
2259  * Return true iff the specified cpuset would overlap with any
2260  * sibling cpusets in either cpus or mems, where either this
2261  * cpuset or the sibling is cpu_exclusive or mem_exclusive.
2262  *
2263  * cpuset_create() fails with errno == EINVAL if the requested cpuset
2264  * would overlap with any sibling, where either one is cpu_exclusive or
2265  * mem_exclusive.  This is a common, and not obvious error.  The
2266  * following routine checks for this particular case, so that code
2267  * creating cpusets can better identify the situation, perhaps to issue
2268  * a more informative error message.
2269  *
2270  * Can also be used to diagnose cpuset_modify failures.  This
2271  * routine ignores any existing cpuset with the same path as the
2272  * given 'cpusetpath', and only looks for exclusive collisions with
2273  * sibling cpusets of that path.
2274  *
2275  * In case of any error, returns (0) -- does not collide.  Presumably
2276  * any actual attempt to create or modify a cpuset will encounter the
2277  * same error, and report it usefully.
2278  *
2279  * This routine is not particularly efficient; most likely code creating or
2280  * modifying a cpuset will want to try the operation first, and then if that
2281  * fails with errno EINVAL, perhaps call this routine to determine if an
2282  * exclusive cpuset collision caused the error.
2283  */
2284 
cpuset_collides_exclusive(const char * cpusetpath,const struct cpuset * cp1)2285 int cpuset_collides_exclusive(const char *cpusetpath, const struct cpuset *cp1)
2286 {
2287 	char parent[PATH_MAX];
2288 	char *p;
2289 	char *pathcopy = NULL;
2290 	char *base;
2291 	DIR *dir = NULL;
2292 	struct dirent *dent;
2293 	struct cpuset *cp2 = NULL;
2294 	struct bitmask *cpus1 = NULL, *cpus2 = NULL;
2295 	struct bitmask *mems1 = NULL, *mems2 = NULL;
2296 	int ret;
2297 
2298 	if (check() < 0)
2299 		goto err;
2300 
2301 	fullpath(parent, sizeof(parent), cpusetpath);
2302 	if (streq(parent, cpusetmnt))
2303 		goto err;	/* only one cpuset root - can't collide */
2304 	pathcopy = strdup(parent);
2305 	p = strrchr(parent, '/');
2306 	if (!p)
2307 		goto err;	/* huh? - impossible - run and hide */
2308 	*p = 0;			/* now parent is dirname of fullpath */
2309 
2310 	p = strrchr(pathcopy, '/');
2311 	base = p + 1;		/* now base is basename of fullpath */
2312 	if (!*base)
2313 		goto err;	/* this is also impossible - run away */
2314 
2315 	if ((dir = opendir(parent)) == NULL)
2316 		goto err;
2317 	if ((cp2 = cpuset_alloc()) == NULL)
2318 		goto err;
2319 	if ((cpus1 = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
2320 		goto err;
2321 	if ((cpus2 = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
2322 		goto err;
2323 	if ((mems1 = bitmask_alloc(cpuset_mems_nbits())) == NULL)
2324 		goto err;
2325 	if ((mems2 = bitmask_alloc(cpuset_mems_nbits())) == NULL)
2326 		goto err;
2327 
2328 	while ((dent = readdir(dir)) != NULL) {
2329 		char child[PATH_MAX];
2330 
2331 		if (streq(dent->d_name, ".") || streq(dent->d_name, ".."))
2332 			continue;
2333 		if (streq(dent->d_name, base))
2334 			continue;
2335 		pathcat2(child, sizeof(child), parent, dent->d_name);
2336 		if (!isdir(child))
2337 			continue;
2338 		if (cpuset_query(cp2, child + strlen(cpusetmnt)) < 0)
2339 			goto err;
2340 		if (cp1->cpu_exclusive || cp2->cpu_exclusive) {
2341 			cpuset_getcpus(cp1, cpus1);
2342 			cpuset_getcpus(cp2, cpus2);
2343 			if (bitmask_intersects(cpus1, cpus2))
2344 				goto collides;
2345 		}
2346 		if (cp1->mem_exclusive || cp2->mem_exclusive) {
2347 			cpuset_getmems(cp1, mems1);
2348 			cpuset_getmems(cp2, mems2);
2349 			if (bitmask_intersects(mems1, mems2))
2350 				goto collides;
2351 		}
2352 	}
2353 err:
2354 	/* error, or did not collide */
2355 	ret = 0;
2356 	goto done;
2357 collides:
2358 	/* collides */
2359 	ret = 1;
2360 	/* fall into ... */
2361 done:
2362 	if (dir)
2363 		closedir(dir);
2364 	cpuset_free(cp2);
2365 	free(pathcopy);
2366 	bitmask_free(cpus1);
2367 	bitmask_free(cpus2);
2368 	bitmask_free(mems1);
2369 	bitmask_free(mems2);
2370 	return ret;
2371 }
2372 
2373 /*
2374  * [optional] cpuset_nuke() - Remove cpuset anyway possible
2375  *
2376  * Remove a cpuset, including killing tasks in it, and
2377  * removing any descendent cpusets and killing their tasks.
2378  *
2379  * Tasks can take a long time (minutes on some configurations)
2380  * to exit.  Loop up to 'seconds' seconds, trying to kill them.
2381  *
2382  * How we do it:
2383  *	1) First, kill all the pids, looping until there are
2384  *	   no more pids in this cpuset or below, or until the
2385  *	   'seconds' timeout limit is exceeded.
2386  *	2) Then depth first recursively rmdir the cpuset directories.
2387  *	3) If by this point the original cpuset is gone, we succeeded.
2388  *
2389  * If the timeout is exceeded, and tasks still exist, fail with
2390  * errno == ETIME.
2391  *
2392  * We sleep a variable amount of time.  After the first attempt to
2393  * kill all the tasks in the cpuset or its descendents, we sleep 1
2394  * second, the next time 2 seconds, increasing 1 second each loop
2395  * up to a max of 10 seconds.  If more loops past 10 are required
2396  * to kill all the tasks, we sleep 10 seconds each subsequent loop.
2397  * In any case, before the last loop, we sleep however many seconds
2398  * remain of the original timeout 'seconds' requested.  The total
2399  * time of all sleeps will be no more than the requested 'seconds'.
2400  *
2401  * If the cpuset started out empty of any tasks, or if the passed in
2402  * 'seconds' was zero, then this routine will return quickly, having
2403  * not slept at all.  Otherwise, this routine will at a minimum send
2404  * a SIGKILL to all the tasks in this cpuset subtree, then sleep one
2405  * second, before looking to see if any tasks remain.  If tasks remain
2406  * in the cpuset subtree, and a longer 'seconds' timeout was requested
2407  * (more than one), it will continue to kill remaining tasks and sleep,
2408  * in a loop, for as long as time and tasks remain.
2409  *
2410  * The signal sent for the kill is hardcoded to SIGKILL (9).  If some
2411  * other signal should be sent first, use a separate code loop,
2412  * perhaps based on cpuset_init_pidlist and cpuset_get_pidlist, to
2413  * scan the task pids in a cpuset.  If SIGKILL should -not- be sent,
2414  * this cpuset_nuke() routine can still be called to recursively
2415  * remove a cpuset subtree, by specifying a timeout of zero 'seconds'.
2416  *
2417  * On success, returns 0 with errno == 0.
2418  *
2419  * On failure, returns -1, with errno possibly one of:
2420  *  EACCES - search permission denied on intervening directory
2421  *  ETIME - timed out - tasks remain after 'seconds' timeout
2422  *  EMFILE - too many open files
2423  *  ENODEV - /dev/cpuset not mounted
2424  *  ENOENT - component of cpuset path doesn't exist
2425  *  ENOMEM - out of memory
2426  *  ENOSYS - kernel doesn't support cpusets
2427  *  ENOTDIR - component of cpuset path is not a directory
2428  *  EPERM - lacked permission to kill a task
2429  *  EPERM - lacked permission to read cpusets or files therein
2430  */
2431 
2432 void cpuset_fts_reverse(struct cpuset_fts_tree *cs_tree);
2433 
cpuset_nuke(const char * relpath,unsigned int seconds)2434 int cpuset_nuke(const char *relpath, unsigned int seconds)
2435 {
2436 	unsigned int secs_left = seconds;	/* total sleep seconds left */
2437 	unsigned int secs_loop = 1;	/* how much sleep next loop */
2438 	unsigned int secs_slept;	/* seconds slept in sleep() */
2439 	struct cpuset_pidlist *pl = NULL;	/* pids in cpuset subtree */
2440 	struct cpuset_fts_tree *cs_tree;
2441 	const struct cpuset_fts_entry *cs_entry;
2442 	int ret, sav_errno = 0;
2443 
2444 	if (check() < 0)
2445 		return -1;
2446 
2447 	if (seconds == 0)
2448 		goto rmdir_cpusets;
2449 
2450 	while (1) {
2451 		int plen, j;
2452 
2453 		if ((pl = cpuset_init_pidlist(relpath, 1)) == NULL) {
2454 			/* missing cpuset is as good as if already nuked */
2455 			if (errno == ENOENT) {
2456 				ret = 0;
2457 				goto no_more_cpuset;
2458 			}
2459 
2460 			/* other problems reading cpuset are bad news */
2461 			sav_errno = errno;
2462 			goto failed;
2463 		}
2464 
2465 		if ((plen = cpuset_pidlist_length(pl)) == 0)
2466 			goto rmdir_cpusets;
2467 
2468 		for (j = 0; j < plen; j++) {
2469 			pid_t pid;
2470 
2471 			if ((pid = cpuset_get_pidlist(pl, j)) > 1) {
2472 				if (kill(pid, SIGKILL) < 0 && errno != ESRCH) {
2473 					sav_errno = errno;
2474 					goto failed;
2475 				}
2476 			}
2477 		}
2478 
2479 		if (secs_left == 0)
2480 			goto took_too_long;
2481 
2482 		cpuset_freepidlist(pl);
2483 		pl = NULL;
2484 
2485 		secs_slept = secs_loop - sleep(secs_loop);
2486 
2487 		/* Ensure forward progress */
2488 		if (secs_slept == 0)
2489 			secs_slept = 1;
2490 
2491 		/* Ensure sane sleep() return (unnecessary?) */
2492 		if (secs_slept > secs_loop)
2493 			secs_slept = secs_loop;
2494 
2495 		secs_left -= secs_slept;
2496 
2497 		if (secs_loop < 10)
2498 			secs_loop++;
2499 
2500 		secs_loop = MIN(secs_left, secs_loop);
2501 	}
2502 
2503 took_too_long:
2504 	sav_errno = ETIME;
2505 	/* fall into ... */
2506 failed:
2507 	cpuset_freepidlist(pl);
2508 	errno = sav_errno;
2509 	return -1;
2510 
2511 rmdir_cpusets:
2512 	/* Let's try removing cpuset(s) now. */
2513 	cpuset_freepidlist(pl);
2514 
2515 	if ((cs_tree = cpuset_fts_open(relpath)) == NULL && errno != ENOENT)
2516 		return -1;
2517 	ret = 0;
2518 	cpuset_fts_reverse(cs_tree);	/* rmdir's must be done bottom up */
2519 	while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL) {
2520 		char buf[PATH_MAX];
2521 
2522 		fullpath(buf, sizeof(buf), cpuset_fts_get_path(cs_entry));
2523 		if (rmdir(buf) < 0 && errno != ENOENT) {
2524 			sav_errno = errno;
2525 			ret = -1;
2526 		}
2527 	}
2528 	cpuset_fts_close(cs_tree);
2529 	/* fall into ... */
2530 no_more_cpuset:
2531 	if (ret == 0)
2532 		errno = 0;
2533 	else
2534 		errno = sav_errno;
2535 	return ret;
2536 }
2537 
2538 /*
2539  * When recursively reading all the tasks files from a subtree,
2540  * chain together the read results, one pidblock per tasks file,
2541  * containing the raw unprocessed ascii as read(2) in.  After
2542  * we gather up this raw data, we then go back to count how
2543  * many pid's there are in total, allocate an array of pid_t
2544  * of that size, and transform the raw ascii data into this
2545  * array of pid_t's.
2546  */
2547 
/* One raw, unparsed chunk read from a single cpuset "tasks" file. */
struct pidblock {
	char *buf;		/* raw ascii pid text, nul-terminated */
	int buflen;		/* bytes of pid text (excludes the nul) */
	struct pidblock *next;	/* next block in the chain */
};
2553 
2554 /*
2555  * Chain the raw contents of a file onto the pbhead list.
2556  *
2557  * We malloc "+ 1" extra byte for a nul-terminator, so that
2558  * the strtoul() loop in pid_transform() won't scan past
2559  * the end of pb->buf[] and accidentally find more pids.
2560  */
add_pidblock(const char * file,struct pidblock ** ppbhead)2561 static void add_pidblock(const char *file, struct pidblock **ppbhead)
2562 {
2563 	FILE *fp = NULL;
2564 	struct pidblock *pb = NULL;
2565 	int fsz;
2566 
2567 	if ((fp = fopen(file, "r")) == NULL)
2568 		goto err;
2569 	fsz = filesize(fp);
2570 	if (fsz == 0)
2571 		goto err;
2572 	if ((pb = calloc(1, sizeof(*pb))) == NULL)
2573 		goto err;
2574 	pb->buflen = fsz;
2575 	if ((pb->buf = malloc(pb->buflen + 1)) == NULL)
2576 		goto err;
2577 	if (fread(pb->buf, 1, pb->buflen, fp) > 0) {
2578 		pb->buf[pb->buflen] = '\0';
2579 		pb->next = *ppbhead;
2580 		*ppbhead = pb;
2581 	}
2582 	fclose(fp);
2583 	return;
2584 err:
2585 	if (fp)
2586 		fclose(fp);
2587 	free(pb);
2588 }
2589 
read_task_file(const char * relpath,struct pidblock ** ppbhead)2590 static void read_task_file(const char *relpath, struct pidblock **ppbhead)
2591 {
2592 	char buf[PATH_MAX];
2593 
2594 	fullpath2(buf, sizeof(buf), relpath, "tasks");
2595 	add_pidblock(buf, ppbhead);
2596 }
2597 
/* Sorted array of pids, as returned by cpuset_init_pidlist(). */
struct cpuset_pidlist {
	pid_t *pids;		/* array of npids entries */
	int npids;		/* number of valid entries in pids[] */
};
2602 
/* Count how many pids in buf (one per line - just count newlines) */
static int pidcount(const char *buf, int buflen)
{
	const char *scan = buf;
	const char *end = buf + buflen;
	int newlines = 0;

	while ((scan = memchr(scan, '\n', end - scan)) != NULL) {
		newlines++;
		scan++;
	}
	return newlines;
}
2615 
2616 /* Transform one-per-line ascii pids in pb to pid_t entries in pl */
pid_transform(struct pidblock * pb,struct cpuset_pidlist * pl,int n)2617 static int pid_transform(struct pidblock *pb, struct cpuset_pidlist *pl, int n)
2618 {
2619 	char *a, *b;
2620 
2621 	for (a = pb->buf; a < pb->buf + pb->buflen; a = b) {
2622 		pid_t p = strtoul(a, &b, 10);
2623 		if (a == b)
2624 			break;
2625 		pl->pids[n++] = p;
2626 	}
2627 	return n;
2628 }
2629 
free_pidblocks(struct pidblock * pbhead)2630 static void free_pidblocks(struct pidblock *pbhead)
2631 {
2632 	struct pidblock *pb, *nextpb;
2633 
2634 	for (pb = pbhead; pb; pb = nextpb) {
2635 		nextpb = pb->next;
2636 		free(pb->buf);
2637 		free(pb);
2638 	}
2639 }
2640 
/* numeric comparison routine for qsort */
static int numericsort(const void *m1, const void *m2)
{
	pid_t p1 = *(pid_t *) m1;
	pid_t p2 = *(pid_t *) m2;

	/*
	 * Compare without subtraction: "p1 - p2" can overflow for
	 * extreme pid_t values (undefined behavior, wrong sign).
	 * Returns -1, 0, or 1.
	 */
	return (p1 > p2) - (p1 < p2);
}
2649 
2650 /* Return list pids in cpuset 'path' */
cpuset_init_pidlist(const char * relpath,int recursiveflag)2651 struct cpuset_pidlist *cpuset_init_pidlist(const char *relpath,
2652 					   int recursiveflag)
2653 {
2654 	struct pidblock *pb = NULL;
2655 	struct cpuset_pidlist *pl = NULL;
2656 	struct pidblock *pbhead = NULL;
2657 	int n;
2658 
2659 	if (check() < 0)
2660 		goto err;
2661 
2662 	if (recursiveflag) {
2663 		struct cpuset_fts_tree *cs_tree;
2664 		const struct cpuset_fts_entry *cs_entry;
2665 
2666 		if ((cs_tree = cpuset_fts_open(relpath)) == NULL)
2667 			goto err;
2668 		while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL) {
2669 			if (cpuset_fts_get_info(cs_entry) != CPUSET_FTS_CPUSET)
2670 				continue;
2671 			read_task_file(cpuset_fts_get_path(cs_entry), &pbhead);
2672 		}
2673 		cpuset_fts_close(cs_tree);
2674 	} else {
2675 		read_task_file(relpath, &pbhead);
2676 	}
2677 
2678 	if ((pl = calloc(1, sizeof(*pl))) == NULL)
2679 		goto err;
2680 	pl->npids = 0;
2681 	for (pb = pbhead; pb; pb = pb->next)
2682 		pl->npids += pidcount(pb->buf, pb->buflen);
2683 	if ((pl->pids = calloc(pl->npids, sizeof(pid_t))) == NULL)
2684 		goto err;
2685 	n = 0;
2686 	for (pb = pbhead; pb; pb = pb->next)
2687 		n = pid_transform(pb, pl, n);
2688 	free_pidblocks(pbhead);
2689 	qsort(pl->pids, pl->npids, sizeof(pid_t), numericsort);
2690 	return pl;
2691 err:
2692 	cpuset_freepidlist(pl);
2693 	free_pidblocks(pbhead);
2694 	return NULL;
2695 }
2696 
2697 /* Return number of elements in pidlist */
cpuset_pidlist_length(const struct cpuset_pidlist * pl)2698 int cpuset_pidlist_length(const struct cpuset_pidlist *pl)
2699 {
2700 	if (pl)
2701 		return pl->npids;
2702 	else
2703 		return 0;
2704 }
2705 
2706 /* Return i'th element of pidlist */
cpuset_get_pidlist(const struct cpuset_pidlist * pl,int i)2707 pid_t cpuset_get_pidlist(const struct cpuset_pidlist * pl, int i)
2708 {
2709 	if (pl && i >= 0 && i < pl->npids)
2710 		return pl->pids[i];
2711 	else
2712 		return (pid_t) - 1;
2713 }
2714 
2715 /* Free pidlist */
cpuset_freepidlist(struct cpuset_pidlist * pl)2716 void cpuset_freepidlist(struct cpuset_pidlist *pl)
2717 {
2718 	if (pl && pl->pids)
2719 		free(pl->pids);
2720 	free(pl);
2721 }
2722 
__cpuset_move(pid_t pid,const char * path)2723 static int __cpuset_move(pid_t pid, const char *path)
2724 {
2725 	char buf[SMALL_BUFSZ];
2726 
2727 	snprintf(buf, sizeof(buf), "%u", pid);
2728 	return write_string_file(path, buf);
2729 }
2730 
2731 /* Move task (pid == 0 for current) to a cpuset */
cpuset_move(pid_t pid,const char * relpath)2732 int cpuset_move(pid_t pid, const char *relpath)
2733 {
2734 	char buf[PATH_MAX];
2735 
2736 	if (check() < 0)
2737 		return -1;
2738 
2739 	if (pid == 0)
2740 		pid = getpid();
2741 
2742 	fullpath2(buf, sizeof(buf), relpath, "tasks");
2743 	return __cpuset_move(pid, buf);
2744 }
2745 
2746 /* Move all tasks in pidlist to a cpuset */
cpuset_move_all(struct cpuset_pidlist * pl,const char * relpath)2747 int cpuset_move_all(struct cpuset_pidlist *pl, const char *relpath)
2748 {
2749 	int i;
2750 	char buf[PATH_MAX];
2751 	int ret;
2752 
2753 	if (check() < 0)
2754 		return -1;
2755 
2756 	fullpath2(buf, sizeof(buf), relpath, "tasks");
2757 
2758 	ret = 0;
2759 	for (i = 0; i < pl->npids; i++)
2760 		if (__cpuset_move(pl->pids[i], buf) < 0)
2761 			ret = -1;
2762 	return ret;
2763 }
2764 
2765 /*
2766  * [optional] cpuset_move_cpuset_tasks() - Move all tasks in a
2767  *                                      cpuset to another cpuset
2768  *
2769  * Move all tasks in cpuset fromrelpath to cpuset torelpath. This may
2770  * race with tasks being added to or forking into fromrelpath. Loop
2771  * repeatedly, reading the tasks file of cpuset fromrelpath and writing
2772  * any task pid's found there to the tasks file of cpuset torelpath,
2773  * up to ten attempts, or until the tasks file of cpuset fromrelpath
2774  * is empty, or until fromrelpath is no longer present.
2775  *
2776  * Returns 0 with errno == 0 if able to empty the tasks file of cpuset
2777  * fromrelpath. Of course it is still possible that some independent
2778  * task could add another task to cpuset fromrelpath at the same time
2779  * that such a successful result is being returned, so there can be
2780  * no guarantee that a successful return means that fromrelpath is
2781  * still empty of tasks.
2782  *
2783  * We are careful to allow for the possibility that the cpuset
2784  * fromrelpath might disappear out from under us, perhaps because it
2785  * has notify_on_release set and gets automatically removed as soon
2786  * as we detach its last task from it.  Consider a missing fromrelpath
2787  * to be a successful move.
2788  *
2789  * If called with fromrelpath and torelpath pathnames that evaluate to
2790  * the same cpuset, then treat that as if cpuset_reattach() was called,
2791  * rebinding each task in this cpuset one time, and return success or
2792  * failure depending on the return of that cpuset_reattach() call.
2793  *
2794  * On failure, returns -1, with errno possibly one of:
2795  *  EACCES - search permission denied on intervening directory
2796  *  ENOTEMPTY - tasks remain after multiple attempts to move them
2797  *  EMFILE - too many open files
2798  *  ENODEV - /dev/cpuset not mounted
2799  *  ENOENT - component of cpuset path doesn't exist
2800  *  ENOMEM - out of memory
2801  *  ENOSYS - kernel doesn't support cpusets
2802  *  ENOTDIR - component of cpuset path is not a directory
2803  *  EPERM - lacked permission to kill a task
2804  *  EPERM - lacked permission to read cpusets or files therein
2805  *
2806  * This is an [optional] function. Use cpuset_function to invoke it.
2807  */
2808 
2809 #define NUMBER_MOVE_TASK_ATTEMPTS 10
2810 
int cpuset_move_cpuset_tasks(const char *fromrelpath, const char *torelpath)
{
	char fromfullpath[PATH_MAX];
	char tofullpath[PATH_MAX];
	int i;
	struct cpuset_pidlist *pl = NULL;
	int sav_errno;

	fullpath(fromfullpath, sizeof(fromfullpath), fromrelpath);
	fullpath(tofullpath, sizeof(tofullpath), torelpath);

	/* same cpuset both sides: degenerate to rebinding its tasks */
	if (samefile(fromfullpath, tofullpath))
		return cpuset_reattach(fromrelpath);

	/* retry: tasks may fork into 'from' while we drain it */
	for (i = 0; i < NUMBER_MOVE_TASK_ATTEMPTS; i++) {
		int plen, j;

		if ((pl = cpuset_init_pidlist(fromrelpath, 0)) == NULL) {
			/* missing cpuset is as good as if all moved */
			if (errno == ENOENT)
				goto no_more_cpuset;

			/* other problems reading cpuset are bad news */
			sav_errno = errno;
			goto failed;
		}

		if ((plen = cpuset_pidlist_length(pl)) == 0)
			goto no_more_pids;

		for (j = 0; j < plen; j++) {
			pid_t pid;

			pid = cpuset_get_pidlist(pl, j);
			if (cpuset_move(pid, torelpath) < 0) {
				/* missing task is as good as if moved */
				if (errno == ESRCH)
					continue;

				/* other per-task errors are bad news */
				sav_errno = errno;
				goto failed;
			}
		}

		cpuset_freepidlist(pl);
		pl = NULL;
	}

	/* ran out of attempts with tasks still present */
	sav_errno = ENOTEMPTY;
	/* fall into ... */
failed:
	cpuset_freepidlist(pl);
	errno = sav_errno;
	return -1;

no_more_pids:
no_more_cpuset:
	/* Success - all tasks (or entire cpuset ;) gone. */
	cpuset_freepidlist(pl);
	errno = 0;
	return 0;
}
2874 
2875 /* Migrate task (pid == 0 for current) to a cpuset (moves task and memory) */
cpuset_migrate(pid_t pid,const char * relpath)2876 int cpuset_migrate(pid_t pid, const char *relpath)
2877 {
2878 	char buf[PATH_MAX];
2879 	char buf2[PATH_MAX];
2880 	char memory_migrate_flag;
2881 	int r;
2882 
2883 	if (check() < 0)
2884 		return -1;
2885 
2886 	if (pid == 0)
2887 		pid = getpid();
2888 
2889 	fullpath(buf2, sizeof(buf2), relpath);
2890 
2891 	if (load_flag(buf2, &memory_migrate_flag, "memory_migrate") < 0)
2892 		return -1;
2893 	if (store_flag(buf2, "memory_migrate", 1) < 0)
2894 		return -1;
2895 
2896 	fullpath2(buf, sizeof(buf), relpath, "tasks");
2897 
2898 	r = __cpuset_move(pid, buf);
2899 
2900 	store_flag(buf2, "memory_migrate", memory_migrate_flag);
2901 	return r;
2902 }
2903 
/* Migrate all tasks in pidlist to a cpuset (moves task and memory) */
int cpuset_migrate_all(struct cpuset_pidlist *pl, const char *relpath)
{
	int i;
	char buf[PATH_MAX];
	char buf2[PATH_MAX];
	char memory_migrate_flag;	/* saved setting, restored below */
	int ret;

	if (check() < 0)
		return -1;

	fullpath(buf2, sizeof(buf2), relpath);

	/* save the cpuset's memory_migrate setting ... */
	if (load_flag(buf2, &memory_migrate_flag, "memory_migrate") < 0)
		return -1;
	/* ... and force it on while the tasks are moved in */
	if (store_flag(buf2, "memory_migrate", 1) < 0)
		return -1;

	fullpath2(buf, sizeof(buf), relpath, "tasks");

	/* keep going on per-pid failure; report it at the end */
	ret = 0;
	for (i = 0; i < pl->npids; i++)
		if (__cpuset_move(pl->pids[i], buf) < 0)
			ret = -1;

	/* restore the original memory_migrate setting */
	if (store_flag(buf2, "memory_migrate", memory_migrate_flag) < 0)
		ret = -1;
	return ret;
}
2934 
2935 /* Rebind cpus_allowed of each task in cpuset 'path' */
cpuset_reattach(const char * relpath)2936 int cpuset_reattach(const char *relpath)
2937 {
2938 	struct cpuset_pidlist *pl;
2939 	int rc;
2940 
2941 	if ((pl = cpuset_init_pidlist(relpath, 0)) == NULL)
2942 		return -1;
2943 	rc = cpuset_move_all(pl, relpath);
2944 	cpuset_freepidlist(pl);
2945 	return rc;
2946 }
2947 
2948 /* Map cpuset relative cpu number to system wide cpu number */
cpuset_c_rel_to_sys_cpu(const struct cpuset * cp,int cpu)2949 int cpuset_c_rel_to_sys_cpu(const struct cpuset *cp, int cpu)
2950 {
2951 	struct cpuset *cp_tofree = NULL;
2952 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
2953 	int pos = -1;
2954 
2955 	if (!cp1)
2956 		goto err;
2957 	pos = bitmask_rel_to_abs_pos(cp1->cpus, cpu);
2958 	/* fall into ... */
2959 err:
2960 	cpuset_free(cp_tofree);
2961 	return pos;
2962 }
2963 
2964 /* Map system wide cpu number to cpuset relative cpu number */
cpuset_c_sys_to_rel_cpu(const struct cpuset * cp,int cpu)2965 int cpuset_c_sys_to_rel_cpu(const struct cpuset *cp, int cpu)
2966 {
2967 	struct cpuset *cp_tofree = NULL;
2968 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
2969 	int pos = -1;
2970 
2971 	if (!cp1)
2972 		goto err;
2973 	pos = bitmask_abs_to_rel_pos(cp1->cpus, cpu);
2974 	/* fall into ... */
2975 err:
2976 	cpuset_free(cp_tofree);
2977 	return pos;
2978 }
2979 
2980 /* Map cpuset relative mem number to system wide mem number */
cpuset_c_rel_to_sys_mem(const struct cpuset * cp,int mem)2981 int cpuset_c_rel_to_sys_mem(const struct cpuset *cp, int mem)
2982 {
2983 	struct cpuset *cp_tofree = NULL;
2984 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
2985 	int pos = -1;
2986 
2987 	if (!cp1)
2988 		goto err;
2989 	pos = bitmask_rel_to_abs_pos(cp1->mems, mem);
2990 	/* fall into ... */
2991 err:
2992 	cpuset_free(cp_tofree);
2993 	return pos;
2994 }
2995 
2996 /* Map system wide mem number to cpuset relative mem number */
cpuset_c_sys_to_rel_mem(const struct cpuset * cp,int mem)2997 int cpuset_c_sys_to_rel_mem(const struct cpuset *cp, int mem)
2998 {
2999 	struct cpuset *cp_tofree = NULL;
3000 	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
3001 	int pos = -1;
3002 
3003 	if (!cp1)
3004 		goto err;
3005 	pos = bitmask_abs_to_rel_pos(cp1->mems, mem);
3006 	/* fall into ... */
3007 err:
3008 	cpuset_free(cp_tofree);
3009 	return pos;
3010 }
3011 
3012 /* Map pid's cpuset relative cpu number to system wide cpu number */
cpuset_p_rel_to_sys_cpu(pid_t pid,int cpu)3013 int cpuset_p_rel_to_sys_cpu(pid_t pid, int cpu)
3014 {
3015 	struct cpuset *cp;
3016 	int rc = -1;
3017 
3018 	if ((cp = cpuset_alloc()) == NULL)
3019 		goto done;
3020 	if (cpuset_cpusetofpid(cp, pid) < 0)
3021 		goto done;
3022 	rc = cpuset_c_rel_to_sys_cpu(cp, cpu);
3023 done:
3024 	cpuset_free(cp);
3025 	return rc;
3026 }
3027 
3028 /* Map system wide cpu number to pid's cpuset relative cpu number */
cpuset_p_sys_to_rel_cpu(pid_t pid,int cpu)3029 int cpuset_p_sys_to_rel_cpu(pid_t pid, int cpu)
3030 {
3031 	struct cpuset *cp;
3032 	int rc = -1;
3033 
3034 	if ((cp = cpuset_alloc()) == NULL)
3035 		goto done;
3036 	if (cpuset_cpusetofpid(cp, pid) < 0)
3037 		goto done;
3038 	rc = cpuset_c_sys_to_rel_cpu(cp, cpu);
3039 done:
3040 	cpuset_free(cp);
3041 	return rc;
3042 }
3043 
3044 /* Map pid's cpuset relative mem number to system wide mem number */
cpuset_p_rel_to_sys_mem(pid_t pid,int mem)3045 int cpuset_p_rel_to_sys_mem(pid_t pid, int mem)
3046 {
3047 	struct cpuset *cp;
3048 	int rc = -1;
3049 
3050 	if ((cp = cpuset_alloc()) == NULL)
3051 		goto done;
3052 	if (cpuset_cpusetofpid(cp, pid) < 0)
3053 		goto done;
3054 	rc = cpuset_c_rel_to_sys_mem(cp, mem);
3055 done:
3056 	cpuset_free(cp);
3057 	return rc;
3058 }
3059 
3060 /* Map system wide mem number to pid's cpuset relative mem number */
cpuset_p_sys_to_rel_mem(pid_t pid,int mem)3061 int cpuset_p_sys_to_rel_mem(pid_t pid, int mem)
3062 {
3063 	struct cpuset *cp;
3064 	int rc = -1;
3065 
3066 	if ((cp = cpuset_alloc()) == NULL)
3067 		goto done;
3068 	if (cpuset_cpusetofpid(cp, pid) < 0)
3069 		goto done;
3070 	rc = cpuset_c_sys_to_rel_mem(cp, mem);
3071 done:
3072 	cpuset_free(cp);
3073 	return rc;
3074 }
3075 
3076 /*
3077  * Override glibc's calls for get/set affinity - they have
3078  * something using cpu_set_t that will die when NR_CPUS > 1024.
3079  * Go directly to the 'real' system calls.  Also override calls
3080  * for get_mempolicy and set_mempolicy.  None of these
3081  * calls are yet (July 2004) guaranteed to be in all glibc versions
3082  * that we care about.
3083  */
3084 
/*
 * Bind task 'pid' to the CPUs in 'mask' via the raw
 * sched_setaffinity(2) syscall, bypassing glibc's cpu_set_t wrapper
 * (which, per the comment above, breaks when NR_CPUS > 1024).
 * 'len' is the size of 'mask' in bytes.  Returns the syscall's
 * result: 0 on success, -1 with errno set on failure.
 */
static int sched_setaffinity(pid_t pid, unsigned len, unsigned long *mask)
{
	return ltp_syscall(__NR_sched_setaffinity, pid, len, mask);
}
3089 
#if HAVE_DECL_MPOL_F_ADDR && HAVE_DECL_MPOL_F_NODE
/*
 * Raw get_mempolicy(2) wrapper; a direct syscall is used because the
 * call is not guaranteed to exist in all glibc versions we support.
 * Argument semantics are exactly those of get_mempolicy(2).
 */
static int get_mempolicy(int *policy, unsigned long *nmask,
			 unsigned long maxnode, void *addr, int flags)
{
	return ltp_syscall(__NR_get_mempolicy, policy, nmask, maxnode,
		addr, flags);
}
#endif
3098 
#if HAVE_DECL_MPOL_BIND || HAVE_DECL_MPOL_DEFAULT
/*
 * Raw set_mempolicy(2) wrapper; a direct syscall is used because the
 * call is not guaranteed to exist in all glibc versions we support.
 * Argument semantics are exactly those of set_mempolicy(2).
 */
static int set_mempolicy(int mode, unsigned long *nmask, unsigned long maxnode)
{
	return ltp_syscall(__NR_set_mempolicy, mode, nmask, maxnode);
}
#endif
3105 
/*
 * Snapshot of a task's cpuset placement, captured by
 * cpuset_get_placement() and compared by cpuset_equal_placement()
 * to detect concurrent cpuset migration.
 */
struct cpuset_placement {
	struct bitmask *cpus;	/* cpus of the task's cpuset at capture time */
	struct bitmask *mems;	/* mems of the task's cpuset at capture time */
	char *path;		/* cpuset path (relative to cpuset mount) */
};
3111 
3112 /* Allocate and fill in a placement struct - cpatures current placement */
cpuset_get_placement(pid_t pid)3113 struct cpuset_placement *cpuset_get_placement(pid_t pid)
3114 {
3115 	struct cpuset_placement *plc;
3116 	struct cpuset *cp = NULL;
3117 	char buf[PATH_MAX];
3118 	int nbits;
3119 
3120 	if ((plc = calloc(1, sizeof(*plc))) == NULL)
3121 		goto err;
3122 
3123 	nbits = cpuset_cpus_nbits();
3124 	if ((plc->cpus = bitmask_alloc(nbits)) == NULL)
3125 		goto err;
3126 
3127 	nbits = cpuset_mems_nbits();
3128 	if ((plc->mems = bitmask_alloc(nbits)) == NULL)
3129 		goto err;
3130 
3131 	if ((cp = cpuset_alloc()) == NULL)
3132 		goto err;
3133 	if (cpuset_getcpusetpath(pid, buf, sizeof(buf)) == NULL)
3134 		goto err;
3135 	if (cpuset_query(cp, buf) < 0)
3136 		goto err;
3137 
3138 	bitmask_copy(plc->cpus, cp->cpus);
3139 	bitmask_copy(plc->mems, cp->mems);
3140 	plc->path = strdup(buf);
3141 
3142 	cpuset_free(cp);
3143 	return plc;
3144 err:
3145 	cpuset_free(cp);
3146 	cpuset_free_placement(plc);
3147 	return NULL;
3148 }
3149 
3150 /* Compare two placement structs - use to detect changes in placement */
cpuset_equal_placement(const struct cpuset_placement * plc1,const struct cpuset_placement * plc2)3151 int cpuset_equal_placement(const struct cpuset_placement *plc1,
3152 			   const struct cpuset_placement *plc2)
3153 {
3154 	return bitmask_equal(plc1->cpus, plc2->cpus) &&
3155 	    bitmask_equal(plc1->mems, plc2->mems) &&
3156 	    streq(plc1->path, plc2->path);
3157 }
3158 
3159 /* Free a placement struct */
cpuset_free_placement(struct cpuset_placement * plc)3160 void cpuset_free_placement(struct cpuset_placement *plc)
3161 {
3162 	if (!plc)
3163 		return;
3164 	bitmask_free(plc->cpus);
3165 	bitmask_free(plc->mems);
3166 	free(plc->path);
3167 	free(plc);
3168 }
3169 
3170 /*
3171  * A cpuset_fts_open() call constructs a linked list of entries
3172  * called a "cpuset_fts_tree", with one entry per cpuset below
3173  * the specified path.  The cpuset_fts_read() routine returns the
3174  * next entry on this list.  The various cpuset_fts_get_*() calls
3175  * return attributes of the specified entry.  The cpuset_fts_close()
3176  * call frees the linked list and all associated data.  All cpuset
3177  * entries and attributes for the cpuset_fts_tree returned from a
3178  * given cpuset_fts_open() call remain allocated and unchanged until
3179  * that cpuset_fts_tree is closed by a cpuset_fts_close() call.  Any
3180  * subsequent changes to the cpuset filesystem will go unnoticed
3181  * (not affect open cpuset_fts_tree's.)
3182  */
3183 
struct cpuset_fts_entry;
void cpuset_fts_rewind(struct cpuset_fts_tree *cs_tree);

/* Opaque handle returned by cpuset_fts_open(): a singly linked list. */
struct cpuset_fts_tree {
	struct cpuset_fts_entry *head;	/* head of linked entry list */
	struct cpuset_fts_entry *next;	/* cpuset_fts_read() offset */
};

/* One node of the list: a single cpuset directory seen during the walk. */
struct cpuset_fts_entry {
	struct cpuset_fts_entry *next;	/* linked entry list chain */
	struct cpuset *cpuset;	/* queried cpuset settings (NULL on error) */
	struct stat *stat;	/* stat(2) of cpuset dir (NULL on error) */
	char *path;		/* cpuset path relative to cpuset mount */
	int info;		/* CPUSET_FTS_* operation/error identity */
	int err;		/* errno value if an operation failed */
};
3200 
/*
 * Open a handle on a cpuset hierarchy.  All the real work is done here.
 *
 * Walks the cpuset filesystem below 'cpusetpath' with fts(3), building
 * one list entry per directory (each directory is a cpuset).  Per-entry
 * failures (unreadable dir, failed stat, failed query) are recorded in
 * the entry's info/err fields and the walk continues; only allocation
 * or fts failures abort the whole open and return NULL.
 */
struct cpuset_fts_tree *cpuset_fts_open(const char *cpusetpath)
{
	FTS *fts = NULL;
	FTSENT *ftsent;
	char *path_argv[2];
	char buf[PATH_MAX];
	struct cpuset_fts_tree *cs_tree = NULL;
	struct cpuset_fts_entry *ep;	/* the latest new list entry */
	struct cpuset_fts_entry **pnlep;	/* ptr to next list entry ptr */
	char *relpath;
	int fts_flags;

	/* Make the cpuset-relative path absolute in the cpuset filesystem. */
	fullpath(buf, sizeof(buf), cpusetpath);
	path_argv[0] = buf;
	path_argv[1] = NULL;

	/* FTS_NOSTAT: readable cpuset dirs are stat(2)'ed explicitly below. */
	fts_flags = FTS_PHYSICAL | FTS_NOCHDIR | FTS_NOSTAT | FTS_XDEV;
	fts = fts_open(path_argv, fts_flags, NULL);
	if (fts == NULL)
		goto err;

	cs_tree = malloc(sizeof(*cs_tree));
	if (cs_tree == NULL)
		goto err;
	/* pnlep always points at the list's terminating NULL link. */
	pnlep = &cs_tree->head;
	*pnlep = NULL;

	while ((ftsent = fts_read(fts)) != NULL) {
		/* Only pre-order directories and unreadable dirs matter. */
		if (ftsent->fts_info != FTS_D && ftsent->fts_info != FTS_DNR)
			continue;

		/* ftsent is a directory (perhaps unreadable) ==> cpuset */
		ep = calloc(1, sizeof(*ep));
		if (ep == NULL)
			goto err;
		*pnlep = ep;
		pnlep = &ep->next;

		/* Set entry's path, and if DNR, error */
		relpath = ftsent->fts_path + strlen(cpusetmnt);
		if (strlen(relpath) == 0)
			relpath = "/";
		ep->path = strdup(relpath);
		if (ep->path == NULL)
			goto err;
		if (ftsent->fts_info == FTS_DNR) {
			ep->info = CPUSET_FTS_ERR_DNR;
			ep->err = ftsent->fts_errno;
			continue;
		}

		/* ftsent is a -readable- cpuset: set entry's stat, etc */
		ep->stat = calloc(1, sizeof(struct stat));
		if (ep->stat == NULL)
			goto err;
		if (stat(ftsent->fts_path, ep->stat) < 0) {
			ep->info = CPUSET_FTS_ERR_STAT;
			/* NOTE(review): the failure came from our own
			 * stat() call, so errno rather than fts_errno
			 * looks like the accurate code here -- confirm. */
			ep->err = ftsent->fts_errno;
			continue;
		}

		ep->cpuset = calloc(1, sizeof(struct cpuset));
		if (ep->cpuset == NULL)
			goto err;
		if (cpuset_query(ep->cpuset, relpath) < 0) {
			ep->info = CPUSET_FTS_ERR_CPUSET;
			ep->err = errno;
			continue;
		}
		ep->info = CPUSET_FTS_CPUSET;
	}

	(void)fts_close(fts);
	cpuset_fts_rewind(cs_tree);
	return cs_tree;

err:
	/* cpuset_fts_close() frees whatever entries were already chained. */
	if (cs_tree)
		cpuset_fts_close(cs_tree);
	if (fts)
		(void)fts_close(fts);
	return NULL;
}
3285 
3286 /* Return pointer to next cpuset entry in hierarchy */
cpuset_fts_read(struct cpuset_fts_tree * cs_tree)3287 const struct cpuset_fts_entry *cpuset_fts_read(struct cpuset_fts_tree *cs_tree)
3288 {
3289 	const struct cpuset_fts_entry *cs_entry = cs_tree->next;
3290 	if (cs_tree->next != NULL)	/* seek to next entry */
3291 		cs_tree->next = cs_tree->next->next;
3292 	return cs_entry;
3293 }
3294 
3295 /* Reverse list of cpusets, in place.  Simulates pre-order/post-order flip. */
cpuset_fts_reverse(struct cpuset_fts_tree * cs_tree)3296 void cpuset_fts_reverse(struct cpuset_fts_tree *cs_tree)
3297 {
3298 	struct cpuset_fts_entry *cs1, *cs2, *cs3;
3299 
3300 	/*
3301 	 * At each step, cs1 < cs2 < cs3 and the cs2->next pointer
3302 	 * is redirected from cs3 to cs1.
3303 	 */
3304 
3305 	cs1 = cs2 = NULL;
3306 	cs3 = cs_tree->head;
3307 	while (cs3) {
3308 		cs1 = cs2;
3309 		cs2 = cs3;
3310 		cs3 = cs3->next;
3311 		cs2->next = cs1;
3312 	}
3313 	cs_tree->head = cs2;
3314 	cpuset_fts_rewind(cs_tree);
3315 }
3316 
/* Rewind cpuset list to beginning */
void cpuset_fts_rewind(struct cpuset_fts_tree *cs_tree)
{
	/* Next cpuset_fts_read() will return the head entry again. */
	cs_tree->next = cs_tree->head;
}
3322 
/* Return pointer to nul-terminated cpuset path of entry in hierarchy */
const char *cpuset_fts_get_path(const struct cpuset_fts_entry *cs_entry)
{
	/* Path is relative to the cpuset mount point ("/" for the root). */
	return cs_entry->path;
}
3328 
/* Return pointer to stat(2) structure of a cpuset entry's directory */
const struct stat *cpuset_fts_get_stat(const struct cpuset_fts_entry *cs_entry)
{
	/* NULL if the open-time stat() failed for this entry. */
	return cs_entry->stat;
}
3334 
/* Return pointer to cpuset structure of a cpuset entry */
const struct cpuset *cpuset_fts_get_cpuset(const struct cpuset_fts_entry
					   *cs_entry)
{
	/* NULL unless the open-time cpuset_query() succeeded. */
	return cs_entry->cpuset;
}
3341 
/* Return value of errno (0 if no error) on attempted cpuset operations */
int cpuset_fts_get_errno(const struct cpuset_fts_entry *cs_entry)
{
	return cs_entry->err;
}
3347 
/* Return operation identity causing error */
int cpuset_fts_get_info(const struct cpuset_fts_entry *cs_entry)
{
	/* One of the CPUSET_FTS_* codes set during cpuset_fts_open(). */
	return cs_entry->info;
}
3353 
3354 /* Close a cpuset hierarchy handle (free's all associated memory) */
cpuset_fts_close(struct cpuset_fts_tree * cs_tree)3355 void cpuset_fts_close(struct cpuset_fts_tree *cs_tree)
3356 {
3357 	struct cpuset_fts_entry *cs_entry = cs_tree->head;
3358 
3359 	while (cs_entry) {
3360 		struct cpuset_fts_entry *ep = cs_entry;
3361 
3362 		cs_entry = cs_entry->next;
3363 		free(ep->path);
3364 		free(ep->stat);
3365 		cpuset_free(ep->cpuset);
3366 		free(ep);
3367 	}
3368 	free(cs_tree);
3369 }
3370 
3371 /* Bind current task to cpu (uses sched_setaffinity(2)) */
cpuset_cpubind(int cpu)3372 int cpuset_cpubind(int cpu)
3373 {
3374 	struct bitmask *bmp;
3375 	int r;
3376 
3377 	if ((bmp = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
3378 		return -1;
3379 	bitmask_setbit(bmp, cpu);
3380 	r = sched_setaffinity(0, bitmask_nbytes(bmp), bitmask_mask(bmp));
3381 	bitmask_free(bmp);
3382 	return r;
3383 }
3384 
3385 /*
3386  * int cpuset_latestcpu(pid_t pid)
3387  *
3388  * Return most recent CPU on which task pid executed.  If pid == 0,
3389  * examine current task.
3390  *
3391  * The last used CPU is visible for a given pid as field #39 (starting
3392  * with #1) in the file /proc/pid/stat.  Currently this file has 41
3393  * fields, in which case this is the 3rd to the last field.
3394  *
3395  * Unfortunately field #2 is a command name and might have embedded
3396  * whitespace.  So we can't just count white space separated fields.
3397  * Fortunately, this command name is surrounded by parentheses, as
3398  * for example "(sh)", and that closing parenthesis is the last ')'
3399  * character in the line.  No remaining fields can have embedded
3400  * whitespace or parentheses.  So instead of looking for the 39th
3401  * white space separated field, we can look for the 37th white space
3402  * separated field past the last ')' character on the line.
3403  */
3404 
3405 /* Return most recent CPU on which task pid executed */
cpuset_latestcpu(pid_t pid)3406 int cpuset_latestcpu(pid_t pid)
3407 {
3408 	char buf[PATH_MAX];
3409 	char *bp;
3410 	int fd = -1;
3411 	int cpu = -1;
3412 
3413 	if (pid == 0)
3414 		snprintf(buf, sizeof(buf), "/proc/self/stat");
3415 	else
3416 		snprintf(buf, sizeof(buf), "/proc/%d/stat", pid);
3417 
3418 	if ((fd = open(buf, O_RDONLY)) < 0)
3419 		goto err;
3420 	if (read(fd, buf, sizeof(buf)) < 1)
3421 		goto err;
3422 	close(fd);
3423 
3424 	bp = strrchr(buf, ')');
3425 	if (bp)
3426 		sscanf(bp + 1, "%*s %*u %*u %*u %*u %*u %*u %*u " "%*u %*u %*u %*u %*u %*u %*u %*u %*u %*u " "%*u %*u %*u %*u %*u %*u %*u %*u %*u %*u " "%*u %*u %*u %*u %*u %*u %*u %*u %u",	/* 37th field past ')' */
3427 		       &cpu);
3428 	if (cpu < 0)
3429 		errno = EINVAL;
3430 	return cpu;
3431 err:
3432 	if (fd >= 0)
3433 		close(fd);
3434 	return -1;
3435 }
3436 
3437 /* Bind current task to memory (uses set_mempolicy(2)) */
cpuset_membind(int mem)3438 int cpuset_membind(int mem)
3439 {
3440 	struct bitmask *bmp;
3441 	int r;
3442 
3443 	if ((bmp = bitmask_alloc(cpuset_mems_nbits())) == NULL)
3444 		return -1;
3445 	bitmask_setbit(bmp, mem);
3446 #if HAVE_DECL_MPOL_BIND
3447 	r = set_mempolicy(MPOL_BIND, bitmask_mask(bmp), bitmask_nbits(bmp) + 1);
3448 #else
3449 	r = -1;
3450 	errno = ENOSYS;
3451 #endif
3452 	bitmask_free(bmp);
3453 	return r;
3454 }
3455 
/* [optional] Return Memory Node holding page at specified addr */
int cpuset_addr2node(void *addr)
{
	int node = -1;

#if HAVE_DECL_MPOL_F_ADDR && HAVE_DECL_MPOL_F_NODE
	/* MPOL_F_NODE | MPOL_F_ADDR makes get_mempolicy(2) report the node
	 * holding the page at 'addr' instead of a policy value. */
	if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR)) {
		/* I realize this seems redundant, but I _want_ to make sure
		 * that this value is -1. */
		node = -1;
	}
#endif
	/* Without mempolicy support this compiles to an unconditional -1. */
	return node;
}
3470 
3471 /*
3472  * Transform cpuset into Text Format Representation in buffer 'buf',
3473  * of length 'buflen', nul-terminated if space allows.  Return number
3474  * of characters that would have been written, if enough space had
3475  * been available, in the same way that snprintf() does.
3476  */
3477 
3478 /* Export cpuset settings to a regular file */
cpuset_export(const struct cpuset * cp,char * buf,int buflen)3479 int cpuset_export(const struct cpuset *cp, char *buf, int buflen)
3480 {
3481 	char *tmp = NULL;
3482 	int n = 0;
3483 
3484 	if (cp->cpu_exclusive)
3485 		n += snprintf(buf + n, MAX(buflen - n, 0), "cpu_exclusive\n");
3486 
3487 	if (cp->mem_exclusive)
3488 		n += snprintf(buf + n, MAX(buflen - n, 0), "mem_exclusive\n");
3489 
3490 	if (cp->notify_on_release)
3491 		n += snprintf(buf + n, MAX(buflen - n, 0),
3492 			      "notify_on_release\n");
3493 
3494 	if (cp->memory_pressure_enabled)
3495 		n += snprintf(buf + n, MAX(buflen - n, 0),
3496 			      "memory_pressure_enabled\n");
3497 
3498 	if (cp->memory_migrate)
3499 		n += snprintf(buf + n, MAX(buflen - n, 0), "memory_migrate\n");
3500 
3501 	if (cp->memory_spread_page)
3502 		n += snprintf(buf + n, MAX(buflen - n, 0),
3503 			      "memory_spread_page\n");
3504 
3505 	if (cp->memory_spread_slab)
3506 		n += snprintf(buf + n, MAX(buflen - n, 0),
3507 			      "memory_spread_slab\n");
3508 
3509 	if ((tmp = sprint_mask_buf(cp->cpus)) == NULL)
3510 		return -1;
3511 	n += snprintf(buf + n, MAX(buflen - n, 0), "cpus %s\n", tmp);
3512 	free(tmp);
3513 	tmp = NULL;
3514 
3515 	if ((tmp = sprint_mask_buf(cp->mems)) == NULL)
3516 		return -1;
3517 	n += snprintf(buf + n, MAX(buflen - n, 0), "mems %s\n", tmp);
3518 	free(tmp);
3519 	tmp = NULL;
3520 
3521 	return n;
3522 }
3523 
/* Parse a cpu/mem list 'arg' into 'bmp'; on bad input fill emsg, return -1. */
static int import_list(UNUSED const char *tok, const char *arg,
		       struct bitmask *bmp, char *emsg, int elen)
{
	if (bitmask_parselist(arg, bmp) >= 0)
		return 0;
	if (emsg)
		snprintf(emsg, elen, "Invalid list format: %s", arg);
	return -1;
}
3534 
/* Lowercase a nul-terminated string in place. */
static void stolower(char *s)
{
	/* Cast through unsigned char: tolower() on a negative char is UB. */
	for (; *s != '\0'; s++)
		*s = tolower((unsigned char)*s);
}
3543 
/*
 * Import cpuset settings from a regular file.
 *
 * Parses the Text Format Representation in 'buf' (the form written by
 * cpuset_export(): one token per line, optionally with one argument,
 * '#' starting a comment) into 'cp'.  On error returns -1, stores the
 * offending line number in *elinenum (if non-NULL) and a message in
 * emsg[elen] (if emsg is non-NULL).  Returns 0 on success.
 */
int cpuset_import(struct cpuset *cp, const char *buf, int *elinenum,
		  char *emsg, int elen)
{
	char *linebuf = NULL;
	int linebuflen;
	int linenum = 0;
	int offset = 0;	/* read position in buf, advanced by slgets() */

	/* One scratch line can never exceed the whole input's length. */
	linebuflen = strlen(buf) + 1;
	if ((linebuf = malloc(linebuflen)) == NULL) {
		if (emsg)
			snprintf(emsg, elen, "Insufficient memory");
		goto err;
	}

	while (slgets(linebuf, linebuflen, buf, &offset)) {
		char *tok, *arg;
		char *ptr;	/* for strtok_r */

		linenum++;
		/* Strip trailing '#' comment, then grab the keyword token. */
		if ((tok = strchr(linebuf, '#')) != NULL)
			*tok = 0;
		if ((tok = strtok_r(linebuf, " \t", &ptr)) == NULL)
			continue;
		/* Keywords are matched case-insensitively. */
		stolower(tok);

		arg = strtok_r(0, " \t", &ptr);

		/* Boolean flags: presence of the keyword sets the flag. */
		if (streq(tok, "cpu_exclusive")) {
			cp->cpu_exclusive = 1;
			goto eol;
		}
		if (streq(tok, "mem_exclusive")) {
			cp->mem_exclusive = 1;
			goto eol;
		}
		if (streq(tok, "notify_on_release")) {
			cp->notify_on_release = 1;
			goto eol;
		}
		if (streq(tok, "memory_pressure_enabled")) {
			cp->memory_pressure_enabled = 1;
			goto eol;
		}
		if (streq(tok, "memory_migrate")) {
			cp->memory_migrate = 1;
			goto eol;
		}
		if (streq(tok, "memory_spread_page")) {
			cp->memory_spread_page = 1;
			goto eol;
		}
		if (streq(tok, "memory_spread_slab")) {
			cp->memory_spread_slab = 1;
			goto eol;
		}
		/* List-valued keywords: "cpus 0-3" / "mems 0,1" etc. */
		if (streq(tok, "cpu") || streq(tok, "cpus")) {
			if (import_list(tok, arg, cp->cpus, emsg, elen) < 0)
				goto err;
			goto eol;
		}
		if (streq(tok, "mem") || streq(tok, "mems")) {
			if (import_list(tok, arg, cp->mems, emsg, elen) < 0)
				goto err;
			goto eol;
		}
		if (emsg)
			snprintf(emsg, elen, "Unrecognized token: '%s'", tok);
		goto err;
eol:
		/* Nothing may follow a recognized keyword (and its arg). */
		if ((tok = strtok_r(0, " \t", &ptr)) != NULL) {
			if (emsg)
				snprintf(emsg, elen, "Surplus token: '%s'",
					 tok);
			goto err;
		}
		continue;
	}

	free(linebuf);

	/* If only one of cpus/mems was specified, derive the other via
	 * cpuset_localcpus()/cpuset_localmems(). */
	if (bitmask_isallclear(cp->cpus) && !bitmask_isallclear(cp->mems))
		cpuset_localcpus(cp->mems, cp->cpus);
	else if (!bitmask_isallclear(cp->cpus) && bitmask_isallclear(cp->mems))
		cpuset_localmems(cp->cpus, cp->mems);

	/*
	 * All cpuset attributes are determined in an import.
	 * Those that aren't explicitly specified are presumed
	 * to be unchanged (zero, if it's a freshly allocated
	 * struct cpuset.)
	 */

	cp->cpus_valid = 1;
	cp->mems_valid = 1;
	cp->cpu_exclusive_valid = 1;
	cp->mem_exclusive_valid = 1;
	cp->notify_on_release_valid = 1;
	cp->memory_migrate_valid = 1;
	cp->memory_pressure_enabled_valid = 1;
	cp->memory_spread_page_valid = 1;
	cp->memory_spread_slab_valid = 1;

	return 0;
err:
	if (elinenum)
		*elinenum = linenum;
	free(linebuf);
	return -1;
}
3655 
3656 /* Pin current task CPU (and memory) */
cpuset_pin(int relcpu)3657 int cpuset_pin(int relcpu)
3658 {
3659 	struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
3660 	int cpu, r;
3661 
3662 	if (check() < 0)
3663 		return -1;
3664 
3665 	do {
3666 		cpuset_free_placement(plc1);
3667 		plc1 = cpuset_get_placement(0);
3668 
3669 		r = 0;
3670 		if (cpuset_unpin() < 0)
3671 			r = -1;
3672 		cpu = cpuset_p_rel_to_sys_cpu(0, relcpu);
3673 		if (cpuset_cpubind(cpu) < 0)
3674 			r = -1;
3675 
3676 		cpuset_free_placement(plc2);
3677 		plc2 = cpuset_get_placement(0);
3678 	} while (!cpuset_equal_placement(plc1, plc2));
3679 
3680 	cpuset_free_placement(plc1);
3681 	cpuset_free_placement(plc2);
3682 	return r;
3683 }
3684 
3685 /* Return number CPUs in current tasks cpuset */
cpuset_size()3686 int cpuset_size()
3687 {
3688 	struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
3689 	int r;
3690 
3691 	if (check() < 0)
3692 		return -1;
3693 
3694 	do {
3695 		cpuset_free_placement(plc1);
3696 		plc1 = cpuset_get_placement(0);
3697 
3698 		r = cpuset_cpus_weight(0);
3699 
3700 		cpuset_free_placement(plc2);
3701 		plc2 = cpuset_get_placement(0);
3702 	} while (!cpuset_equal_placement(plc1, plc2));
3703 
3704 	cpuset_free_placement(plc1);
3705 	cpuset_free_placement(plc2);
3706 	return r;
3707 }
3708 
3709 /* Return relative CPU number, within current cpuset, last executed on */
cpuset_where()3710 int cpuset_where()
3711 {
3712 	struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
3713 	int r;
3714 
3715 	if (check() < 0)
3716 		return -1;
3717 
3718 	do {
3719 		cpuset_free_placement(plc1);
3720 		plc1 = cpuset_get_placement(0);
3721 
3722 		r = cpuset_p_sys_to_rel_cpu(0, cpuset_latestcpu(0));
3723 
3724 		cpuset_free_placement(plc2);
3725 		plc2 = cpuset_get_placement(0);
3726 	} while (!cpuset_equal_placement(plc1, plc2));
3727 
3728 	cpuset_free_placement(plc1);
3729 	cpuset_free_placement(plc2);
3730 	return r;
3731 }
3732 
/* Undo cpuset_pin - let current task have the run of all CPUs in its cpuset */
int cpuset_unpin()
{
	struct bitmask *cpus = NULL, *mems = NULL;
	int r = -1;

	if (check() < 0)
		goto err;

	/*
	 * Don't need cpuset_*_placement() guard against concurrent
	 * cpuset migration, because none of the following depends
	 * on the tasks cpuset placement.
	 */

	/* Request affinity to every possible CPU; presumably the kernel
	 * restricts this to the task's cpuset -- see sched_setaffinity(2). */
	if ((cpus = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
		goto err;
	bitmask_setall(cpus);
	if (sched_setaffinity(0, bitmask_nbytes(cpus), bitmask_mask(cpus)) < 0)
		goto err;

	/* Revert to the default memory policy (the mems mask is passed
	 * freshly allocated, without setting any bits). */
	if ((mems = bitmask_alloc(cpuset_mems_nbits())) == NULL)
		goto err;
#if HAVE_DECL_MPOL_DEFAULT
	if (set_mempolicy(MPOL_DEFAULT, bitmask_mask(mems),
			  bitmask_nbits(mems) + 1) < 0)
		goto err;
	r = 0;
#endif
	/* Without MPOL_DEFAULT support, r stays -1. */
	/* fall into ... */
err:
	bitmask_free(cpus);
	bitmask_free(mems);
	return r;

}
3769 
3770 struct cpuset_function_list {
3771 	const char *fname;
3772 	void *func;
3773 } flist[] = {
3774 	{
3775 	"cpuset_version", cpuset_version}, {
3776 	"cpuset_alloc", cpuset_alloc}, {
3777 	"cpuset_free", cpuset_free}, {
3778 	"cpuset_cpus_nbits", cpuset_cpus_nbits}, {
3779 	"cpuset_mems_nbits", cpuset_mems_nbits}, {
3780 	"cpuset_setcpus", cpuset_setcpus}, {
3781 	"cpuset_setmems", cpuset_setmems}, {
3782 	"cpuset_set_iopt", cpuset_set_iopt}, {
3783 	"cpuset_set_sopt", cpuset_set_sopt}, {
3784 	"cpuset_getcpus", cpuset_getcpus}, {
3785 	"cpuset_getmems", cpuset_getmems}, {
3786 	"cpuset_cpus_weight", cpuset_cpus_weight}, {
3787 	"cpuset_mems_weight", cpuset_mems_weight}, {
3788 	"cpuset_get_iopt", cpuset_get_iopt}, {
3789 	"cpuset_get_sopt", cpuset_get_sopt}, {
3790 	"cpuset_localcpus", cpuset_localcpus}, {
3791 	"cpuset_localmems", cpuset_localmems}, {
3792 	"cpuset_cpumemdist", cpuset_cpumemdist}, {
3793 	"cpuset_cpu2node", cpuset_cpu2node}, {
3794 	"cpuset_addr2node", cpuset_addr2node}, {
3795 	"cpuset_create", cpuset_create}, {
3796 	"cpuset_delete", cpuset_delete}, {
3797 	"cpuset_query", cpuset_query}, {
3798 	"cpuset_modify", cpuset_modify}, {
3799 	"cpuset_getcpusetpath", cpuset_getcpusetpath}, {
3800 	"cpuset_cpusetofpid", cpuset_cpusetofpid}, {
3801 	"cpuset_mountpoint", cpuset_mountpoint}, {
3802 	"cpuset_collides_exclusive", cpuset_collides_exclusive}, {
3803 	"cpuset_nuke", cpuset_nuke}, {
3804 	"cpuset_init_pidlist", cpuset_init_pidlist}, {
3805 	"cpuset_pidlist_length", cpuset_pidlist_length}, {
3806 	"cpuset_get_pidlist", cpuset_get_pidlist}, {
3807 	"cpuset_freepidlist", cpuset_freepidlist}, {
3808 	"cpuset_move", cpuset_move}, {
3809 	"cpuset_move_all", cpuset_move_all}, {
3810 	"cpuset_move_cpuset_tasks", cpuset_move_cpuset_tasks}, {
3811 	"cpuset_migrate", cpuset_migrate}, {
3812 	"cpuset_migrate_all", cpuset_migrate_all}, {
3813 	"cpuset_reattach", cpuset_reattach}, {
3814 	"cpuset_open_memory_pressure", cpuset_open_memory_pressure}, {
3815 	"cpuset_read_memory_pressure", cpuset_read_memory_pressure}, {
3816 	"cpuset_close_memory_pressure", cpuset_close_memory_pressure}, {
3817 	"cpuset_c_rel_to_sys_cpu", cpuset_c_rel_to_sys_cpu}, {
3818 	"cpuset_c_sys_to_rel_cpu", cpuset_c_sys_to_rel_cpu}, {
3819 	"cpuset_c_rel_to_sys_mem", cpuset_c_rel_to_sys_mem}, {
3820 	"cpuset_c_sys_to_rel_mem", cpuset_c_sys_to_rel_mem}, {
3821 	"cpuset_p_rel_to_sys_cpu", cpuset_p_rel_to_sys_cpu}, {
3822 	"cpuset_p_sys_to_rel_cpu", cpuset_p_sys_to_rel_cpu}, {
3823 	"cpuset_p_rel_to_sys_mem", cpuset_p_rel_to_sys_mem}, {
3824 	"cpuset_p_sys_to_rel_mem", cpuset_p_sys_to_rel_mem}, {
3825 	"cpuset_get_placement", cpuset_get_placement}, {
3826 	"cpuset_equal_placement", cpuset_equal_placement}, {
3827 	"cpuset_free_placement", cpuset_free_placement}, {
3828 	"cpuset_fts_open", cpuset_fts_open}, {
3829 	"cpuset_fts_read", cpuset_fts_read}, {
3830 	"cpuset_fts_reverse", cpuset_fts_reverse}, {
3831 	"cpuset_fts_rewind", cpuset_fts_rewind}, {
3832 	"cpuset_fts_get_path", cpuset_fts_get_path}, {
3833 	"cpuset_fts_get_stat", cpuset_fts_get_stat}, {
3834 	"cpuset_fts_get_cpuset", cpuset_fts_get_cpuset}, {
3835 	"cpuset_fts_get_errno", cpuset_fts_get_errno}, {
3836 	"cpuset_fts_get_info", cpuset_fts_get_info}, {
3837 	"cpuset_fts_close", cpuset_fts_close}, {
3838 	"cpuset_cpubind", cpuset_cpubind}, {
3839 	"cpuset_latestcpu", cpuset_latestcpu}, {
3840 	"cpuset_membind", cpuset_membind}, {
3841 	"cpuset_export", cpuset_export}, {
3842 	"cpuset_import", cpuset_import}, {
3843 	"cpuset_function", cpuset_function}, {
3844 	"cpuset_pin", cpuset_pin}, {
3845 	"cpuset_size", cpuset_size}, {
3846 	"cpuset_where", cpuset_where}, {
3847 "cpuset_unpin", cpuset_unpin},};
3848 
3849 /* Return pointer to a libcpuset.so function, or NULL */
cpuset_function(const char * function_name)3850 void *cpuset_function(const char *function_name)
3851 {
3852 	unsigned int i;
3853 
3854 	for (i = 0; i < sizeof(flist) / sizeof(flist[0]); i++)
3855 		if (streq(function_name, flist[i].fname))
3856 			return flist[i].func;
3857 	return NULL;
3858 }
3859 
/* Fortran interface to basic cpuset routines */
/* Fortran-callable wrapper: Fortran passes arguments by reference. */
int cpuset_pin_(int *ptr_relcpu)
{
	return cpuset_pin(*ptr_relcpu);
}
3865 
/* Fortran-callable wrapper for cpuset_size(). */
int cpuset_size_(void)
{
	return cpuset_size();
}
3870 
/* Fortran-callable wrapper for cpuset_where(). */
int cpuset_where_(void)
{
	return cpuset_where();
}
3875 
/* Fortran-callable wrapper for cpuset_unpin(). */
int cpuset_unpin_(void)
{
	return cpuset_unpin();
}
3880 
3881 #endif /* HAVE_LINUX_MEMPOLICY_H */
3882