/*
 * cpuset user library implementation.
 *
 * Copyright (c) 2006-2007 Silicon Graphics, Inc. All rights reserved.
 *
 * Paul Jackson <pj@sgi.com>
 */

/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#define _XOPEN_SOURCE 500       /* need to see pread() */
#define _BSD_SOURCE 1           /* need to see syscall() */
#include <unistd.h>

#include <ctype.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <fts.h>
#include <limits.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <time.h>
#include <utime.h>
#include <sys/utsname.h>        /* for cpuset_would_crash_kernel() */

#include "bitmask.h"
#include "cpuset.h"
#include "common.h"
#include "test.h"
#include "linux_syscall_numbers.h"
#include "config.h"
#if HAVE_LINUX_MEMPOLICY_H
#include <linux/mempolicy.h>

/* Bump version, and update Change History, when libcpuset API changes */
#define CPUSET_VERSION 3

/*
 * For a history of what changed in each version, see the "Change
 * History" section, at the end of the libcpuset master document.
 */

int cpuset_version(void)
{
        return CPUSET_VERSION;
}

struct cpuset {
        struct bitmask *cpus;
        struct bitmask *mems;
        char cpu_exclusive;
        char mem_exclusive;
        char mem_hardwall;
        char notify_on_release;
        char memory_migrate;
        char memory_pressure_enabled;
        char memory_spread_page;
        char memory_spread_slab;
        char sched_load_balance;
        int sched_relax_domain_level;

        /*
         * Each field 'x' above gets an 'x_valid' field below.
         * The apply_cpuset_settings() routine will only set those
         * fields whose corresponding *_valid flags are set.  The
         * cpuset_alloc() routine clears these flags as part of the
         * clear in calloc(), and the various cpuset_set*() routines
         * set these flags when setting the corresponding value.
         *
         * The purpose of these valid fields is to ensure that when
         * we create a new cpuset, we don't accidentally overwrite
         * some non-zero kernel default, such as an inherited
         * memory_spread_* flag, just because the user application
         * code didn't override the default zero settings resulting
         * from the calloc() call in cpuset_alloc().
         *
         * The choice of 'char' for the type of the flags above,
         * but a bitfield for the flags below, is somewhat capricious.
         */
        unsigned cpus_valid:1;
        unsigned mems_valid:1;
        unsigned cpu_exclusive_valid:1;
        unsigned mem_exclusive_valid:1;
        unsigned mem_hardwall_valid:1;
        unsigned notify_on_release_valid:1;
        unsigned memory_migrate_valid:1;
        unsigned memory_pressure_enabled_valid:1;
        unsigned memory_spread_page_valid:1;
        unsigned memory_spread_slab_valid:1;
        unsigned sched_load_balance_valid:1;
        unsigned sched_relax_domain_level_valid:1;

        /*
         * If a field above was modified, the corresponding '*_dirty'
         * flag below marks it as needing to be written back to the
         * kernel.
         */
        unsigned cpus_dirty:1;
        unsigned mems_dirty:1;
        unsigned cpu_exclusive_dirty:1;
        unsigned mem_exclusive_dirty:1;
        unsigned mem_hardwall_dirty:1;
        unsigned notify_on_release_dirty:1;
        unsigned memory_migrate_dirty:1;
        unsigned memory_pressure_enabled_dirty:1;
        unsigned memory_spread_page_dirty:1;
        unsigned memory_spread_slab_dirty:1;
        unsigned sched_load_balance_dirty:1;
        unsigned sched_relax_domain_level_dirty:1;
};

/* Presumed cpuset file system mount point */
static const char *cpusetmnt = "/dev/cpuset";

/* Stashed copy of cpunodemap[], mapping each cpu to its node. */
static const char *mapfile = "/var/run/cpunodemap";

/* The primary source for the cpunodemap[] is available below here. */
static const char *sysdevices = "/sys/devices/system";

#define max(a,b) ((a) > (b) ? (a) : (b))
#define min(a,b) ((a) < (b) ? (a) : (b))

/* small buffer size - for reading boolean flags or map file (1 or 2 ints) */
#define SMALL_BUFSZ 16

/*
 * The 'mask_size_file' is used to ferret out the kernel cpumask_t
 * and nodemask_t sizes.  The lines in this file that begin with the
 * strings 'cpumask_prefix' and 'nodemask_prefix' display a cpumask
 * and nodemask string, respectively.  The lengths of these strings
 * reflect the kernel's internal cpumask_t and nodemask_t sizes,
 * which sizes are needed to correctly call the sched_setaffinity
 * and set_mempolicy system calls, and to size user level
 * bitmasks to match the kernel's.
 */

static const char *mask_size_file = "/proc/self/status";
static const char *cpumask_prefix = "Cpus_allowed:\t";
static const char *nodemask_prefix = "Mems_allowed:\t";

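/*
 * Illustrative sketch (not part of the library): on a kernel built
 * with 64-bit cpumask_t and nodemask_t, the two lines of interest
 * in /proc/self/status might look like (values hypothetical):
 *
 *      Cpus_allowed:   00000000,0000000f
 *      Mems_allowed:   00000000,00000003
 *
 * Each 32-bit word is printed as 8 hex digits, and each word is
 * followed by a comma or a newline, so every 32 bits of mask costs
 * 9 characters of string.
 */
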
/*
 * Sizes of kernel cpumask_t and nodemask_t bitmaps, in bits.
 *
 * The first time we need these, we parse the Cpus_allowed and
 * Mems_allowed lines from mask_size_file ("/proc/self/status").
 */

static int cpumask_sz;
static int nodemask_sz;

/*
 * These defaults only kick in if we fail to size the kernel
 * cpumask and nodemask by reading the Cpus_allowed and
 * Mems_allowed fields from the /proc/self/status file.
 */

#define DEFCPUBITS (512)
#define DEFNODEBITS (DEFCPUBITS/2)

/*
 * Arch-neutral API for obtaining NUMA distances between CPUs
 * and Memory Nodes, via the files:
 *      /sys/devices/system/node/nodeN/distance
 * which have lines such as:
 *      46 66 10 20
 * which say that for a cpu on node N (from the path above), the
 * distances to nodes 0, 1, 2, and 3 are 46, 66, 10, and 20,
 * respectively.
 */

static const char *distance_directory = "/sys/devices/system/node";

/*
 * Someday, we should disable, then later discard, the SN code
 * marked ALTERNATE_SN_DISTMAP.
 */

#define ALTERNATE_SN_DISTMAP 1
#ifdef ALTERNATE_SN_DISTMAP

/*
 * Alternative SN (SGI ia64) architecture specific API for obtaining
 * NUMA distances between CPUs and Memory Nodes is via the file
 * /proc/sgi_sn/sn_topology, which has lines such as:
 *
 *      node 2 001c14#0 local asic SHub_1.1, nasid 0x4, dist 46:66:10:20
 *
 * which says that for each CPU on node 2, the distances to nodes
 * 0, 1, 2 and 3 are 46, 66, 10 and 20, respectively.
 *
 * This file has other lines as well, which start with keywords
 * other than "node".  Ignore these other lines.
 */

static const char *sn_topology = "/proc/sgi_sn/sn_topology";
static const char *sn_top_node_prefix = "node ";

#endif

/*
 * Check that cpusets are supported and /dev/cpuset is mounted.
 * If ok, return 0.
 * If not, return -1 and set errno:
 *      ENOSYS - kernel doesn't support cpusets
 *      ENODEV - /dev/cpuset not mounted
 */

static enum {
        check_notdone,
        check_enosys,
        check_enodev,
        check_ok
} check_state = check_notdone;

static int check()
{
        if (check_state == check_notdone) {
                struct stat statbuf;

                if (stat("/proc/self/cpuset", &statbuf) < 0) {
                        check_state = check_enosys;
                        goto done;
                }

                if (stat("/dev/cpuset/tasks", &statbuf) < 0) {
                        check_state = check_enodev;
                        goto done;
                }

                check_state = check_ok;
        }
done:
        switch (check_state) {
        case check_enosys:
                errno = ENOSYS;
                return -1;
        case check_enodev:
                errno = ENODEV;
                return -1;
        default:
                break;
        }
        return 0;
}

static void chomp(char *s)
{
        char *t;

        for (t = s + strlen(s) - 1; t >= s; t--) {
                if (*t == '\n' || *t == '\r')
                        *t = '\0';
                else
                        break;
        }
}

/*
 * Determine the number of bytes in a seekable open file, without
 * assuming that stat(2) on that file has a useful size.
 * Has the side effect of leaving the file rewound to the beginning.
 */
static int filesize(FILE *fp)
{
        int sz = 0;
        rewind(fp);
        while (fgetc(fp) != EOF)
                sz++;
        rewind(fp);
        return sz;
}

/* Are strings s1 and s2 equal? */
static int streq(const char *s1, const char *s2)
{
        return strcmp(s1, s2) == 0;
}

/* Is string 'pre' a prefix of string 's'? */
static int strprefix(const char *s, const char *pre)
{
        return strncmp(s, pre, strlen(pre)) == 0;
}

/*
 * char *flgets(char *buf, int buflen, FILE *fp)
 *
 * Obtain one line from input file fp.  Copy up to the first
 * buflen - 1 chars of the line into buffer buf, discarding any
 * remainder of the line.  Stop reading at newline, discarding the
 * newline.  Nul terminate the result and return a pointer to
 * buffer buf on success, or NULL on failure or if there is
 * nothing more to read.
 */

static char *flgets(char *buf, int buflen, FILE *fp)
{
        int c = -1;
        char *bp;

        bp = buf;
        while ((--buflen > 0) && ((c = getc(fp)) >= 0)) {
                if (c == '\n')
                        goto newline;
                *bp++ = c;
        }
        if ((c < 0) && (bp == buf))
                return NULL;

        if (c > 0) {
                while ((c = getc(fp)) >= 0) {
                        if (c == '\n')
                                break;
                }
        }

newline:
        *bp++ = '\0';
        return buf;
}

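/*
 * Illustrative sketch (not part of the library): flgets() is used
 * below wherever one line at a time is wanted without trusting the
 * line length, for example:
 *
 *      char buf[32];
 *      FILE *fp = fopen("/proc/cpuinfo", "r");
 *
 *      if (fp != NULL) {
 *              while (flgets(buf, sizeof(buf), fp) != NULL)
 *                      ... ;   (each line, truncated to 31 chars)
 *              fclose(fp);
 *      }
 */
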
/*
 * sgetc(const char *inputbuf, int *offsetptr)
 *
 * Return next char from nul-terminated input buffer inputbuf,
 * starting at offset *offsetptr.  Increment *offsetptr.
 * If the next char would be nul ('\0'), return EOF and don't
 * increment *offsetptr.
 */

static int sgetc(const char *inputbuf, int *offsetptr)
{
        char c;

        if ((c = inputbuf[*offsetptr]) != 0) {
                *offsetptr = *offsetptr + 1;
                return c;
        } else {
                return EOF;
        }
}

/*
 * char *slgets(char *buf, int buflen, const char *inputbuf, int *offsetptr)
 *
 * Obtain the next line from nul-terminated input buffer 'inputbuf',
 * starting at offset *offsetptr.  Copy up to the first buflen - 1
 * chars of the line into output buffer buf, discarding any remainder
 * of the line.  Stop reading at newline, discarding the newline.
 * Nul terminate the result and return a pointer to output buffer
 * buf on success, or NULL if there is nothing more to read.
 */

static char *slgets(char *buf, int buflen, const char *inputbuf, int *offsetptr)
{
        int c = -1;
        char *bp;

        bp = buf;
        while ((--buflen > 0) && ((c = sgetc(inputbuf, offsetptr)) >= 0)) {
                if (c == '\n')
                        goto newline;
                *bp++ = c;
        }
        if ((c < 0) && (bp == buf))
                return NULL;

        if (c > 0) {
                while ((c = sgetc(inputbuf, offsetptr)) >= 0) {
                        if (c == '\n')
                                break;
                }
        }

newline:
        *bp++ = '\0';
        return buf;
}

/*
 * time_t get_mtime(const char *path)
 *
 * Return modtime of file at location path, else return 0.
 */

static time_t get_mtime(const char *path)
{
        struct stat statbuf;

        if (stat(path, &statbuf) != 0)
                return 0;
        return statbuf.st_mtime;
}

/*
 * int set_mtime(const char *path, time_t mtime)
 *
 * Set modtime of file 'path' to 'mtime'.  Return 0 on success,
 * or -1 on error, setting errno.
 */

static int set_mtime(const char *path, time_t mtime)
{
        struct utimbuf times;

        times.actime = mtime;
        times.modtime = mtime;
        return utime(path, &times);
}

/*
 * True if two pathnames resolve to the same file.
 * False if either path cannot be stat'd,
 * or if the two paths resolve to different files.
 */

static int samefile(const char *path1, const char *path2)
{
        struct stat sb1, sb2;

        if (stat(path1, &sb1) != 0)
                return 0;
        if (stat(path2, &sb2) != 0)
                return 0;
        return sb1.st_ino == sb2.st_ino && sb1.st_dev == sb2.st_dev;
}

#define slash(c) (*(c) == '/')
#define eocomp(c) (slash(c) || !*(c))
#define dot1(c) (*(c) == '.' && eocomp(c+1))

/* In place path compression.  Remove extra dots and slashes. */
static char *pathcomp(char *p)
{
        char *a = p;
        char *b = p;

        if (!p || !*p)
                return p;
        if (slash(p))
                *b++ = *a++;
        for (;;) {
                if (slash(a))
                        while (slash(++a))
                                continue;
                if (!*a) {
                        if (b == p)
                                *b++ = '.';
                        *b = '\0';
                        return (p);
                } else if (dot1(a)) {
                        a++;
                } else {
                        if ((b != p) && !slash(b - 1))
                                *b++ = '/';
                        while (!eocomp(a))
                                *b++ = *a++;
                }
        }
}

#undef slash
#undef eocomp
#undef dot1

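/*
 * Illustrative sketch (not part of the library): pathcomp()
 * rewrites its argument in place, for example:
 *
 *      char path[] = "/dev//cpuset/./alpha/";
 *
 *      pathcomp(path);         (path is now "/dev/cpuset/alpha")
 *
 * An empty or all-dot relative path collapses to ".".
 */
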
/*
 * pathcat2(buf, buflen, name1, name2)
 *
 * Return buf, of length buflen, with name1/name2 stored in it.
 */

static char *pathcat2(char *buf, int buflen, const char *name1,
                      const char *name2)
{
        (void)snprintf(buf, buflen, "%s/%s", name1, name2);
        return pathcomp(buf);
}

/*
 * pathcat3(buf, buflen, name1, name2, name3)
 *
 * Return buf, of length buflen, with name1/name2/name3 stored in it.
 */

static char *pathcat3(char *buf, int buflen, const char *name1,
                      const char *name2, const char *name3)
{
        (void)snprintf(buf, buflen, "%s/%s/%s", name1, name2, name3);
        return pathcomp(buf);
}

/*
 * fullpath(buf, buflen, name)
 *
 * Put the full path of cpuset 'name' in buffer 'buf'.  If 'name'
 * starts with a slash (``/``) character, then this is a path
 * relative to ``/dev/cpuset``; otherwise it is relative to
 * the current task's cpuset.  Return 0 on success, else
 * -1 on error, setting errno.
 */

static int fullpath(char *buf, int buflen, const char *name)
{
        int len;

        /* easy case */
        if (*name == '/') {
                pathcat2(buf, buflen, cpusetmnt, name);
                pathcomp(buf);
                return 0;
        }

        /* hard case */
        snprintf(buf, buflen, "%s/", cpusetmnt);
        len = strlen(buf);
        if (cpuset_getcpusetpath(0, buf + len, buflen - len) == NULL)
                return -1;
        if (strlen(buf) >= buflen - 1 - strlen(name)) {
                errno = E2BIG;
                return -1;
        }
        strcat(buf, "/");
        strcat(buf, name);
        pathcomp(buf);
        return 0;
}

/*
 * fullpath2(buf, buflen, name1, name2)
 *
 * Like fullpath(), only concatenate two pathname components on end.
 */

static int fullpath2(char *buf, int buflen, const char *name1,
                     const char *name2)
{
        if (fullpath(buf, buflen, name1) < 0)
                return -1;
        if (strlen(buf) >= buflen - 1 - strlen(name2)) {
                errno = E2BIG;
                return -1;
        }
        strcat(buf, "/");
        strcat(buf, name2);
        pathcomp(buf);
        return 0;
}

/*
 * Convert the string length of an ascii hex mask to the number
 * of bits represented by that mask.
 *
 * The cpumask and nodemask values in /proc/self/status are in an
 * ascii format that uses 9 characters for each 32 bits of mask.
 */
static int s2nbits(const char *s)
{
        return strlen(s) * 32 / 9;
}

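/*
 * Illustrative sketch (not part of the library): the mask string
 * "00000000,0000000f\n" is 18 characters long (8 hex digits per
 * 32-bit word, plus one comma or newline per word), so s2nbits()
 * computes 18 * 32 / 9 == 64 bits.  This is why the trailing
 * newline left by fgets() matters in update_mask_sizes() below.
 */
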
static void update_mask_sizes()
{
        FILE *fp = NULL;
        char *buf = NULL;
        int fsize;

        if ((fp = fopen(mask_size_file, "r")) == NULL)
                goto done;
        fsize = filesize(fp);
        if ((buf = malloc(fsize)) == NULL)
                goto done;

        /*
         * Beware: mask sizing arithmetic is fussy.
         * The trailing newline left by fgets() is required.
         */
        while (fgets(buf, fsize, fp)) {
                if (strprefix(buf, cpumask_prefix))
                        cpumask_sz = s2nbits(buf + strlen(cpumask_prefix));
                if (strprefix(buf, nodemask_prefix))
                        nodemask_sz = s2nbits(buf + strlen(nodemask_prefix));
        }
done:
        free(buf);
        if (fp != NULL)
                fclose(fp);
        if (cpumask_sz == 0)
                cpumask_sz = DEFCPUBITS;
        if (nodemask_sz == 0)
                nodemask_sz = DEFNODEBITS;
}

/* Allocate a new struct cpuset */
struct cpuset *cpuset_alloc()
{
        struct cpuset *cp = NULL;
        int nbits;

        if ((cp = calloc(1, sizeof(struct cpuset))) == NULL)
                goto err;

        nbits = cpuset_cpus_nbits();
        if ((cp->cpus = bitmask_alloc(nbits)) == NULL)
                goto err;

        nbits = cpuset_mems_nbits();
        if ((cp->mems = bitmask_alloc(nbits)) == NULL)
                goto err;

        return cp;
err:
        if (cp && cp->cpus)
                bitmask_free(cp->cpus);
        if (cp && cp->mems)
                bitmask_free(cp->mems);
        free(cp);
        return NULL;
}

/* Free struct cpuset *cp */
void cpuset_free(struct cpuset *cp)
{
        if (!cp)
                return;
        if (cp->cpus)
                bitmask_free(cp->cpus);
        if (cp->mems)
                bitmask_free(cp->mems);
        free(cp);
}

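/*
 * Illustrative sketch (not part of the library): the typical
 * allocate/use/free pattern for a struct cpuset:
 *
 *      struct cpuset *cp = cpuset_alloc();
 *
 *      if (cp == NULL)
 *              return -1;      (out of memory)
 *      ... call cpuset_set*() and query routines on cp ...
 *      cpuset_free(cp);
 */
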
/* Number of bits in a CPU bitmask on current system */
int cpuset_cpus_nbits()
{
        if (cpumask_sz == 0)
                update_mask_sizes();
        return cpumask_sz;
}

/* Number of bits in a Memory bitmask on current system */
int cpuset_mems_nbits()
{
        if (nodemask_sz == 0)
                update_mask_sizes();
        return nodemask_sz;
}

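/*
 * Illustrative sketch (not part of the library): these two sizes
 * are what callers should use when allocating bitmasks to pass to
 * the routines below, e.g.:
 *
 *      struct bitmask *cpus = bitmask_alloc(cpuset_cpus_nbits());
 *      struct bitmask *mems = bitmask_alloc(cpuset_mems_nbits());
 */
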
/* Set CPUs in cpuset cp to bitmask cpus */
int cpuset_setcpus(struct cpuset *cp, const struct bitmask *cpus)
{
        if (cp->cpus)
                bitmask_free(cp->cpus);
        cp->cpus = bitmask_alloc(bitmask_nbits(cpus));
        if (cp->cpus == NULL)
                return -1;
        bitmask_copy(cp->cpus, cpus);
        cp->cpus_valid = 1;
        cp->cpus_dirty = 1;
        return 0;
}

/* Set Memory Nodes in cpuset cp to bitmask mems */
int cpuset_setmems(struct cpuset *cp, const struct bitmask *mems)
{
        if (cp->mems)
                bitmask_free(cp->mems);
        cp->mems = bitmask_alloc(bitmask_nbits(mems));
        if (cp->mems == NULL)
                return -1;
        bitmask_copy(cp->mems, mems);
        cp->mems_valid = 1;
        cp->mems_dirty = 1;
        return 0;
}

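/*
 * Illustrative sketch (not part of the library): restrict a cpuset
 * to CPUs 0 and 1 (error handling omitted for brevity):
 *
 *      struct bitmask *cpus = bitmask_alloc(cpuset_cpus_nbits());
 *
 *      bitmask_clearall(cpus);
 *      bitmask_setbit(cpus, 0);
 *      bitmask_setbit(cpus, 1);
 *      cpuset_setcpus(cp, cpus);
 *      bitmask_free(cpus);
 */
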
/* Set integer value optname of cpuset cp */
int cpuset_set_iopt(struct cpuset *cp, const char *optionname, int value)
{
        if (streq(optionname, "cpu_exclusive")) {
                cp->cpu_exclusive = !!value;
                cp->cpu_exclusive_valid = 1;
                cp->cpu_exclusive_dirty = 1;
        } else if (streq(optionname, "mem_exclusive")) {
                cp->mem_exclusive = !!value;
                cp->mem_exclusive_valid = 1;
                cp->mem_exclusive_dirty = 1;
        } else if (streq(optionname, "mem_hardwall")) {
                cp->mem_hardwall = !!value;
                cp->mem_hardwall_valid = 1;
                cp->mem_hardwall_dirty = 1;
        } else if (streq(optionname, "notify_on_release")) {
                cp->notify_on_release = !!value;
                cp->notify_on_release_valid = 1;
                cp->notify_on_release_dirty = 1;
        } else if (streq(optionname, "memory_pressure_enabled")) {
                cp->memory_pressure_enabled = !!value;
                cp->memory_pressure_enabled_valid = 1;
                cp->memory_pressure_enabled_dirty = 1;
        } else if (streq(optionname, "memory_migrate")) {
                cp->memory_migrate = !!value;
                cp->memory_migrate_valid = 1;
                cp->memory_migrate_dirty = 1;
        } else if (streq(optionname, "memory_spread_page")) {
                cp->memory_spread_page = !!value;
                cp->memory_spread_page_valid = 1;
                cp->memory_spread_page_dirty = 1;
        } else if (streq(optionname, "memory_spread_slab")) {
                cp->memory_spread_slab = !!value;
                cp->memory_spread_slab_valid = 1;
                cp->memory_spread_slab_dirty = 1;
        } else if (streq(optionname, "sched_load_balance")) {
                cp->sched_load_balance = !!value;
                cp->sched_load_balance_valid = 1;
                cp->sched_load_balance_dirty = 1;
        } else if (streq(optionname, "sched_relax_domain_level")) {
                cp->sched_relax_domain_level = value;
                cp->sched_relax_domain_level_valid = 1;
                cp->sched_relax_domain_level_dirty = 1;
        } else
                return -2;      /* optionname not recognized */
        return 0;
}

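/*
 * Illustrative sketch (not part of the library): the boolean
 * options accept any non-zero value as "1"; only
 * sched_relax_domain_level is a true integer:
 *
 *      cpuset_set_iopt(cp, "memory_migrate", 1);
 *      cpuset_set_iopt(cp, "sched_relax_domain_level", -1);
 *
 * A return of -2 means the option name was not recognized.
 */
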
/* [optional] Set string value optname */
int cpuset_set_sopt(UNUSED struct cpuset *cp, UNUSED const char *optionname,
                    UNUSED const char *value)
{
        return -2;              /* For now, all string options unrecognized */
}

/* Return handle for reading memory_pressure. */
int cpuset_open_memory_pressure(const char *cpusetpath)
{
        char buf[PATH_MAX];

        fullpath2(buf, sizeof(buf), cpusetpath, "memory_pressure");
        return open(buf, O_RDONLY);
}

/* Return current memory_pressure of cpuset. */
int cpuset_read_memory_pressure(int han)
{
        char buf[SMALL_BUFSZ];

        if (pread(han, buf, sizeof(buf), 0L) < 0)
                return -1;
        return atoi(buf);
}

/* Close handle for reading memory pressure. */
void cpuset_close_memory_pressure(int han)
{
        close(han);
}

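/*
 * Illustrative sketch (not part of the library): sample the
 * memory_pressure of a hypothetical cpuset "/foo" once a second:
 *
 *      int i, han = cpuset_open_memory_pressure("/foo");
 *
 *      if (han >= 0) {
 *              for (i = 0; i < 10; i++) {
 *                      int p = cpuset_read_memory_pressure(han);
 *                      ... ;   (p < 0 on error)
 *                      sleep(1);
 *              }
 *              cpuset_close_memory_pressure(han);
 *      }
 *
 * The metric is only maintained while memory_pressure_enabled is
 * set in the top cpuset.
 */
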
/*
 * Resolve cpuset pointer (to that of current task if cp == NULL).
 *
 * If cp is not NULL, just return it.  If cp is NULL, return a
 * pointer to a temporary cpuset for the current task, and set
 * *cp_tofree to point to that same temporary cpuset, to be freed
 * later.
 *
 * Return NULL and set errno on error.  Errors can occur when
 * resolving the current task's cpuset.
 */
static const struct cpuset *resolve_cp(const struct cpuset *cp,
                                       struct cpuset **cp_tofree)
{
        const struct cpuset *rcp;

        if (cp) {
                rcp = cp;
        } else {
                struct cpuset *cp1 = cpuset_alloc();
                if (cp1 == NULL)
                        goto err;
                if (cpuset_cpusetofpid(cp1, 0) < 0) {
                        cpuset_free(cp1);
                        goto err;
                }
                *cp_tofree = cp1;
                rcp = cp1;
        }
        return rcp;
err:
        return NULL;
}

/* Write CPUs in cpuset cp (current task if cp == NULL) to bitmask cpus */
int cpuset_getcpus(const struct cpuset *cp, struct bitmask *cpus)
{
        struct cpuset *cp_tofree = NULL;
        const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);

        if (!cp1)
                goto err;
        if (cp1->cpus == NULL) {
                errno = EINVAL;
                goto err;
        }
        bitmask_copy(cpus, cp1->cpus);
        cpuset_free(cp_tofree);
        return 0;
err:
        cpuset_free(cp_tofree);
        return -1;
}

/* Write Memory Nodes in cp (current task if cp == NULL) to bitmask mems */
int cpuset_getmems(const struct cpuset *cp, struct bitmask *mems)
{
        struct cpuset *cp_tofree = NULL;
        const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);

        if (!cp1)
                goto err;
        if (cp1->mems == NULL) {
                errno = EINVAL;
                goto err;
        }
        bitmask_copy(mems, cp1->mems);
        cpuset_free(cp_tofree);
        return 0;
err:
        cpuset_free(cp_tofree);
        return -1;
}

/* Return number of CPUs in cpuset cp (current task if cp == NULL) */
int cpuset_cpus_weight(const struct cpuset *cp)
{
        struct cpuset *cp_tofree = NULL;
        const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
        int w = -1;

        if (!cp1)
                goto err;
        if (cp1->cpus == NULL) {
                errno = EINVAL;
                goto err;
        }
        w = bitmask_weight(cp1->cpus);
        /* fall into ... */
err:
        cpuset_free(cp_tofree);
        return w;
}

/* Return number of Memory Nodes in cpuset cp (current task if cp == NULL) */
int cpuset_mems_weight(const struct cpuset *cp)
{
        struct cpuset *cp_tofree = NULL;
        const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
        int w = -1;

        if (!cp1)
                goto err;
        if (cp1->mems == NULL) {
                errno = EINVAL;
                goto err;
        }
        w = bitmask_weight(cp1->mems);
        /* fall into ... */
err:
        cpuset_free(cp_tofree);
        return w;
}

/* Return integer value of option optname in cp */
int cpuset_get_iopt(const struct cpuset *cp, const char *optionname)
{
        if (streq(optionname, "cpu_exclusive"))
                return cp->cpu_exclusive;
        else if (streq(optionname, "mem_exclusive"))
                return cp->mem_exclusive;
        else if (streq(optionname, "mem_hardwall"))
                return cp->mem_hardwall;
        else if (streq(optionname, "notify_on_release"))
                return cp->notify_on_release;
        else if (streq(optionname, "memory_pressure_enabled"))
                return cp->memory_pressure_enabled;
        else if (streq(optionname, "memory_migrate"))
                return cp->memory_migrate;
        else if (streq(optionname, "memory_spread_page"))
                return cp->memory_spread_page;
        else if (streq(optionname, "memory_spread_slab"))
                return cp->memory_spread_slab;
        else if (streq(optionname, "sched_load_balance"))
                return cp->sched_load_balance;
        else if (streq(optionname, "sched_relax_domain_level"))
                return cp->sched_relax_domain_level;
        else
                return -2;      /* optionname not recognized */
}

/* [optional] Return string value of optname */
const char *cpuset_get_sopt(UNUSED const struct cpuset *cp,
                            UNUSED const char *optionname)
{
        return NULL;            /* For now, all string options unrecognized */
}

static int read_flag(const char *filepath, char *flagp)
{
        char buf[SMALL_BUFSZ];  /* buffer a "0" or "1" flag line */
        int fd = -1;

        if ((fd = open(filepath, O_RDONLY)) < 0)
                goto err;
        if (read(fd, buf, sizeof(buf)) < 1)
                goto err;
        if (atoi(buf))
                *flagp = 1;
        else
                *flagp = 0;
        close(fd);
        return 0;
err:
        if (fd >= 0)
                close(fd);
        return -1;
}

static int load_flag(const char *path, char *flagp, const char *flag)
{
        char buf[PATH_MAX];

        pathcat2(buf, sizeof(buf), path, flag);
        return read_flag(buf, flagp);
}

static int read_number(const char *filepath, int *numberp)
{
        char buf[SMALL_BUFSZ];
        int fd = -1;

        if ((fd = open(filepath, O_RDONLY)) < 0)
                goto err;
        if (read(fd, buf, sizeof(buf)) < 1)
                goto err;
        *numberp = atoi(buf);
        close(fd);
        return 0;
err:
        if (fd >= 0)
                close(fd);
        return -1;
}

static int load_number(const char *path, int *numberp, const char *file)
{
        char buf[PATH_MAX];

        pathcat2(buf, sizeof(buf), path, file);
        return read_number(buf, numberp);
}

static int read_mask(const char *filepath, struct bitmask **bmpp, int nbits)
{
        FILE *fp = NULL;
        char *buf = NULL;
        int buflen;
        struct bitmask *bmp = NULL;

        if ((fp = fopen(filepath, "r")) == NULL)
                goto err;
        buflen = filesize(fp) + 1;      /* + 1 for nul term */
        if ((buf = malloc(buflen)) == NULL)
                goto err;
        if (flgets(buf, buflen, fp) == NULL)
                goto err;
        fclose(fp);
        fp = NULL;

        if ((bmp = bitmask_alloc(nbits)) == NULL)
                goto err;
        if (*buf && bitmask_parselist(buf, bmp) < 0)
                goto err;
        if (*bmpp)
                bitmask_free(*bmpp);
        *bmpp = bmp;
        free(buf);
        buf = NULL;
        return 0;
err:
        if (buf != NULL)
                free(buf);
        if (fp != NULL)
                fclose(fp);
        if (bmp != NULL)
                bitmask_free(bmp);
        return -1;
}

static int load_mask(const char *path, struct bitmask **bmpp,
                     int nbits, const char *mask)
{
        char buf[PATH_MAX];

        pathcat2(buf, sizeof(buf), path, mask);
        return read_mask(buf, bmpp, nbits);
}

/* Write string to file at given filepath.  Create or truncate file. */
static int write_string_file(const char *filepath, const char *str)
{
        int fd = -1;

        if ((fd = open(filepath, O_WRONLY | O_CREAT, 0644)) < 0)
                goto err;
        if (write(fd, str, strlen(str)) < 0)
                goto err;
        close(fd);
        return 0;
err:
        if (fd >= 0)
                close(fd);
        return -1;
}

/* Size and allocate buffer.  Write bitmask into it.  Caller must free */
static char *sprint_mask_buf(const struct bitmask *bmp)
{
        char *buf = NULL;
        int buflen;
        char c;

        /* First bitmask_displaylist() call just to get the length */
        buflen = bitmask_displaylist(&c, 1, bmp) + 1;   /* "+ 1" for nul */
        if ((buf = malloc(buflen)) == NULL)
                return NULL;
        bitmask_displaylist(buf, buflen, bmp);
        return buf;
}

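/*
 * Illustrative sketch (not part of the library): the two-pass
 * sizing idiom above works because bitmask_displaylist() is
 * snprintf()-like - it returns the number of characters the full
 * list format needs, no matter how small the passed-in buffer is:
 *
 *      int len = bitmask_displaylist(&c, 1, bmp) + 1;  (size pass)
 *      buf = malloc(len);
 *      bitmask_displaylist(buf, len, bmp);             (fill pass)
 *
 * For a bitmask with bits 0-3 and 7 set, the returned buffer
 * would hold the string "0-3,7".
 */
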
static int exists_flag(const char *path, const char *flag)
{
        char buf[PATH_MAX];
        struct stat statbuf;
        int rc;

        pathcat2(buf, sizeof(buf), path, flag);
        rc = (stat(buf, &statbuf) == 0);
        errno = 0;
        return rc;
}

static int store_flag(const char *path, const char *flag, int val)
{
        char buf[PATH_MAX];

        pathcat2(buf, sizeof(buf), path, flag);
        return write_string_file(buf, val ? "1" : "0");
}

static int store_number(const char *path, const char *file, int val)
{
        char buf[PATH_MAX];
        char data[SMALL_BUFSZ];

        memset(data, 0, sizeof(data));
        pathcat2(buf, sizeof(buf), path, file);
        snprintf(data, sizeof(data), "%d", val);
        return write_string_file(buf, data);
}

static int store_mask(const char *path, const char *mask,
                      const struct bitmask *bmp)
{
        char maskpath[PATH_MAX];
        char *bp = NULL;
        int rc;

        if (bmp == NULL)
                return 0;
        pathcat2(maskpath, sizeof(maskpath), path, mask);
        if ((bp = sprint_mask_buf(bmp)) == NULL)
                return -1;
        rc = write_string_file(maskpath, bp);
        free(bp);
        return rc;
}

/*
 * Return 1 if 'cpu' is online, else 0 if offline.  Tests the file
 * /sys/devices/system/cpu/cpuN/online for '0' or '1' contents,
 * where N == cpu number.
 */

char cpu_online(unsigned int cpu)
{
        char online;
        char cpupath[PATH_MAX];

        (void)snprintf(cpupath, sizeof(cpupath),
                       "/sys/devices/system/cpu/cpu%d/online", cpu);
        if (read_flag(cpupath, &online) < 0)
                return 0;       /* oops - guess that cpu's not there */
        return online;
}

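/*
 * Illustrative sketch (not part of the library): count the CPUs
 * reported online:
 *
 *      int cpu, n = 0;
 *
 *      for (cpu = 0; cpu < cpuset_cpus_nbits(); cpu++)
 *              if (cpu_online(cpu))
 *                      n++;
 *
 * Note that on some kernels cpu0 has no 'online' file, so it will
 * read as offline here.
 */
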
/*
 * The cpunodemap maps each cpu in [0 ... cpuset_cpus_nbits()),
 * to the node on which that cpu resides, or to cpuset_mems_nbits()
 * if that mapping is not known.
 *
 * To avoid every user having to recalculate this relation
 * from various clues in the sysfs file system (below the
 * path /sys/devices/system), a copy of this map is kept at
 * /var/run/cpunodemap.
 *
 * The system automatically cleans out files below
 * /var/run on each system reboot (see the init script
 * /etc/rc.d/boot.d/S*boot.localnet), so we don't have to worry
 * about stale data in this file across reboots.  If the file
 * is missing, let the first process that needs it, and has
 * permission to write in the /var/run directory, rebuild it.
 *
 * If using this cached data, remember the mtime of the mapfile
 * the last time we read it, in case something like a hotplug
 * event results in the file being removed and rebuilt, so we
 * can detect if we're using a stale cache, and need to reload.
 *
 * The mtime of this file is set to the time when we did
 * the recalculation of the map, from the clues beneath
 * /sys/devices/system.  This is done so that a program
 * won't see the mapfile it just wrote as being newer than what
 * it just wrote out (store_map) and read the same map back in
 * (load_map).
 */

/*
 * Hold flockfile(stdin) while using cpunodemap for posix thread safety.
 *
 * Note on locking and flockfile(FILE *):
 *
 * We use flockfile() and funlockfile() instead of directly
 * calling pthread_mutex_lock and pthread_mutex_unlock on
 * a pthread_mutex_t, because this avoids forcing the app
 * to link with libpthread.  The glibc implementation of
 * flockfile/funlockfile will fall back to no-ops if libpthread
 * doesn't happen to be linked.
 *
 * Since flockfile already has the moderately convoluted
 * combination of weak and strong symbols required to accomplish
 * this, it is easier to use flockfile() on some handy FILE *
 * stream as a surrogate for pthread locking than it is to
 * re-invent that wheel.
 *
 * Forcing all apps that use cpusets to link with libpthread
 * would force non-transparent initialization on apps that
 * might not be prepared to handle it.
 *
 * The application using libcpuset should never notice this
 * odd use of flockfile(), because we never return to the
 * application from any libcpuset call with any such lock held.
 * We just use this locking for guarding some non-atomic cached
 * data updates and accesses, internal to some libcpuset calls.
 * Also, flockfile() allows recursive nesting, so if the app
 * calls libcpuset holding such a file lock, we won't deadlock
 * if we go to acquire the same lock.  We'll just get the lock
 * and increment its counter while we hold it.
 */

static struct cpunodemap {
        int *map;               /* map[cpumask_sz]: maps cpu to its node */
        time_t mtime;           /* modtime of mapfile when last read */
} cpunodemap;

/*
 * rebuild_map() - Rebuild cpunodemap[] from scratch.
 *
 * Situation:
 *      Neither our in-memory cpunodemap[] array nor the
 *      cache of it in mapfile is current.
 * Action:
 *      Rebuild it from first principles and the information
 *      available below /sys/devices/system.
 */

static void rebuild_map()
{
        char buf[PATH_MAX];
        DIR *dir1, *dir2;
        struct dirent *dent1, *dent2;
        int ncpus = cpuset_cpus_nbits();
        int nmems = cpuset_mems_nbits();
        unsigned int cpu, mem;

        for (cpu = 0; cpu < (unsigned int)ncpus; cpu++)
                cpunodemap.map[cpu] = -1;
        pathcat2(buf, sizeof(buf), sysdevices, "node");
        if ((dir1 = opendir(buf)) == NULL)
                return;
        while ((dent1 = readdir(dir1)) != NULL) {
                if (sscanf(dent1->d_name, "node%u", &mem) < 1)
                        continue;
                pathcat3(buf, sizeof(buf), sysdevices, "node", dent1->d_name);
                if ((dir2 = opendir(buf)) == NULL)
                        continue;
                while ((dent2 = readdir(dir2)) != NULL) {
                        if (sscanf(dent2->d_name, "cpu%u", &cpu) < 1)
                                continue;
                        if (cpu >= (unsigned int)ncpus
                            || mem >= (unsigned int)nmems)
                                continue;
                        cpunodemap.map[cpu] = mem;
                }
                closedir(dir2);
        }
        closedir(dir1);
        cpunodemap.mtime = time(0);
}

/*
 * load_map() - Load cpunodemap[] from mapfile.
 *
 * Situation:
 *      The cpunodemap in mapfile is more recent than
 *      what we have in the cpunodemap[] array.
 * Action:
 *      Reload the cpunodemap[] array from the file.
 */

static void load_map()
{
        char buf[SMALL_BUFSZ];  /* buffer 1 line of mapfile */
        FILE *mapfp;            /* File stream on mapfile */
        int ncpus = cpuset_cpus_nbits();
        int nmems = cpuset_mems_nbits();
        unsigned int cpu, mem;

        if ((cpunodemap.map = calloc(ncpus, sizeof(int))) == NULL)
                return;
        cpunodemap.mtime = get_mtime(mapfile);
        if ((mapfp = fopen(mapfile, "r")) == NULL)
                return;
        for (cpu = 0; cpu < (unsigned int)ncpus; cpu++)
                cpunodemap.map[cpu] = nmems;
        while (flgets(buf, sizeof(buf), mapfp) != NULL) {
                if (sscanf(buf, "%u %u", &cpu, &mem) < 2)
                        continue;
                if (cpu >= (unsigned int)ncpus || mem >= (unsigned int)nmems)
                        continue;
                cpunodemap.map[cpu] = mem;
        }
        fclose(mapfp);
}

/*
 * store_map() - Write cpunodemap[] out to mapfile.
 *
 * Situation:
 *      The cpunodemap in the cpunodemap[] array is
 *      more recent than the one in mapfile.
 * Action:
 *      Write cpunodemap[] out to mapfile.
 */

static void store_map()
{
        char buf[PATH_MAX];
        int fd = -1;
        FILE *mapfp = NULL;
        int ncpus = cpuset_cpus_nbits();
        int nmems = cpuset_mems_nbits();
        unsigned int cpu, mem;

        snprintf(buf, sizeof(buf), "%s.%s", mapfile, "XXXXXX");
        if ((fd = mkstemp(buf)) < 0)
                goto err;
        if ((mapfp = fdopen(fd, "w")) == NULL)
                goto err;
        for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) {
                mem = cpunodemap.map[cpu];
                if (mem < (unsigned int)nmems)
                        fprintf(mapfp, "%u %u\n", cpu, mem);
        }
        fclose(mapfp);
        set_mtime(buf, cpunodemap.mtime);
        if (rename(buf, mapfile) < 0)
                goto err;
        /* mkstemp() creates mode 0600 - change to world readable */
        (void)chmod(mapfile, 0444);
        return;
err:
        if (mapfp != NULL) {
                fclose(mapfp);
                fd = -1;
        }
        if (fd >= 0)
                close(fd);
        (void)unlink(buf);
}

/*
 * Load and gain thread safe access to the <cpu, node> map.
 *
 * Return 0 on success with flockfile(stdin) held.
 * Each successful get_map() call must be matched with a
 * following put_map() call to release the lock.
 *
 * On error, return -1 with errno set and no lock held.
 */

static int get_map()
{
        time_t file_mtime;

        flockfile(stdin);

        if (cpunodemap.map == NULL) {
                cpunodemap.map = calloc(cpuset_cpus_nbits(), sizeof(int));
                if (cpunodemap.map == NULL)
                        goto err;
        }

        /* If no one has a good cpunodemap, rebuild from scratch */
        file_mtime = get_mtime(mapfile);
        if (cpunodemap.mtime == 0 && file_mtime == 0)
                rebuild_map();

        /* If either cpunodemap[] or mapfile newer, update other with it */
        file_mtime = get_mtime(mapfile);
        if (cpunodemap.mtime < file_mtime)
                load_map();
        else if (cpunodemap.mtime > file_mtime)
                store_map();
        return 0;
err:
        funlockfile(stdin);
        return -1;
}

static void put_map()
{
        funlockfile(stdin);
}

/* Set cpus to those local to Memory Nodes mems */
int cpuset_localcpus(const struct bitmask *mems, struct bitmask *cpus)
{
        int ncpus = cpuset_cpus_nbits();
        unsigned int cpu;

        if (check() < 0)
                return -1;

        get_map();
        bitmask_clearall(cpus);
        for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) {
                if (bitmask_isbitset(mems, cpunodemap.map[cpu]))
                        bitmask_setbit(cpus, cpu);
        }
        put_map();
        return 0;
}

/* Set mems to those local to CPUs cpus */
int cpuset_localmems(const struct bitmask *cpus, struct bitmask *mems)
{
        int ncpus = cpuset_cpus_nbits();
        unsigned int cpu;

        if (check() < 0)
                return -1;

        get_map();
        bitmask_clearall(mems);
        for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) {
                if (bitmask_isbitset(cpus, cpu))
                        bitmask_setbit(mems, cpunodemap.map[cpu]);
        }
        put_map();
        return 0;
}

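/*
 * Illustrative sketch (not part of the library): find the CPUs
 * local to Memory Node 0 (error handling omitted for brevity):
 *
 *      struct bitmask *mems = bitmask_alloc(cpuset_mems_nbits());
 *      struct bitmask *cpus = bitmask_alloc(cpuset_cpus_nbits());
 *
 *      bitmask_clearall(mems);
 *      bitmask_setbit(mems, 0);
 *      cpuset_localcpus(mems, cpus);   (cpus now holds node 0's CPUs)
 */
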
/*
 * distmap[]
 *
 * Array of ints of size cpumask_sz by nodemask_sz.
 *
 * Element distmap[cpu][mem] is the distance between CPU cpu
 * and Memory Node mem.  Distances are weighted to roughly
 * approximate the cost of memory references, and scaled so that
 * the distance from a CPU to its local Memory Node is ten (10).
 *
 * The first call to cpuset_cpumemdist() builds this map, from
 * whatever means the kernel provides to obtain these distances.
 *
 * These distances derive from ACPI SLIT table entries, which are
 * eight bits in size.
 *
 * Hold flockfile(stdout) while using distmap for posix thread safety.
 */

typedef unsigned char distmap_entry_t;  /* type of distmap[] entries */

static distmap_entry_t *distmap;        /* maps <cpu, mem> to distance */

#define DISTMAP_MAX UCHAR_MAX   /* maximum value in distmap[] */

#define I(i,j) ((i) * nmems + (j))      /* 2-D array index simulation */

/*
 * Parse arch neutral lines from 'distance' files of form:
 *
 *      46 66 10 20
 *
 * The lines contain a space separated list of distances, which is
 * parsed into array dists[] of each node's distance from the
 * specified node.
 *
 * Result is placed in distmap[ncpus][nmems]:
 *
 * For each cpu c on node:
 *      For each node position n in list of distances:
 *              distmap[c][n] = dists[n]
 */

static int parse_distmap_line(unsigned int node, char *buf)
{
        char *p, *q;
        int ncpus = cpuset_cpus_nbits();
        int nmems = cpuset_mems_nbits();
        unsigned int c, n;
        distmap_entry_t *dists = NULL;
        struct bitmask *cpus = NULL, *mems = NULL;
        int ret = -1;

        p = buf;
        if ((dists = calloc(nmems, sizeof(*dists))) == NULL)
                goto err;
        for (n = 0; n < (unsigned int)nmems; n++)
                dists[n] = DISTMAP_MAX;

        for (n = 0; n < (unsigned int)nmems && *p; n++, p = q) {
                unsigned int d;

                if ((p = strpbrk(p, "0123456789")) == NULL)
                        break;
                d = strtoul(p, &q, 10);
                if (p == q)
                        break;
                if (d < DISTMAP_MAX)
                        dists[n] = (distmap_entry_t) d;
        }

        if ((mems = bitmask_alloc(nmems)) == NULL)
                goto err;
        bitmask_setbit(mems, node);

        if ((cpus = bitmask_alloc(ncpus)) == NULL)
                goto err;
        cpuset_localcpus(mems, cpus);

        for (c = bitmask_first(cpus); c < (unsigned int)ncpus;
             c = bitmask_next(cpus, c + 1))
                for (n = 0; n < (unsigned int)nmems; n++)
                        distmap[I(c, n)] = dists[n];
        ret = 0;
        /* fall into ... */
err:
        bitmask_free(mems);
        bitmask_free(cpus);
        free(dists);
        return ret;
}

static int parse_distance_file(unsigned int node, const char *path)
{
        FILE *fp;
        char *buf = NULL;
        int buflen;

        if ((fp = fopen(path, "r")) == NULL)
                goto err;

        buflen = filesize(fp);

        if ((buf = malloc(buflen)) == NULL)
                goto err;

        if (flgets(buf, buflen, fp) == NULL)
                goto err;

        if (parse_distmap_line(node, buf) < 0)
                goto err;

        free(buf);
        fclose(fp);
        return 0;
err:
        free(buf);
        if (fp)
                fclose(fp);
        return -1;
}

static void build_distmap()
{
        static int tried_before = 0;
        int ncpus = cpuset_cpus_nbits();
        int nmems = cpuset_mems_nbits();
        int c, m;
        DIR *dir = NULL;
        struct dirent *dent;

        if (tried_before)
                goto err;
        tried_before = 1;

        if ((distmap = calloc(ncpus * nmems, sizeof(*distmap))) == NULL)
                goto err;

        for (c = 0; c < ncpus; c++)
                for (m = 0; m < nmems; m++)
                        distmap[I(c, m)] = DISTMAP_MAX;

        if ((dir = opendir(distance_directory)) == NULL)
                goto err;
        while ((dent = readdir(dir)) != NULL) {
                char buf[PATH_MAX];
                unsigned int node;

                if (sscanf(dent->d_name, "node%u", &node) < 1)
                        continue;
                pathcat3(buf, sizeof(buf), distance_directory, dent->d_name,
                         "distance");
                if (parse_distance_file(node, buf) < 0)
                        goto err;
        }
        closedir(dir);
        return;
err:
        if (dir)
                closedir(dir);
        free(distmap);
        distmap = NULL;
}

#ifdef ALTERNATE_SN_DISTMAP

/*
 * Parse SN architecture specific line of form:
 *
 *      node 3 001c14#1 local asic SHub_1.1, nasid 0x6, dist 66:46:20:10
 *
 * The second field is the node number.  The "dist" field is the colon
 * separated list of distances, which is parsed into array dists[] of
 * each node's distance from that node.
 *
 * Result is placed in distmap[ncpus][nmems]:
 *
 * For each cpu c on that node:
 *      For each node position n in list of distances:
 *              distmap[c][n] = dists[n]
 */

static void parse_distmap_line_sn(char *buf)
{
        char *p, *pend, *q;
        int ncpus = cpuset_cpus_nbits();
        int nmems = cpuset_mems_nbits();
        unsigned long c, n, node;
        distmap_entry_t *dists = NULL;
        struct bitmask *cpus = NULL, *mems = NULL;

        if ((p = strchr(buf, ' ')) == NULL)
                goto err;
        if ((node = strtoul(p, &q, 10)) >= (unsigned int)nmems)
                goto err;
        if ((p = strstr(q, " dist ")) == NULL)
                goto err;
        p += strlen(" dist ");
        if ((pend = strchr(p, ' ')) != NULL)
                *pend = '\0';
        if ((dists = calloc(nmems, sizeof(*dists))) == NULL)
                goto err;
        for (n = 0; n < (unsigned int)nmems; n++)
                dists[n] = DISTMAP_MAX;

        for (n = 0; n < (unsigned int)nmems && *p; n++, p = q) {
                unsigned long d;

                if ((p = strpbrk(p, "0123456789")) == NULL)
                        break;
                d = strtoul(p, &q, 10);
                if (p == q)
                        break;
                if (d < DISTMAP_MAX)
                        dists[n] = (distmap_entry_t) d;
        }

        if ((mems = bitmask_alloc(nmems)) == NULL)
                goto err;
        bitmask_setbit(mems, node);

        if ((cpus = bitmask_alloc(ncpus)) == NULL)
                goto err;
        cpuset_localcpus(mems, cpus);

        for (c = bitmask_first(cpus); c < (unsigned int)ncpus;
             c = bitmask_next(cpus, c + 1))
                for (n = 0; n < (unsigned int)nmems; n++)
                        distmap[I(c, n)] = dists[n];
        /* fall into ... */
err:
        bitmask_free(mems);
        bitmask_free(cpus);
        free(dists);
}

static void build_distmap_sn()
{
        int ncpus = cpuset_cpus_nbits();
        int nmems = cpuset_mems_nbits();
        int c, m;
        static int tried_before = 0;
        FILE *fp = NULL;
        char *buf = NULL;
        int buflen;

        if (tried_before)
                goto err;
        tried_before = 1;

        if ((fp = fopen(sn_topology, "r")) == NULL)
                goto err;

        if ((distmap = calloc(ncpus * nmems, sizeof(*distmap))) == NULL)
                goto err;

        for (c = 0; c < ncpus; c++)
                for (m = 0; m < nmems; m++)
                        distmap[I(c, m)] = DISTMAP_MAX;

        buflen = filesize(fp);
        if ((buf = malloc(buflen)) == NULL)
                goto err;

        while (flgets(buf, buflen, fp) != NULL)
                if (strprefix(buf, sn_top_node_prefix))
                        parse_distmap_line_sn(buf);

        free(buf);
        fclose(fp);
        return;
err:
        free(buf);
        free(distmap);
        distmap = NULL;
        if (fp)
                fclose(fp);
}

#endif

/* [optional] Hardware distance from CPU to Memory Node */
unsigned int cpuset_cpumemdist(int cpu, int mem)
{
        int ncpus = cpuset_cpus_nbits();
        int nmems = cpuset_mems_nbits();
        distmap_entry_t r = DISTMAP_MAX;

        flockfile(stdout);

        if (check() < 0)
                goto err;

        if (distmap == NULL)
                build_distmap();

#ifdef ALTERNATE_SN_DISTMAP
        if (distmap == NULL)
                build_distmap_sn();
#endif

        if (distmap == NULL)
                goto err;

        if (cpu < 0 || cpu >= ncpus || mem < 0 || mem >= nmems)
                goto err;

        r = distmap[I(cpu, mem)];
        /* fall into ... */
err:
        funlockfile(stdout);
        return r;
}

/* [optional] Return Memory Node closest to cpu */
int cpuset_cpu2node(int cpu)
{
        int ncpus = cpuset_cpus_nbits();
        int nmems = cpuset_mems_nbits();
        struct bitmask *cpus = NULL, *mems = NULL;
        int r = -1;

        if (check() < 0)
                goto err;

        if ((cpus = bitmask_alloc(ncpus)) == NULL)
                goto err;
        bitmask_setbit(cpus, cpu);

        if ((mems = bitmask_alloc(nmems)) == NULL)
                goto err;
        cpuset_localmems(cpus, mems);
        r = bitmask_first(mems);
        /* fall into ... */
err:
        bitmask_free(cpus);
        bitmask_free(mems);
        return r;
}

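/*
 * Illustrative sketch (not part of the library): the scaled
 * distances can be compared to pick the cheaper of two Memory
 * Nodes for a given CPU:
 *
 *      if (cpuset_cpumemdist(cpu, m1) < cpuset_cpumemdist(cpu, m2))
 *              ... ;   (m1 is closer to cpu than m2)
 *
 * and cpuset_cpu2node(cpu) should be the node whose distance from
 * cpu is the local value of ten (10).
 */
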
apply_cpuset_settings(const char * path,const struct cpuset * cp)1736 static int apply_cpuset_settings(const char *path, const struct cpuset *cp)
1737 {
1738 if (cp->cpu_exclusive_valid && cp->cpu_exclusive_dirty) {
1739 if (store_flag(path, "cpu_exclusive", cp->cpu_exclusive) < 0)
1740 goto err;
1741 }
1742
1743 if (cp->mem_exclusive_valid && cp->mem_exclusive_dirty) {
1744 if (store_flag(path, "mem_exclusive", cp->mem_exclusive) < 0)
1745 goto err;
1746 }
1747
1748 if (cp->mem_hardwall_valid && cp->mem_hardwall_dirty) {
1749 if (store_flag(path, "mem_hardwall", cp->mem_hardwall) < 0)
1750 goto err;
1751 }
1752
1753 if (cp->notify_on_release_valid && cp->notify_on_release_dirty) {
1754 if (store_flag(path, "notify_on_release", cp->notify_on_release)
1755 < 0)
1756 goto err;
1757 }
1758
1759 if (cp->memory_migrate_valid &&
1760 cp->memory_migrate_dirty && exists_flag(path, "memory_migrate")) {
1761 if (store_flag(path, "memory_migrate", cp->memory_migrate) < 0)
1762 goto err;
1763 }
1764
1765 if (cp->memory_pressure_enabled_valid &&
1766 cp->memory_pressure_enabled_dirty &&
1767 exists_flag(path, "memory_pressure_enabled")) {
1768 if (store_flag
1769 (path, "memory_pressure_enabled",
1770 cp->memory_pressure_enabled) < 0)
1771 goto err;
1772 }
1773
1774 if (cp->memory_spread_page_valid &&
1775 cp->memory_spread_page_dirty &&
1776 exists_flag(path, "memory_spread_page")) {
1777 if (store_flag
1778 (path, "memory_spread_page", cp->memory_spread_page) < 0)
1779 goto err;
1780 }
1781
1782 if (cp->memory_spread_slab_valid &&
1783 cp->memory_spread_slab_dirty &&
1784 exists_flag(path, "memory_spread_slab")) {
1785 if (store_flag
1786 (path, "memory_spread_slab", cp->memory_spread_slab) < 0)
1787 goto err;
1788 }
1789
1790 if (cp->sched_load_balance_valid &&
1791 cp->sched_load_balance_dirty &&
1792 exists_flag(path, "sched_load_balance")) {
1793 if (store_flag
1794 (path, "sched_load_balance", cp->sched_load_balance) < 0)
1795 goto err;
1796 }
1797
1798 if (cp->sched_relax_domain_level_valid &&
1799 cp->sched_relax_domain_level_dirty &&
1800 exists_flag(path, "sched_relax_domain_level")) {
1801 if (store_number
1802 (path, "sched_relax_domain_level",
1803 cp->sched_relax_domain_level) < 0)
1804 goto err;
1805 }
1806
1807 if (cp->cpus_valid && cp->cpus_dirty) {
1808 if (store_mask(path, "cpus", cp->cpus) < 0)
1809 goto err;
1810 }
1811
1812 if (cp->mems_valid && cp->mems_dirty) {
1813 if (store_mask(path, "mems", cp->mems) < 0)
1814 goto err;
1815 }
1816 return 0;
1817 err:
1818 return -1;
1819 }
1820
1821 /*
1822 * get_siblings() - helper routine for cpuset_would_crash_kernel(), below.
1823 *
1824 * Extract max value of any 'siblings' field in /proc/cpuinfo.
1825 * Cache the result - only need to extract once in lifetime of task.
1826 *
1827 * The siblings field is the number of logical CPUs in a physical
1828 * processor package. It is equal to the product of the number of
1829 * cores in that package, times the number of hyper-threads per core.
1830 * The bug that cpuset_would_crash_kernel() is detecting arises
1831 * when a cpu_exclusive cpuset tries to include just some, not all,
1832 * of the sibling logical CPUs available in a processor package.
1833 *
1834 * In the improbable case that a system has mixed values of siblings
1835 * (some processor packages have more than others, perhaps due to
1836 * partially enabling Hyper-Threading), we take the worse case value,
1837 * the largest siblings value. This might be overkill. I don't know
1838 * if this kernel bug considers each processor package's siblings
1839 * separately or not. But it sure is easier this way ...
1840 *
1841 * This routine takes about 0.7 msecs on a 4 CPU 2.8 MHz Xeon, from
1842 * open to close, the first time called.
1843 */
1844
get_siblings()1845 static int get_siblings()
1846 {
1847 static int siblings;
1848 char buf[32]; /* big enough for one 'siblings' line */
1849 FILE *fp;
1850
1851 if (siblings)
1852 return siblings;
1853
1854 if ((fp = fopen("/proc/cpuinfo", "r")) == NULL)
1855 return 4; /* wing it - /proc not mounted ? */
1856 while (flgets(buf, sizeof(buf), fp) != NULL) {
1857 int s;
1858
1859 if (sscanf(buf, "siblings : %d", &s) < 1)
1860 continue;
1861 if (s > siblings)
1862 siblings = s;
1863 }
1864 fclose(fp);
1865 if (siblings == 0)
1866 siblings = 1; /* old kernel, no siblings, default to 1 */
1867 return siblings;
1868 }
1869
1870 /*
1871 * Some 2.6.16 and 2.6.17 kernel versions have a bug in the dynamic
1872 * scheduler domain code invoked for cpu_exclusive cpusets that causes
1873 * the kernel to freeze, requiring a hardware reset.
1874 *
1875 * On kernels built with CONFIG_SCHED_MC enabled, if a 'cpu_exclusive'
1876 * cpuset is defined where that cpusets 'cpus' are not on package
1877 * boundaries then the kernel will freeze, usually as soon as this
1878 * cpuset is created, requiring a hardware reset.
1879 *
1880 * A cpusets 'cpus' are not on package boundaries if the cpuset
1881 * includes a proper non-empty subset (some, but not all) of the
1882 * logical cpus on a processor package. This requires multiple
1883 * logical CPUs per package, available with either Hyper-Thread or
1884 * Multi-Core support. Without one of these features, there is only
1885 * one logical CPU per physical package, and it's not possible to
1886 * have a proper, non-empty subset of a set of cardinality one.
1887 *
1888 * SUSE SLES10 kernels, as first released, only enable CONFIG_SCHED_MC
1889 * on i386 and x86_64 arch's.
1890 *
1891 * The objective of this routine cpuset_would_crash_kernel() is to
1892 * determine if a proposed cpuset setting would crash the kernel due
1893 * to this bug, so that the caller can avoid the crash.
1894 *
1895 * Ideally we'd check for exactly these conditions here, but computing
1896 * the package (identified by the 'physical id' field of /proc/cpuinfo)
1897 * of each cpu in a cpuset is more effort than it's worth here.
1898 *
1899 * Also there is no obvious way to identify exactly whether the kernel
1900 * one is executing on has this bug, short of trying it, and seeing
1901 * if the kernel just crashed.
1902 *
1903 * So for now, we look for a simpler set of conditions, that meets
1904 * our immediate need - avoid this crash on SUSE SLES10 systems that
1905 * are susceptible to it. We look for the kernel version 2.6.16.*,
1906 * which is the base kernel of SUSE SLES10, and for i386 or x86_64
1907 * processors, which had CONFIG_SCHED_MC enabled.
1908 *
1909 * If these simpler conditions are met, we further simplify the check,
1910 * by presuming that the logical CPUs are numbered on processor
1911 * package boundaries. If each package has S siblings, we assume
1912 * that CPUs numbered N through N + S - 1 are on the same package,
1913 * for any CPU N such that N mod S == 0.
1914 *
1915 * Yes, this is a hack, focused on avoiding kernel freezes on
1916 * susceptible SUSE SLES10 systems.
1917 */
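
/*
 * For example (hypothetical values): with siblings == 2, CPUs are
 * presumed paired into packages {0,1}, {2,3}, {4,5}, ...  A
 * cpu_exclusive cpuset with cpus == 1-2 sets one bit in window {0,1}
 * and one bit in window {2,3}; each window is neither empty nor full,
 * so the check below reports that this cpuset would crash a
 * susceptible kernel.  A cpuset with cpus == 2-3 fills window {2,3}
 * exactly, and passes.
 */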
1918
1919 static int cpuset_would_crash_kernel(const struct cpuset *cp)
1920 {
1921 static int susceptible_system = -1;
1922
1923 if (!cp->cpu_exclusive)
1924 goto ok;
1925
1926 if (susceptible_system == -1) {
1927 struct utsname u;
1928 int rel_2_6_16, arch_i386, arch_x86_64;
1929
1930 if (uname(&u) < 0)
1931 goto fail;
1932 rel_2_6_16 = strprefix(u.release, "2.6.16.");
1933 arch_i386 = streq(u.machine, "i386");
1934 arch_x86_64 = streq(u.machine, "x86_64");
1935 susceptible_system = rel_2_6_16 && (arch_i386 || arch_x86_64);
1936 }
1937
1938 if (susceptible_system) {
1939 int ncpus = cpuset_cpus_nbits();
1940 int siblings = get_siblings();
1941 unsigned int cpu;
1942
1943 for (cpu = 0; cpu < (unsigned int)ncpus; cpu += siblings) {
1944 int s, num_set = 0;
1945
1946 for (s = 0; s < siblings; s++) {
1947 if (bitmask_isbitset(cp->cpus, cpu + s))
1948 num_set++;
1949 }
1950
1951 /* If none or all siblings set, we're still ok */
1952 if (num_set == 0 || num_set == siblings)
1953 continue;
1954
1955 /* Found one that would crash kernel. Fail. */
1956 errno = ENXIO;
1957 goto fail;
1958 }
1959 }
1960 /* If not susceptible, or if all ok, fall into "ok" ... */
1961 ok:
1962 return 0; /* would not crash */
1963 fail:
1964 return 1; /* would crash */
1965 }
1966
1967 /* Compare two cpusets and mark the dirty variables */
1968 static void mark_dirty_variable(struct cpuset *cp1, const struct cpuset *cp2)
1969 {
1970 if (cp1->cpu_exclusive_valid &&
1971 cp1->cpu_exclusive != cp2->cpu_exclusive)
1972 cp1->cpu_exclusive_dirty = 1;
1973
1974 if (cp1->mem_exclusive_valid &&
1975 cp1->mem_exclusive != cp2->mem_exclusive)
1976 cp1->mem_exclusive_dirty = 1;
1977
1978 if (cp1->mem_hardwall_valid && cp1->mem_hardwall != cp2->mem_hardwall)
1979 cp1->mem_hardwall_dirty = 1;
1980
1981 if (cp1->notify_on_release_valid &&
1982 cp1->notify_on_release != cp2->notify_on_release)
1983 cp1->notify_on_release_dirty = 1;
1984
1985 if (cp1->memory_migrate_valid &&
1986 cp1->memory_migrate != cp2->memory_migrate)
1987 cp1->memory_migrate_dirty = 1;
1988
1989 if (cp1->memory_pressure_enabled_valid &&
1990 cp1->memory_pressure_enabled != cp2->memory_pressure_enabled)
1991 cp1->memory_pressure_enabled_dirty = 1;
1992
1993 if (cp1->memory_spread_page_valid &&
1994 cp1->memory_spread_page != cp2->memory_spread_page)
1995 cp1->memory_spread_page_dirty = 1;
1996
1997 if (cp1->memory_spread_slab_valid &&
1998 cp1->memory_spread_slab != cp2->memory_spread_slab)
1999 cp1->memory_spread_slab_dirty = 1;
2000
2001 if (cp1->sched_load_balance_valid &&
2002 cp1->sched_load_balance != cp2->sched_load_balance)
2003 cp1->sched_load_balance_dirty = 1;
2004
2005 if (cp1->sched_relax_domain_level_valid &&
2006 cp1->sched_relax_domain_level != cp2->sched_relax_domain_level)
2007 cp1->sched_relax_domain_level_dirty = 1;
2008
2009 if (cp1->cpus_valid && !bitmask_equal(cp1->cpus, cp2->cpus))
2010 cp1->cpus_dirty = 1;
2011 if (cp1->mems_valid && !bitmask_equal(cp1->mems, cp2->mems))
2012 cp1->mems_dirty = 1;
2013 }
2014
2015 /* Create (if new set) or modify cpuset 'cp' at location 'relpath' */
2016 static int cr_or_mod(const char *relpath, const struct cpuset *cp, int new)
2017 {
2018 char buf[PATH_MAX];
2019 int do_rmdir_on_err = 0;
2020 int do_restore_cp_sav_on_err = 0;
2021 struct cpuset *cp_sav = NULL;
2022 int sav_errno;
2023
2024 if (check() < 0)
2025 goto err;
2026
2027 if (cpuset_would_crash_kernel(cp))
2028 goto err;
2029
2030 fullpath(buf, sizeof(buf), relpath);
2031
2032 if (new) {
2033 if (mkdir(buf, 0755) < 0)
2034 goto err;
2035 /* we made it, so we should remove it on error */
2036 do_rmdir_on_err = 1;
2037 }
2038
2039 if ((cp_sav = cpuset_alloc()) == NULL)
2040 goto err;
2041 if (cpuset_query(cp_sav, relpath) < 0)
2042 goto err;
2043 /* we have old settings to restore on error */
2044 do_restore_cp_sav_on_err = 1;
2045
2046 	/* check which variables need to be restored on error */
2047 mark_dirty_variable(cp_sav, cp);
2048
2049 if (apply_cpuset_settings(buf, cp) < 0)
2050 goto err;
2051
2052 cpuset_free(cp_sav);
2053 return 0;
2054 err:
2055 sav_errno = errno;
2056 if (do_restore_cp_sav_on_err)
2057 (void)apply_cpuset_settings(buf, cp_sav);
2058 if (cp_sav)
2059 cpuset_free(cp_sav);
2060 if (do_rmdir_on_err)
2061 (void)rmdir(buf);
2062 errno = sav_errno;
2063 return -1;
2064 }
2065
2066 /* Create cpuset 'cp' at location 'relpath' */
2067 int cpuset_create(const char *relpath, const struct cpuset *cp)
2068 {
2069 return cr_or_mod(relpath, cp, 1);
2070 }
2071
2072 /* Delete cpuset at location 'relpath' (if empty) */
2073 int cpuset_delete(const char *relpath)
2074 {
2075 char buf[PATH_MAX];
2076
2077 if (check() < 0)
2078 goto err;
2079
2080 fullpath(buf, sizeof(buf), relpath);
2081 if (rmdir(buf) < 0)
2082 goto err;
2083
2084 return 0;
2085 err:
2086 return -1;
2087 }
2088
2089 /* Set cpuset 'cp' to the cpuset at location 'relpath' */
2090 int cpuset_query(struct cpuset *cp, const char *relpath)
2091 {
2092 char buf[PATH_MAX];
2093
2094 if (check() < 0)
2095 goto err;
2096
2097 fullpath(buf, sizeof(buf), relpath);
2098
2099 if (load_flag(buf, &cp->cpu_exclusive, "cpu_exclusive") < 0)
2100 goto err;
2101 cp->cpu_exclusive_valid = 1;
2102
2103 if (load_flag(buf, &cp->mem_exclusive, "mem_exclusive") < 0)
2104 goto err;
2105 cp->mem_exclusive_valid = 1;
2106
2107 if (load_flag(buf, &cp->notify_on_release, "notify_on_release") < 0)
2108 goto err;
2109 cp->notify_on_release_valid = 1;
2110
2111 if (exists_flag(buf, "memory_migrate")) {
2112 if (load_flag(buf, &cp->memory_migrate, "memory_migrate") < 0)
2113 goto err;
2114 cp->memory_migrate_valid = 1;
2115 }
2116
2117 if (exists_flag(buf, "mem_hardwall")) {
2118 if (load_flag(buf, &cp->mem_hardwall, "mem_hardwall") < 0)
2119 goto err;
2120 cp->mem_hardwall_valid = 1;
2121 }
2122
2123 if (exists_flag(buf, "memory_pressure_enabled")) {
2124 if (load_flag
2125 (buf, &cp->memory_pressure_enabled,
2126 "memory_pressure_enabled") < 0)
2127 goto err;
2128 cp->memory_pressure_enabled_valid = 1;
2129 }
2130
2131 if (exists_flag(buf, "memory_spread_page")) {
2132 if (load_flag
2133 (buf, &cp->memory_spread_page, "memory_spread_page") < 0)
2134 goto err;
2135 cp->memory_spread_page_valid = 1;
2136 }
2137
2138 if (exists_flag(buf, "memory_spread_slab")) {
2139 if (load_flag
2140 (buf, &cp->memory_spread_slab, "memory_spread_slab") < 0)
2141 goto err;
2142 cp->memory_spread_slab_valid = 1;
2143 }
2144
2145 if (exists_flag(buf, "sched_load_balance")) {
2146 if (load_flag
2147 (buf, &cp->sched_load_balance, "sched_load_balance") < 0)
2148 goto err;
2149 cp->sched_load_balance_valid = 1;
2150 }
2151
2152 if (exists_flag(buf, "sched_relax_domain_level")) {
2153 if (load_number
2154 (buf, &cp->sched_relax_domain_level,
2155 "sched_relax_domain_level") < 0)
2156 goto err;
2157 cp->sched_relax_domain_level_valid = 1;
2158 }
2159
2160 if (load_mask(buf, &cp->cpus, cpuset_cpus_nbits(), "cpus") < 0)
2161 goto err;
2162 cp->cpus_valid = 1;
2163
2164 if (load_mask(buf, &cp->mems, cpuset_mems_nbits(), "mems") < 0)
2165 goto err;
2166 cp->mems_valid = 1;
2167
2168 return 0;
2169 err:
2170 return -1;
2171 }
2172
2173 /* Modify cpuset at location 'relpath' to values of 'cp' */
2174 int cpuset_modify(const char *relpath, const struct cpuset *cp)
2175 {
2176 return cr_or_mod(relpath, cp, 0);
2177 }
2178
2179 /* Get cpuset path of pid into buf */
2180 char *cpuset_getcpusetpath(pid_t pid, char *buf, size_t size)
2181 {
2182 int fd; /* dual use: cpuset file for pid and self */
2183 int rc; /* dual use: snprintf and read return codes */
2184
2185 if (check() < 0)
2186 return NULL;
2187
2188 /* borrow result buf[] to build cpuset file path */
2189 if (pid == 0)
2190 rc = snprintf(buf, size, "/proc/self/cpuset");
2191 else
2192 rc = snprintf(buf, size, "/proc/%d/cpuset", pid);
2193 if (rc >= (int)size) {
2194 errno = E2BIG;
2195 return NULL;
2196 }
2197 if ((fd = open(buf, O_RDONLY)) < 0) {
2198 int e = errno;
2199 if (e == ENOENT)
2200 e = ESRCH;
2201 if ((fd = open("/proc/self/cpuset", O_RDONLY)) < 0)
2202 e = ENOSYS;
2203 else
2204 close(fd);
2205 errno = e;
2206 return NULL;
2207 }
2208 rc = read(fd, buf, size);
2209 close(fd);
2210 if (rc < 0)
2211 return NULL;
2212 if (rc >= (int)size) {
2213 errno = E2BIG;
2214 return NULL;
2215 }
2216 buf[rc] = 0;
2217 chomp(buf);
2218 return buf;
2220 }
2221
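/*
 * A minimal calling sketch for cpuset_getcpusetpath() above
 * (error handling elided):
 *
 *	char buf[PATH_MAX];
 *
 *	if (cpuset_getcpusetpath(0, buf, sizeof(buf)) != NULL)
 *		printf("current task's cpuset: %s\n", buf);
 */
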
2222 /* Get cpuset 'cp' of pid */
2223 int cpuset_cpusetofpid(struct cpuset *cp, pid_t pid)
2224 {
2225 char buf[PATH_MAX];
2226
2227 if (cpuset_getcpusetpath(pid, buf, sizeof(buf)) == NULL)
2228 return -1;
2229 if (cpuset_query(cp, buf) < 0)
2230 return -1;
2231 return 0;
2232 }
2233
2234 /* [optional] Return mountpoint of cpuset filesystem */
2235 const char *cpuset_mountpoint()
2236 {
2237 if (check() < 0) {
2238 switch (errno) {
2239 case ENODEV:
2240 return "[cpuset filesystem not mounted]";
2241 default:
2242 return "[cpuset filesystem not supported]";
2243 }
2244 }
2245 return cpusetmnt;
2246 }
2247
2248 /* Return true if path is a directory. */
2249 static int isdir(const char *path)
2250 {
2251 struct stat statbuf;
2252
2253 if (stat(path, &statbuf) < 0)
2254 return 0;
2255 return S_ISDIR(statbuf.st_mode);
2256 }
2257
2258 /*
2259 * [optional] cpuset_collides_exclusive() - True if would collide exclusive.
2260 *
2261 * Return true iff the specified cpuset would overlap with any
2262 * sibling cpusets in either cpus or mems, where either this
2263 * cpuset or the sibling is cpu_exclusive or mem_exclusive.
2264 *
2265 * cpuset_create() fails with errno == EINVAL if the requested cpuset
2266 * would overlap with any sibling, where either one is cpu_exclusive or
2267 * mem_exclusive. This is a common, and not obvious, error. The
2268 * following routine checks for this particular case, so that code
2269 * creating cpusets can better identify the situation, perhaps to issue
2270 * a more informative error message.
2271 *
2272 * Can also be used to diagnose cpuset_modify failures. This
2273 * routine ignores any existing cpuset with the same path as the
2274 * given 'cpusetpath', and only looks for exclusive collisions with
2275 * sibling cpusets of that path.
2276 *
2277 * In case of any error, returns (0) -- does not collide. Presumably
2278 * any actual attempt to create or modify a cpuset will encounter the
2279 * same error, and report it usefully.
2280 *
2281 * This routine is not particularly efficient; most likely code creating or
2282 * modifying a cpuset will want to try the operation first, and then if that
2283 * fails with errno EINVAL, perhaps call this routine to determine if an
2284 * exclusive cpuset collision caused the error.
2285 */
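
/*
 * A minimal calling sketch of the try-then-diagnose pattern described
 * above (the cpuset path "/demo" is hypothetical):
 *
 *	if (cpuset_create("/demo", cp) < 0) {
 *		if (errno == EINVAL &&
 *		    cpuset_collides_exclusive("/demo", cp))
 *			fprintf(stderr, "collides with exclusive sibling\n");
 *	}
 */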
2286
2287 int cpuset_collides_exclusive(const char *cpusetpath, const struct cpuset *cp1)
2288 {
2289 char parent[PATH_MAX];
2290 char *p;
2291 char *pathcopy = NULL;
2292 char *base;
2293 DIR *dir = NULL;
2294 struct dirent *dent;
2295 struct cpuset *cp2 = NULL;
2296 struct bitmask *cpus1 = NULL, *cpus2 = NULL;
2297 struct bitmask *mems1 = NULL, *mems2 = NULL;
2298 int ret;
2299
2300 if (check() < 0)
2301 goto err;
2302
2303 fullpath(parent, sizeof(parent), cpusetpath);
2304 if (streq(parent, cpusetmnt))
2305 goto err; /* only one cpuset root - can't collide */
2306 pathcopy = strdup(parent);
2307 p = strrchr(parent, '/');
2308 if (!p)
2309 goto err; /* huh? - impossible - run and hide */
2310 *p = 0; /* now parent is dirname of fullpath */
2311
2312 p = strrchr(pathcopy, '/');
2313 base = p + 1; /* now base is basename of fullpath */
2314 if (!*base)
2315 goto err; /* this is also impossible - run away */
2316
2317 if ((dir = opendir(parent)) == NULL)
2318 goto err;
2319 if ((cp2 = cpuset_alloc()) == NULL)
2320 goto err;
2321 if ((cpus1 = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
2322 goto err;
2323 if ((cpus2 = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
2324 goto err;
2325 if ((mems1 = bitmask_alloc(cpuset_mems_nbits())) == NULL)
2326 goto err;
2327 if ((mems2 = bitmask_alloc(cpuset_mems_nbits())) == NULL)
2328 goto err;
2329
2330 while ((dent = readdir(dir)) != NULL) {
2331 char child[PATH_MAX];
2332
2333 if (streq(dent->d_name, ".") || streq(dent->d_name, ".."))
2334 continue;
2335 if (streq(dent->d_name, base))
2336 continue;
2337 pathcat2(child, sizeof(child), parent, dent->d_name);
2338 if (!isdir(child))
2339 continue;
2340 if (cpuset_query(cp2, child + strlen(cpusetmnt)) < 0)
2341 goto err;
2342 if (cp1->cpu_exclusive || cp2->cpu_exclusive) {
2343 cpuset_getcpus(cp1, cpus1);
2344 cpuset_getcpus(cp2, cpus2);
2345 if (bitmask_intersects(cpus1, cpus2))
2346 goto collides;
2347 }
2348 if (cp1->mem_exclusive || cp2->mem_exclusive) {
2349 cpuset_getmems(cp1, mems1);
2350 cpuset_getmems(cp2, mems2);
2351 if (bitmask_intersects(mems1, mems2))
2352 goto collides;
2353 }
2354 }
2355 err:
2356 /* error, or did not collide */
2357 ret = 0;
2358 goto done;
2359 collides:
2360 /* collides */
2361 ret = 1;
2362 /* fall into ... */
2363 done:
2364 if (dir)
2365 closedir(dir);
2366 cpuset_free(cp2);
2367 free(pathcopy);
2368 bitmask_free(cpus1);
2369 bitmask_free(cpus2);
2370 bitmask_free(mems1);
2371 bitmask_free(mems2);
2372 return ret;
2373 }
2374
2375 /*
2376 * [optional] cpuset_nuke() - Remove cpuset any way possible
2377 *
2378 * Remove a cpuset, including killing tasks in it, and
2379 * removing any descendant cpusets and killing their tasks.
2380 *
2381 * Tasks can take a long time (minutes on some configurations)
2382 * to exit. Loop up to 'seconds' seconds, trying to kill them.
2383 *
2384 * How we do it:
2385 * 1) First, kill all the pids, looping until there are
2386 * no more pids in this cpuset or below, or until the
2387 * 'seconds' timeout limit is exceeded.
2388 * 2) Then depth first recursively rmdir the cpuset directories.
2389 * 3) If by this point the original cpuset is gone, we succeeded.
2390 *
2391 * If the timeout is exceeded, and tasks still exist, fail with
2392 * errno == ETIME.
2393 *
2394 * We sleep a variable amount of time. After the first attempt to
2395 * kill all the tasks in the cpuset or its descendants, we sleep 1
2396 * second, the next time 2 seconds, increasing 1 second each loop
2397 * up to a max of 10 seconds. If more loops past 10 are required
2398 * to kill all the tasks, we sleep 10 seconds each subsequent loop.
2399 * In any case, before the last loop, we sleep however many seconds
2400 * remain of the original timeout 'seconds' requested. The total
2401 * time of all sleeps will be no more than the requested 'seconds'.
2402 *
2403 * If the cpuset started out empty of any tasks, or if the passed in
2404 * 'seconds' was zero, then this routine will return quickly, having
2405 * not slept at all. Otherwise, this routine will at a minimum send
2406 * a SIGKILL to all the tasks in this cpuset subtree, then sleep one
2407 * second, before looking to see if any tasks remain. If tasks remain
2408 * in the cpuset subtree, and a longer 'seconds' timeout was requested
2409 * (more than one), it will continue to kill remaining tasks and sleep,
2410 * in a loop, for as long as time and tasks remain.
2411 *
2412 * The signal sent for the kill is hardcoded to SIGKILL (9). If some
2413 * other signal should be sent first, use a separate code loop,
2414 * perhaps based on cpuset_init_pidlist and cpuset_get_pidlist, to
2415 * scan the task pids in a cpuset. If SIGKILL should -not- be sent,
2416 * this cpuset_nuke() routine can still be called to recursively
2417 * remove a cpuset subtree, by specifying a timeout of zero 'seconds'.
2418 *
2419 * On success, returns 0 with errno == 0.
2420 *
2421 * On failure, returns -1, with errno possibly one of:
2422 * EACCES - search permission denied on intervening directory
2423 * ETIME - timed out - tasks remain after 'seconds' timeout
2424 * EMFILE - too many open files
2425 * ENODEV - /dev/cpuset not mounted
2426 * ENOENT - component of cpuset path doesn't exist
2427 * ENOMEM - out of memory
2428 * ENOSYS - kernel doesn't support cpusets
2429 * ENOTDIR - component of cpuset path is not a directory
2430 * EPERM - lacked permission to kill a task
2431 * EPERM - lacked permission to read cpusets or files therein
2432 */
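
/*
 * For example: cpuset_nuke(path, 7) against stubborn tasks sleeps
 * 1, then 2, then 3 seconds; only 1 second of the budget then
 * remains, so the final sleep is capped at 1 second.  Having slept
 * a total of 1 + 2 + 3 + 1 == 7 seconds with tasks still present,
 * the next pass finds secs_left == 0 and fails with ETIME.
 */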
2433
2434 void cpuset_fts_reverse(struct cpuset_fts_tree *cs_tree);
2435
2436 int cpuset_nuke(const char *relpath, unsigned int seconds)
2437 {
2438 unsigned int secs_left = seconds; /* total sleep seconds left */
2439 unsigned int secs_loop = 1; /* how much sleep next loop */
2440 unsigned int secs_slept; /* seconds slept in sleep() */
2441 struct cpuset_pidlist *pl = NULL; /* pids in cpuset subtree */
2442 struct cpuset_fts_tree *cs_tree;
2443 const struct cpuset_fts_entry *cs_entry;
2444 int ret, sav_errno = 0;
2445
2446 if (check() < 0)
2447 return -1;
2448
2449 if (seconds == 0)
2450 goto rmdir_cpusets;
2451
2452 while (1) {
2453 int plen, j;
2454
2455 if ((pl = cpuset_init_pidlist(relpath, 1)) == NULL) {
2456 /* missing cpuset is as good as if already nuked */
2457 if (errno == ENOENT) {
2458 ret = 0;
2459 goto no_more_cpuset;
2460 }
2461
2462 /* other problems reading cpuset are bad news */
2463 sav_errno = errno;
2464 goto failed;
2465 }
2466
2467 if ((plen = cpuset_pidlist_length(pl)) == 0)
2468 goto rmdir_cpusets;
2469
2470 for (j = 0; j < plen; j++) {
2471 pid_t pid;
2472
2473 if ((pid = cpuset_get_pidlist(pl, j)) > 1) {
2474 if (kill(pid, SIGKILL) < 0 && errno != ESRCH) {
2475 sav_errno = errno;
2476 goto failed;
2477 }
2478 }
2479 }
2480
2481 if (secs_left == 0)
2482 goto took_too_long;
2483
2484 cpuset_freepidlist(pl);
2485 pl = NULL;
2486
2487 secs_slept = secs_loop - sleep(secs_loop);
2488
2489 /* Ensure forward progress */
2490 if (secs_slept == 0)
2491 secs_slept = 1;
2492
2493 /* Ensure sane sleep() return (unnecessary?) */
2494 if (secs_slept > secs_loop)
2495 secs_slept = secs_loop;
2496
2497 secs_left -= secs_slept;
2498
2499 if (secs_loop < 10)
2500 secs_loop++;
2501
2502 secs_loop = min(secs_left, secs_loop);
2503 }
2504
2505 took_too_long:
2506 sav_errno = ETIME;
2507 /* fall into ... */
2508 failed:
2509 cpuset_freepidlist(pl);
2510 errno = sav_errno;
2511 return -1;
2512
2513 rmdir_cpusets:
2514 /* Let's try removing cpuset(s) now. */
2515 cpuset_freepidlist(pl);
2516
2517 	if ((cs_tree = cpuset_fts_open(relpath)) == NULL) {
2518 		/* missing cpuset is as good as if already nuked */
2519 		sav_errno = (errno == ENOENT) ? 0 : errno;
2520 		ret = sav_errno ? -1 : 0;
		goto no_more_cpuset;
	}
	ret = 0;
	cpuset_fts_reverse(cs_tree);	/* rmdir's must be done bottom up */
2521 while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL) {
2522 char buf[PATH_MAX];
2523
2524 fullpath(buf, sizeof(buf), cpuset_fts_get_path(cs_entry));
2525 if (rmdir(buf) < 0 && errno != ENOENT) {
2526 sav_errno = errno;
2527 ret = -1;
2528 }
2529 }
2530 cpuset_fts_close(cs_tree);
2531 /* fall into ... */
2532 no_more_cpuset:
2533 if (ret == 0)
2534 errno = 0;
2535 else
2536 errno = sav_errno;
2537 return ret;
2538 }
2539
2540 /*
2541 * When recursively reading all the tasks files from a subtree,
2542 * chain together the read results, one pidblock per tasks file,
2543 * containing the raw unprocessed ASCII as read(2) in. After
2544 * we gather up this raw data, we then go back to count how
2545 * many pid's there are in total, allocate an array of pid_t
2546 * of that size, and transform the raw ASCII data into this
2547 * array of pid_t's.
2548 */
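
/*
 * For example (hypothetical contents): two tasks files reading
 * "12\n7\n" and "403\n" yield pidcount()s of 2 and 1, so pids[] is
 * allocated with three entries; pid_transform() fills them, and
 * after qsort() the pidlist holds 7, 12, 403.
 */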
2549
2550 struct pidblock {
2551 char *buf;
2552 int buflen;
2553 struct pidblock *next;
2554 };
2555
2556 /*
2557 * Chain the raw contents of a file onto the pbhead list.
2558 *
2559 * We malloc "+ 1" extra byte for a nul-terminator, so that
2560 * the strtoul() loop in pid_transform() won't scan past
2561 * the end of pb->buf[] and accidentally find more pids.
2562 */
2563 static void add_pidblock(const char *file, struct pidblock **ppbhead)
2564 {
2565 FILE *fp = NULL;
2566 struct pidblock *pb = NULL;
2567 int fsz;
2568
2569 if ((fp = fopen(file, "r")) == NULL)
2570 goto err;
2571 fsz = filesize(fp);
2572 if (fsz == 0)
2573 goto err;
2574 if ((pb = calloc(1, sizeof(*pb))) == NULL)
2575 goto err;
2576 pb->buflen = fsz;
2577 if ((pb->buf = malloc(pb->buflen + 1)) == NULL)
2578 goto err;
2579 	if (fread(pb->buf, 1, pb->buflen, fp) > 0) {
2580 		pb->buf[pb->buflen] = '\0';
2581 		pb->next = *ppbhead;
2582 		*ppbhead = pb;
2583 	} else {
		/* nothing read - don't chain (and leak) the empty block */
		free(pb->buf);
		free(pb);
	}
2584 fclose(fp);
2585 return;
2586 err:
2587 if (fp)
2588 fclose(fp);
2589 free(pb);
2590 }
2591
2592 static void read_task_file(const char *relpath, struct pidblock **ppbhead)
2593 {
2594 char buf[PATH_MAX];
2595
2596 fullpath2(buf, sizeof(buf), relpath, "tasks");
2597 add_pidblock(buf, ppbhead);
2598 }
2599
2600 struct cpuset_pidlist {
2601 pid_t *pids;
2602 int npids;
2603 };
2604
2605 /* Count how many pids are in buf (one per line - just count newlines) */
2606 static int pidcount(const char *buf, int buflen)
2607 {
2608 int n = 0;
2609 const char *cp;
2610
2611 for (cp = buf; cp < buf + buflen; cp++) {
2612 if (*cp == '\n')
2613 n++;
2614 }
2615 return n;
2616 }
2617
2618 /* Transform one-per-line ascii pids in pb to pid_t entries in pl */
2619 static int pid_transform(struct pidblock *pb, struct cpuset_pidlist *pl, int n)
2620 {
2621 char *a, *b;
2622
2623 for (a = pb->buf; a < pb->buf + pb->buflen; a = b) {
2624 pid_t p = strtoul(a, &b, 10);
2625 if (a == b)
2626 break;
2627 pl->pids[n++] = p;
2628 }
2629 return n;
2630 }
2631
2632 static void free_pidblocks(struct pidblock *pbhead)
2633 {
2634 struct pidblock *pb, *nextpb;
2635
2636 for (pb = pbhead; pb; pb = nextpb) {
2637 nextpb = pb->next;
2638 free(pb->buf);
2639 free(pb);
2640 }
2641 }
2642
2643 /* numeric comparison routine for qsort */
2644 static int numericsort(const void *m1, const void *m2)
2645 {
2646 pid_t p1 = *(pid_t *) m1;
2647 pid_t p2 = *(pid_t *) m2;
2648
2649 return p1 - p2;
2650 }
2651
2652 /* Return list of pids in cpuset 'relpath' */
2653 struct cpuset_pidlist *cpuset_init_pidlist(const char *relpath,
2654 int recursiveflag)
2655 {
2656 struct pidblock *pb = NULL;
2657 struct cpuset_pidlist *pl = NULL;
2658 struct pidblock *pbhead = NULL;
2659 int n;
2660
2661 if (check() < 0)
2662 goto err;
2663
2664 if (recursiveflag) {
2665 struct cpuset_fts_tree *cs_tree;
2666 const struct cpuset_fts_entry *cs_entry;
2667
2668 if ((cs_tree = cpuset_fts_open(relpath)) == NULL)
2669 goto err;
2670 while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL) {
2671 if (cpuset_fts_get_info(cs_entry) != CPUSET_FTS_CPUSET)
2672 continue;
2673 read_task_file(cpuset_fts_get_path(cs_entry), &pbhead);
2674 }
2675 cpuset_fts_close(cs_tree);
2676 } else {
2677 read_task_file(relpath, &pbhead);
2678 }
2679
2680 if ((pl = calloc(1, sizeof(*pl))) == NULL)
2681 goto err;
2682 pl->npids = 0;
2683 for (pb = pbhead; pb; pb = pb->next)
2684 pl->npids += pidcount(pb->buf, pb->buflen);
2685 if ((pl->pids = calloc(pl->npids, sizeof(pid_t))) == NULL)
2686 goto err;
2687 n = 0;
2688 for (pb = pbhead; pb; pb = pb->next)
2689 n = pid_transform(pb, pl, n);
2690 free_pidblocks(pbhead);
2691 qsort(pl->pids, pl->npids, sizeof(pid_t), numericsort);
2692 return pl;
2693 err:
2694 cpuset_freepidlist(pl);
2695 free_pidblocks(pbhead);
2696 return NULL;
2697 }
2698
2699 /* Return number of elements in pidlist */
2700 int cpuset_pidlist_length(const struct cpuset_pidlist *pl)
2701 {
2702 if (pl)
2703 return pl->npids;
2704 else
2705 return 0;
2706 }
2707
2708 /* Return i'th element of pidlist */
2709 pid_t cpuset_get_pidlist(const struct cpuset_pidlist *pl, int i)
2710 {
2711 if (pl && i >= 0 && i < pl->npids)
2712 return pl->pids[i];
2713 else
2714 return (pid_t) - 1;
2715 }
2716
2717 /* Free pidlist */
2718 void cpuset_freepidlist(struct cpuset_pidlist *pl)
2719 {
2720 if (pl && pl->pids)
2721 free(pl->pids);
2722 free(pl);
2723 }
2724
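/*
 * A minimal sketch of the pidlist calls above (the cpuset path
 * "/demo" is hypothetical; error handling elided):
 *
 *	struct cpuset_pidlist *pl;
 *	int i;
 *
 *	if ((pl = cpuset_init_pidlist("/demo", 1)) != NULL) {
 *		for (i = 0; i < cpuset_pidlist_length(pl); i++)
 *			printf("%d\n", (int)cpuset_get_pidlist(pl, i));
 *		cpuset_freepidlist(pl);
 *	}
 */
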
2725 static int __cpuset_move(pid_t pid, const char *path)
2726 {
2727 char buf[SMALL_BUFSZ];
2728
2729 snprintf(buf, sizeof(buf), "%u", pid);
2730 return write_string_file(path, buf);
2731 }
2732
2733 /* Move task (pid == 0 for current) to a cpuset */
2734 int cpuset_move(pid_t pid, const char *relpath)
2735 {
2736 char buf[PATH_MAX];
2737
2738 if (check() < 0)
2739 return -1;
2740
2741 if (pid == 0)
2742 pid = getpid();
2743
2744 fullpath2(buf, sizeof(buf), relpath, "tasks");
2745 return __cpuset_move(pid, buf);
2746 }
2747
2748 /* Move all tasks in pidlist to a cpuset */
2749 int cpuset_move_all(struct cpuset_pidlist *pl, const char *relpath)
2750 {
2751 int i;
2752 char buf[PATH_MAX];
2753 int ret;
2754
2755 if (check() < 0)
2756 return -1;
2757
2758 fullpath2(buf, sizeof(buf), relpath, "tasks");
2759
2760 ret = 0;
2761 for (i = 0; i < pl->npids; i++)
2762 if (__cpuset_move(pl->pids[i], buf) < 0)
2763 ret = -1;
2764 return ret;
2765 }
2766
2767 /*
2768 * [optional] cpuset_move_cpuset_tasks() - Move all tasks in a
2769 * cpuset to another cpuset
2770 *
2771 * Move all tasks in cpuset fromrelpath to cpuset torelpath. This may
2772 * race with tasks being added to or forking into fromrelpath. Loop
2773 * repeatedly, reading the tasks file of cpuset fromrelpath and writing
2774 * any task pid's found there to the tasks file of cpuset torelpath,
2775 * up to ten attempts, or until the tasks file of cpuset fromrelpath
2776 * is empty, or until fromrelpath is no longer present.
2777 *
2778 * Returns 0 with errno == 0 if able to empty the tasks file of cpuset
2779 * fromrelpath. Of course it is still possible that some independent
2780 * task could add another task to cpuset fromrelpath at the same time
2781 * that such a successful result is being returned, so there can be
2782 * no guarantee that a successful return means that fromrelpath is
2783 * still empty of tasks.
2784 *
2785 * We are careful to allow for the possibility that the cpuset
2786 * fromrelpath might disappear out from under us, perhaps because it
2787 * has notify_on_release set and gets automatically removed as soon
2788 * as we detach its last task from it. Consider a missing fromrelpath
2789 * to be a successful move.
2790 *
2791 * If called with fromrelpath and torelpath pathnames that evaluate to
2792 * the same cpuset, then treat that as if cpuset_reattach() was called,
2793 * rebinding each task in this cpuset one time, and return success or
2794 * failure depending on the return of that cpuset_reattach() call.
2795 *
2796 * On failure, returns -1, with errno possibly one of:
2797 * EACCES - search permission denied on intervening directory
2798 * ENOTEMPTY - tasks remain after multiple attempts to move them
2799 * EMFILE - too many open files
2800 * ENODEV - /dev/cpuset not mounted
2801 * ENOENT - component of cpuset path doesn't exist
2802 * ENOMEM - out of memory
2803 * ENOSYS - kernel doesn't support cpusets
2804 * ENOTDIR - component of cpuset path is not a directory
2805 * EPERM - lacked permission to kill a task
2806 * EPERM - lacked permission to read cpusets or files therein
2807 *
2808 * This is an [optional] function. Use cpuset_function to invoke it.
2809 */
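
/*
 * A minimal sketch of the cpuset_function() lookup mentioned above
 * (the cpuset paths are hypothetical; cpuset_function()'s void *
 * result is converted to the documented signature):
 *
 *	int (*move_fn)(const char *, const char *);
 *
 *	move_fn = cpuset_function("cpuset_move_cpuset_tasks");
 *	if (move_fn != NULL)
 *		(*move_fn)("/from", "/to");
 */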
2810
2811 #define NUMBER_MOVE_TASK_ATTEMPTS 10
2812
2813 int cpuset_move_cpuset_tasks(const char *fromrelpath, const char *torelpath)
2814 {
2815 char fromfullpath[PATH_MAX];
2816 char tofullpath[PATH_MAX];
2817 int i;
2818 struct cpuset_pidlist *pl = NULL;
2819 int sav_errno;
2820
2821 fullpath(fromfullpath, sizeof(fromfullpath), fromrelpath);
2822 fullpath(tofullpath, sizeof(tofullpath), torelpath);
2823
2824 if (samefile(fromfullpath, tofullpath))
2825 return cpuset_reattach(fromrelpath);
2826
2827 for (i = 0; i < NUMBER_MOVE_TASK_ATTEMPTS; i++) {
2828 int plen, j;
2829
2830 if ((pl = cpuset_init_pidlist(fromrelpath, 0)) == NULL) {
2831 /* missing cpuset is as good as if all moved */
2832 if (errno == ENOENT)
2833 goto no_more_cpuset;
2834
2835 /* other problems reading cpuset are bad news */
2836 sav_errno = errno;
2837 goto failed;
2838 }
2839
2840 if ((plen = cpuset_pidlist_length(pl)) == 0)
2841 goto no_more_pids;
2842
2843 for (j = 0; j < plen; j++) {
2844 pid_t pid;
2845
2846 pid = cpuset_get_pidlist(pl, j);
2847 if (cpuset_move(pid, torelpath) < 0) {
2848 /* missing task is as good as if moved */
2849 if (errno == ESRCH)
2850 continue;
2851
2852 /* other per-task errors are bad news */
2853 sav_errno = errno;
2854 goto failed;
2855 }
2856 }
2857
2858 cpuset_freepidlist(pl);
2859 pl = NULL;
2860 }
2861
2862 sav_errno = ENOTEMPTY;
2863 /* fall into ... */
2864 failed:
2865 cpuset_freepidlist(pl);
2866 errno = sav_errno;
2867 return -1;
2868
2869 no_more_pids:
2870 no_more_cpuset:
2871 /* Success - all tasks (or entire cpuset ;) gone. */
2872 cpuset_freepidlist(pl);
2873 errno = 0;
2874 return 0;
2875 }
2876
2877 /* Migrate task (pid == 0 for current) to a cpuset (moves task and memory) */
2878 int cpuset_migrate(pid_t pid, const char *relpath)
2879 {
2880 char buf[PATH_MAX];
2881 char buf2[PATH_MAX];
2882 char memory_migrate_flag;
2883 int r;
2884
2885 if (check() < 0)
2886 return -1;
2887
2888 if (pid == 0)
2889 pid = getpid();
2890
2891 fullpath(buf2, sizeof(buf2), relpath);
2892
2893 if (load_flag(buf2, &memory_migrate_flag, "memory_migrate") < 0)
2894 return -1;
2895 if (store_flag(buf2, "memory_migrate", 1) < 0)
2896 return -1;
2897
2898 fullpath2(buf, sizeof(buf), relpath, "tasks");
2899
2900 r = __cpuset_move(pid, buf);
2901
2902 store_flag(buf2, "memory_migrate", memory_migrate_flag);
2903 return r;
2904 }
2905
2906 /* Migrate all tasks in pidlist to a cpuset (moves task and memory) */
2907 int cpuset_migrate_all(struct cpuset_pidlist *pl, const char *relpath)
2908 {
2909 int i;
2910 char buf[PATH_MAX];
2911 char buf2[PATH_MAX];
2912 char memory_migrate_flag;
2913 int ret;
2914
2915 if (check() < 0)
2916 return -1;
2917
2918 fullpath(buf2, sizeof(buf2), relpath);
2919
2920 if (load_flag(buf2, &memory_migrate_flag, "memory_migrate") < 0)
2921 return -1;
2922 if (store_flag(buf2, "memory_migrate", 1) < 0)
2923 return -1;
2924
2925 fullpath2(buf, sizeof(buf), relpath, "tasks");
2926
2927 ret = 0;
2928 for (i = 0; i < pl->npids; i++)
2929 if (__cpuset_move(pl->pids[i], buf) < 0)
2930 ret = -1;
2931
2932 if (store_flag(buf2, "memory_migrate", memory_migrate_flag) < 0)
2933 ret = -1;
2934 return ret;
2935 }
2936
2937 /* Rebind cpus_allowed of each task in cpuset 'relpath' */
2938 int cpuset_reattach(const char *relpath)
2939 {
2940 struct cpuset_pidlist *pl;
2941 int rc;
2942
2943 if ((pl = cpuset_init_pidlist(relpath, 0)) == NULL)
2944 return -1;
2945 rc = cpuset_move_all(pl, relpath);
2946 cpuset_freepidlist(pl);
2947 return rc;
2948 }
2949
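/*
 * For example (hypothetical): in a cpuset whose 'cpus' is 4-5,8,
 * relative cpu 0 maps to system cpu 4, relative 1 to system 5, and
 * relative 2 to system 8.  The *_sys_to_rel_* routines below invert
 * that mapping.
 */
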
2950 /* Map cpuset relative cpu number to system wide cpu number */
2951 int cpuset_c_rel_to_sys_cpu(const struct cpuset *cp, int cpu)
2952 {
2953 struct cpuset *cp_tofree = NULL;
2954 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
2955 int pos = -1;
2956
2957 if (!cp1)
2958 goto err;
2959 pos = bitmask_rel_to_abs_pos(cp1->cpus, cpu);
2960 /* fall into ... */
2961 err:
2962 cpuset_free(cp_tofree);
2963 return pos;
2964 }
2965
2966 /* Map system wide cpu number to cpuset relative cpu number */
2967 int cpuset_c_sys_to_rel_cpu(const struct cpuset *cp, int cpu)
2968 {
2969 struct cpuset *cp_tofree = NULL;
2970 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
2971 int pos = -1;
2972
2973 if (!cp1)
2974 goto err;
2975 pos = bitmask_abs_to_rel_pos(cp1->cpus, cpu);
2976 /* fall into ... */
2977 err:
2978 cpuset_free(cp_tofree);
2979 return pos;
2980 }
2981
2982 /* Map cpuset relative mem number to system wide mem number */
2983 int cpuset_c_rel_to_sys_mem(const struct cpuset *cp, int mem)
2984 {
2985 struct cpuset *cp_tofree = NULL;
2986 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
2987 int pos = -1;
2988
2989 if (!cp1)
2990 goto err;
2991 pos = bitmask_rel_to_abs_pos(cp1->mems, mem);
2992 /* fall into ... */
2993 err:
2994 cpuset_free(cp_tofree);
2995 return pos;
2996 }
2997
2998 /* Map system wide mem number to cpuset relative mem number */
2999 int cpuset_c_sys_to_rel_mem(const struct cpuset *cp, int mem)
3000 {
3001 struct cpuset *cp_tofree = NULL;
3002 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
3003 int pos = -1;
3004
3005 if (!cp1)
3006 goto err;
3007 pos = bitmask_abs_to_rel_pos(cp1->mems, mem);
3008 /* fall into ... */
3009 err:
3010 cpuset_free(cp_tofree);
3011 return pos;
3012 }
3013
3014 /* Map pid's cpuset relative cpu number to system wide cpu number */
3015 int cpuset_p_rel_to_sys_cpu(pid_t pid, int cpu)
3016 {
3017 struct cpuset *cp;
3018 int rc = -1;
3019
3020 if ((cp = cpuset_alloc()) == NULL)
3021 goto done;
3022 if (cpuset_cpusetofpid(cp, pid) < 0)
3023 goto done;
3024 rc = cpuset_c_rel_to_sys_cpu(cp, cpu);
3025 done:
3026 cpuset_free(cp);
3027 return rc;
3028 }
3029
3030 /* Map system wide cpu number to pid's cpuset relative cpu number */
3031 int cpuset_p_sys_to_rel_cpu(pid_t pid, int cpu)
3032 {
3033 struct cpuset *cp;
3034 int rc = -1;
3035
3036 if ((cp = cpuset_alloc()) == NULL)
3037 goto done;
3038 if (cpuset_cpusetofpid(cp, pid) < 0)
3039 goto done;
3040 rc = cpuset_c_sys_to_rel_cpu(cp, cpu);
3041 done:
3042 cpuset_free(cp);
3043 return rc;
3044 }
3045
3046 /* Map pid's cpuset relative mem number to system wide mem number */
3047 int cpuset_p_rel_to_sys_mem(pid_t pid, int mem)
3048 {
3049 struct cpuset *cp;
3050 int rc = -1;
3051
3052 if ((cp = cpuset_alloc()) == NULL)
3053 goto done;
3054 if (cpuset_cpusetofpid(cp, pid) < 0)
3055 goto done;
3056 rc = cpuset_c_rel_to_sys_mem(cp, mem);
3057 done:
3058 cpuset_free(cp);
3059 return rc;
3060 }
3061
3062 /* Map system wide mem number to pid's cpuset relative mem number */
3063 int cpuset_p_sys_to_rel_mem(pid_t pid, int mem)
3064 {
3065 struct cpuset *cp;
3066 int rc = -1;
3067
3068 if ((cp = cpuset_alloc()) == NULL)
3069 goto done;
3070 if (cpuset_cpusetofpid(cp, pid) < 0)
3071 goto done;
3072 rc = cpuset_c_sys_to_rel_mem(cp, mem);
3073 done:
3074 cpuset_free(cp);
3075 return rc;
3076 }
3077
3078 /*
3079 * Override glibc's calls for get/set affinity - they have
3080 * something using cpu_set_t that will die when NR_CPUS > 1024.
3081 * Go directly to the 'real' system calls. Also override calls
3082 * for get_mempolicy and set_mempolicy. None of these
3083 * calls are yet (July 2004) guaranteed to be in all glibc versions
3084 * that we care about.
3085 */
3086
3087 static int sched_setaffinity(pid_t pid, unsigned len, unsigned long *mask)
3088 {
3089 return ltp_syscall(__NR_sched_setaffinity, pid, len, mask);
3090 }
3091
3092 #if HAVE_DECL_MPOL_F_ADDR && HAVE_DECL_MPOL_F_NODE
3093 static int get_mempolicy(int *policy, unsigned long *nmask,
3094 unsigned long maxnode, void *addr, int flags)
3095 {
3096 return ltp_syscall(__NR_get_mempolicy, policy, nmask, maxnode,
3097 addr, flags);
3098 }
3099 #endif
3100
3101 #if HAVE_DECL_MPOL_BIND || HAVE_DECL_MPOL_DEFAULT
3102 static int set_mempolicy(int mode, unsigned long *nmask, unsigned long maxnode)
3103 {
3104 return ltp_syscall(__NR_set_mempolicy, mode, nmask, maxnode);
3105 }
3106 #endif
3107
3108 struct cpuset_placement {
3109 struct bitmask *cpus;
3110 struct bitmask *mems;
3111 char *path;
3112 };
3113
3114 /* Allocate and fill in a placement struct - captures current placement */
3115 struct cpuset_placement *cpuset_get_placement(pid_t pid)
3116 {
3117 struct cpuset_placement *plc;
3118 struct cpuset *cp = NULL;
3119 char buf[PATH_MAX];
3120 int nbits;
3121
3122 if ((plc = calloc(1, sizeof(*plc))) == NULL)
3123 goto err;
3124
3125 nbits = cpuset_cpus_nbits();
3126 if ((plc->cpus = bitmask_alloc(nbits)) == NULL)
3127 goto err;
3128
3129 nbits = cpuset_mems_nbits();
3130 if ((plc->mems = bitmask_alloc(nbits)) == NULL)
3131 goto err;
3132
3133 if ((cp = cpuset_alloc()) == NULL)
3134 goto err;
3135 if (cpuset_getcpusetpath(pid, buf, sizeof(buf)) == NULL)
3136 goto err;
3137 if (cpuset_query(cp, buf) < 0)
3138 goto err;
3139
3140 bitmask_copy(plc->cpus, cp->cpus);
3141 bitmask_copy(plc->mems, cp->mems);
3142 plc->path = strdup(buf);
3143
3144 cpuset_free(cp);
3145 return plc;
3146 err:
3147 cpuset_free(cp);
3148 cpuset_free_placement(plc);
3149 return NULL;
3150 }
3151
3152 /* Compare two placement structs - use to detect changes in placement */
3153 int cpuset_equal_placement(const struct cpuset_placement *plc1,
3154 const struct cpuset_placement *plc2)
3155 {
3156 return bitmask_equal(plc1->cpus, plc2->cpus) &&
3157 bitmask_equal(plc1->mems, plc2->mems) &&
3158 streq(plc1->path, plc2->path);
3159 }
3160
3161 /* Free a placement struct */
3162 void cpuset_free_placement(struct cpuset_placement *plc)
3163 {
3164 if (!plc)
3165 return;
3166 bitmask_free(plc->cpus);
3167 bitmask_free(plc->mems);
3168 free(plc->path);
3169 free(plc);
3170 }
3171
3172 /*
3173 * A cpuset_fts_open() call constructs a linked list of entries
3174 * called a "cpuset_fts_tree", with one entry per cpuset below
3175 * the specified path. The cpuset_fts_read() routine returns the
3176 * next entry on this list. The various cpuset_fts_get_*() calls
3177 * return attributes of the specified entry. The cpuset_fts_close()
3178 * call frees the linked list and all associated data. All cpuset
3179 * entries and attributes for the cpuset_fts_tree returned from a
3180 * given cpuset_fts_open() call remain allocated and unchanged until
3181 * that cpuset_fts_tree is closed by a cpuset_fts_close() call. Any
3182 * subsequent changes to the cpuset filesystem will go unnoticed
3183 * (they will not affect open cpuset_fts_tree's).
3184 */
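
/*
 * A minimal walking sketch (starting at the root cpuset "/";
 * error handling elided):
 *
 *	struct cpuset_fts_tree *cs_tree;
 *	const struct cpuset_fts_entry *cs_entry;
 *
 *	if ((cs_tree = cpuset_fts_open("/")) != NULL) {
 *		while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL)
 *			if (cpuset_fts_get_info(cs_entry) == CPUSET_FTS_CPUSET)
 *				puts(cpuset_fts_get_path(cs_entry));
 *		cpuset_fts_close(cs_tree);
 *	}
 */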
3185
3186 struct cpuset_fts_entry;
3187 void cpuset_fts_rewind(struct cpuset_fts_tree *cs_tree);
3188
3189 struct cpuset_fts_tree {
3190 struct cpuset_fts_entry *head; /* head of linked entry list */
3191 struct cpuset_fts_entry *next; /* cpuset_fts_read() offset */
3192 };
3193
3194 struct cpuset_fts_entry {
3195 struct cpuset_fts_entry *next; /* linked entry list chain */
3196 struct cpuset *cpuset;
3197 struct stat *stat;
3198 char *path;
3199 int info;
3200 int err;
3201 };
3202
3203 /* Open a handle on a cpuset hierarchy. All the real work is done here. */
3204 struct cpuset_fts_tree *cpuset_fts_open(const char *cpusetpath)
3205 {
3206 FTS *fts = NULL;
3207 FTSENT *ftsent;
3208 char *path_argv[2];
3209 char buf[PATH_MAX];
3210 struct cpuset_fts_tree *cs_tree = NULL;
3211 struct cpuset_fts_entry *ep; /* the latest new list entry */
3212 struct cpuset_fts_entry **pnlep; /* ptr to next list entry ptr */
3213 char *relpath;
3214 int fts_flags;
3215
3216 fullpath(buf, sizeof(buf), cpusetpath);
3217 path_argv[0] = buf;
3218 path_argv[1] = NULL;
3219
3220 fts_flags = FTS_PHYSICAL | FTS_NOCHDIR | FTS_NOSTAT | FTS_XDEV;
3221 fts = fts_open(path_argv, fts_flags, NULL);
3222 if (fts == NULL)
3223 goto err;
3224
3225 cs_tree = malloc(sizeof(*cs_tree));
3226 if (cs_tree == NULL)
3227 goto err;
3228 pnlep = &cs_tree->head;
3229 *pnlep = NULL;
3230
3231 while ((ftsent = fts_read(fts)) != NULL) {
3232 if (ftsent->fts_info != FTS_D && ftsent->fts_info != FTS_DNR)
3233 continue;
3234
3235 /* ftsent is a directory (perhaps unreadable) ==> cpuset */
3236 ep = calloc(1, sizeof(*ep));
3237 if (ep == NULL)
3238 goto err;
3239 *pnlep = ep;
3240 pnlep = &ep->next;
3241
3242 /* Set entry's path, and if DNR, error */
3243 relpath = ftsent->fts_path + strlen(cpusetmnt);
3244 if (strlen(relpath) == 0)
3245 relpath = "/";
3246 ep->path = strdup(relpath);
3247 if (ep->path == NULL)
3248 goto err;
3249 if (ftsent->fts_info == FTS_DNR) {
3250 ep->info = CPUSET_FTS_ERR_DNR;
3251 ep->err = ftsent->fts_errno;
3252 continue;
3253 }
3254
3255 /* ftsent is a -readable- cpuset: set entry's stat, etc */
3256 ep->stat = calloc(1, sizeof(struct stat));
3257 if (ep->stat == NULL)
3258 goto err;
3259 if (stat(ftsent->fts_path, ep->stat) < 0) {
3260 ep->info = CPUSET_FTS_ERR_STAT;
3261 ep->err = ftsent->fts_errno;
3262 continue;
3263 }
3264
3265 ep->cpuset = calloc(1, sizeof(struct cpuset));
3266 if (ep->cpuset == NULL)
3267 goto err;
3268 if (cpuset_query(ep->cpuset, relpath) < 0) {
3269 ep->info = CPUSET_FTS_ERR_CPUSET;
3270 ep->err = errno;
3271 continue;
3272 }
3273 ep->info = CPUSET_FTS_CPUSET;
3274 }
3275
3276 (void)fts_close(fts);
3277 cpuset_fts_rewind(cs_tree);
3278 return cs_tree;
3279
3280 err:
3281 if (cs_tree)
3282 cpuset_fts_close(cs_tree);
3283 if (fts)
3284 (void)fts_close(fts);
3285 return NULL;
3286 }
3287
3288 /* Return pointer to next cpuset entry in hierarchy */
3289 const struct cpuset_fts_entry *cpuset_fts_read(struct cpuset_fts_tree *cs_tree)
3290 {
3291 const struct cpuset_fts_entry *cs_entry = cs_tree->next;
3292 if (cs_tree->next != NULL) /* seek to next entry */
3293 cs_tree->next = cs_tree->next->next;
3294 return cs_entry;
3295 }
3296
3297 /* Reverse list of cpusets, in place. Simulates pre-order/post-order flip. */
3298 void cpuset_fts_reverse(struct cpuset_fts_tree *cs_tree)
3299 {
3300 struct cpuset_fts_entry *cs1, *cs2, *cs3;
3301
3302 /*
3303 * At each step, cs1 < cs2 < cs3 and the cs2->next pointer
3304 * is redirected from cs3 to cs1.
3305 */
3306
3307 cs1 = cs2 = NULL;
3308 cs3 = cs_tree->head;
3309 while (cs3) {
3310 cs1 = cs2;
3311 cs2 = cs3;
3312 cs3 = cs3->next;
3313 cs2->next = cs1;
3314 }
3315 cs_tree->head = cs2;
3316 cpuset_fts_rewind(cs_tree);
3317 }
3318
3319 /* Rewind cpuset list to beginning */
3320 void cpuset_fts_rewind(struct cpuset_fts_tree *cs_tree)
3321 {
3322 cs_tree->next = cs_tree->head;
3323 }
3324
3325 /* Return pointer to nul-terminated cpuset path of entry in hierarchy */
3326 const char *cpuset_fts_get_path(const struct cpuset_fts_entry *cs_entry)
3327 {
3328 return cs_entry->path;
3329 }
3330
3331 /* Return pointer to stat(2) structure of a cpuset entry's directory */
3332 const struct stat *cpuset_fts_get_stat(const struct cpuset_fts_entry *cs_entry)
3333 {
3334 return cs_entry->stat;
3335 }
3336
3337 /* Return pointer to cpuset structure of a cpuset entry */
3338 const struct cpuset *cpuset_fts_get_cpuset(const struct cpuset_fts_entry
3339 *cs_entry)
3340 {
3341 return cs_entry->cpuset;
3342 }
3343
3344 /* Return value of errno (0 if no error) on attempted cpuset operations */
3345 int cpuset_fts_get_errno(const struct cpuset_fts_entry *cs_entry)
3346 {
3347 return cs_entry->err;
3348 }
3349
3350 /* Return entry's info code (CPUSET_FTS_CPUSET, or which operation errored) */
3351 int cpuset_fts_get_info(const struct cpuset_fts_entry *cs_entry)
3352 {
3353 return cs_entry->info;
3354 }
3355
3356 /* Close a cpuset hierarchy handle (frees all associated memory) */
3357 void cpuset_fts_close(struct cpuset_fts_tree *cs_tree)
3358 {
3359 struct cpuset_fts_entry *cs_entry = cs_tree->head;
3360
3361 while (cs_entry) {
3362 struct cpuset_fts_entry *ep = cs_entry;
3363
3364 cs_entry = cs_entry->next;
3365 free(ep->path);
3366 free(ep->stat);
3367 cpuset_free(ep->cpuset);
3368 free(ep);
3369 }
3370 free(cs_tree);
3371 }
3372
3373 /* Bind current task to cpu (uses sched_setaffinity(2)) */
3374 int cpuset_cpubind(int cpu)
3375 {
3376 struct bitmask *bmp;
3377 int r;
3378
3379 if ((bmp = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
3380 return -1;
3381 bitmask_setbit(bmp, cpu);
3382 r = sched_setaffinity(0, bitmask_nbytes(bmp), bitmask_mask(bmp));
3383 bitmask_free(bmp);
3384 return r;
3385 }
3386
3387 /*
3388 * int cpuset_latestcpu(pid_t pid)
3389 *
3390 * Return most recent CPU on which task pid executed. If pid == 0,
3391 * examine current task.
3392 *
3393 * The last used CPU is visible for a given pid as field #39 (starting
3394 * with #1) in the file /proc/pid/stat. Currently this file has 41
3395 * fields, in which case this is the 3rd to the last field.
3396 *
3397 * Unfortunately field #2 is a command name and might have embedded
3398 * whitespace. So we can't just count white space separated fields.
3399 * Fortunately, this command name is surrounded by parentheses, as
3400 * for example "(sh)", and that closing parenthesis is the last ')'
3401 * character in the line. No remaining fields can have embedded
3402 * whitespace or parentheses. So instead of looking for the 39th
3403 * white space separated field, we can look for the 37th white space
3404 * separated field past the last ')' character on the line.
3405 */
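
/*
 * For example (hypothetical stat line, most fields elided):
 *
 *	1234 (my prog) S 1 ... 3 0 0
 *
 * strrchr() finds the last ')' -- the one closing "(my prog)", even
 * though the command name contains a space -- and the sscanf() below
 * skips 36 white space separated fields past it and reads the 37th,
 * here the last-used CPU.
 */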
3406
3407 /* Return most recent CPU on which task pid executed */
3408 int cpuset_latestcpu(pid_t pid)
3409 {
3410 char buf[PATH_MAX];
3411 char *bp;
3412 int fd = -1;
3413 int cpu = -1;
3414
3415 if (pid == 0)
3416 snprintf(buf, sizeof(buf), "/proc/self/stat");
3417 else
3418 snprintf(buf, sizeof(buf), "/proc/%d/stat", pid);
3419
3420 if ((fd = open(buf, O_RDONLY)) < 0)
3421 goto err;
3422 if (read(fd, buf, sizeof(buf)) < 1)
3423 goto err;
3424 close(fd);
3425
3426 bp = strrchr(buf, ')');
3427 if (bp)
3428 sscanf(bp + 1, "%*s %*u %*u %*u %*u %*u %*u %*u " "%*u %*u %*u %*u %*u %*u %*u %*u %*u %*u " "%*u %*u %*u %*u %*u %*u %*u %*u %*u %*u " "%*u %*u %*u %*u %*u %*u %*u %*u %u", /* 37th field past ')' */
3429 &cpu);
3430 if (cpu < 0)
3431 errno = EINVAL;
3432 return cpu;
3433 err:
3434 if (fd >= 0)
3435 close(fd);
3436 return -1;
3437 }
3438
3439 /* Bind current task to memory (uses set_mempolicy(2)) */
3440 int cpuset_membind(int mem)
3441 {
3442 struct bitmask *bmp;
3443 int r;
3444
3445 if ((bmp = bitmask_alloc(cpuset_mems_nbits())) == NULL)
3446 return -1;
3447 bitmask_setbit(bmp, mem);
3448 #if HAVE_DECL_MPOL_BIND
3449 r = set_mempolicy(MPOL_BIND, bitmask_mask(bmp), bitmask_nbits(bmp) + 1);
3450 #else
3451 r = -1;
3452 errno = ENOSYS;
3453 #endif
3454 bitmask_free(bmp);
3455 return r;
3456 }
3457
3458 /* [optional] Return Memory Node holding page at specified addr */
3459 int cpuset_addr2node(void *addr)
3460 {
3461 int node = -1;
3462
3463 #if HAVE_DECL_MPOL_F_ADDR && HAVE_DECL_MPOL_F_NODE
3464 if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR)) {
3465 /* I realize this seems redundant, but I _want_ to make sure
3466 * that this value is -1. */
3467 node = -1;
3468 }
3469 #endif
3470 return node;
3471 }
3472
3473 /*
3474 * Transform cpuset into Text Format Representation in buffer 'buf',
3475 * of length 'buflen', nul-terminated if space allows. Return number
3476 * of characters that would have been written, if enough space had
3477 * been available, in the same way that snprintf() does.
3478 */
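
/*
 * A minimal sizing sketch: call once with a zero length to measure
 * (relying on snprintf()'s measure-only behavior when given a zero
 * size), then again to fill (error handling elided):
 *
 *	char *buf;
 *	int n;
 *
 *	n = cpuset_export(cp, NULL, 0);
 *	if (n >= 0 && (buf = malloc(n + 1)) != NULL)
 *		cpuset_export(cp, buf, n + 1);
 */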
3479
3480 /* Export cpuset settings to a regular file */
3481 int cpuset_export(const struct cpuset *cp, char *buf, int buflen)
3482 {
3483 char *tmp = NULL;
3484 int n = 0;
3485
3486 if (cp->cpu_exclusive)
3487 n += snprintf(buf + n, max(buflen - n, 0), "cpu_exclusive\n");
3488
3489 if (cp->mem_exclusive)
3490 n += snprintf(buf + n, max(buflen - n, 0), "mem_exclusive\n");
3491
3492 if (cp->notify_on_release)
3493 n += snprintf(buf + n, max(buflen - n, 0),
3494 "notify_on_release\n");
3495
3496 if (cp->memory_pressure_enabled)
3497 n += snprintf(buf + n, max(buflen - n, 0),
3498 "memory_pressure_enabled\n");
3499
3500 if (cp->memory_migrate)
3501 n += snprintf(buf + n, max(buflen - n, 0), "memory_migrate\n");
3502
3503 if (cp->memory_spread_page)
3504 n += snprintf(buf + n, max(buflen - n, 0),
3505 "memory_spread_page\n");
3506
3507 if (cp->memory_spread_slab)
3508 n += snprintf(buf + n, max(buflen - n, 0),
3509 "memory_spread_slab\n");
3510
3511 if ((tmp = sprint_mask_buf(cp->cpus)) == NULL)
3512 return -1;
3513 n += snprintf(buf + n, max(buflen - n, 0), "cpus %s\n", tmp);
3514 free(tmp);
3515 tmp = NULL;
3516
3517 if ((tmp = sprint_mask_buf(cp->mems)) == NULL)
3518 return -1;
3519 n += snprintf(buf + n, max(buflen - n, 0), "mems %s\n", tmp);
3520 free(tmp);
3521 tmp = NULL;
3522
3523 return n;
3524 }
3525
3526 static int import_list(UNUSED const char *tok, const char *arg,
3527 struct bitmask *bmp, char *emsg, int elen)
3528 {
3529 if (bitmask_parselist(arg, bmp) < 0) {
3530 if (emsg)
3531 snprintf(emsg, elen, "Invalid list format: %s", arg);
3532 return -1;
3533 }
3534 return 0;
3535 }
3536
3537 static void stolower(char *s)
3538 {
3539 while (*s) {
3540 unsigned char c = *s;
3541 *s = tolower(c);
3542 s++;
3543 }
3544 }
3545
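/*
 * A sample of the text format accepted below (values hypothetical;
 * '#' starts a comment, and 'cpu' and 'mem' are accepted as synonyms
 * for 'cpus' and 'mems'):
 *
 *	cpus 0-3,8
 *	mems 0-1
 *	cpu_exclusive		# flags appear alone on a line
 *	memory_spread_page
 */
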
3546 /* Import cpuset settings from a regular file */
3547 int cpuset_import(struct cpuset *cp, const char *buf, int *elinenum,
3548 char *emsg, int elen)
3549 {
3550 char *linebuf = NULL;
3551 int linebuflen;
3552 int linenum = 0;
3553 int offset = 0;
3554
3555 linebuflen = strlen(buf) + 1;
3556 if ((linebuf = malloc(linebuflen)) == NULL) {
3557 if (emsg)
3558 snprintf(emsg, elen, "Insufficient memory");
3559 goto err;
3560 }
3561
3562 while (slgets(linebuf, linebuflen, buf, &offset)) {
3563 char *tok, *arg;
3564 char *ptr; /* for strtok_r */
3565
3566 linenum++;
3567 if ((tok = strchr(linebuf, '#')) != NULL)
3568 *tok = 0;
3569 if ((tok = strtok_r(linebuf, " \t", &ptr)) == NULL)
3570 continue;
3571 stolower(tok);
3572
3573 arg = strtok_r(0, " \t", &ptr);
3574
3575 if (streq(tok, "cpu_exclusive")) {
3576 cp->cpu_exclusive = 1;
3577 goto eol;
3578 }
3579 if (streq(tok, "mem_exclusive")) {
3580 cp->mem_exclusive = 1;
3581 goto eol;
3582 }
3583 if (streq(tok, "notify_on_release")) {
3584 cp->notify_on_release = 1;
3585 goto eol;
3586 }
3587 if (streq(tok, "memory_pressure_enabled")) {
3588 cp->memory_pressure_enabled = 1;
3589 goto eol;
3590 }
3591 if (streq(tok, "memory_migrate")) {
3592 cp->memory_migrate = 1;
3593 goto eol;
3594 }
3595 if (streq(tok, "memory_spread_page")) {
3596 cp->memory_spread_page = 1;
3597 goto eol;
3598 }
3599 if (streq(tok, "memory_spread_slab")) {
3600 cp->memory_spread_slab = 1;
3601 goto eol;
3602 }
3603 if (streq(tok, "cpu") || streq(tok, "cpus")) {
3604 if (import_list(tok, arg, cp->cpus, emsg, elen) < 0)
3605 goto err;
3606 goto eol;
3607 }
3608 if (streq(tok, "mem") || streq(tok, "mems")) {
3609 if (import_list(tok, arg, cp->mems, emsg, elen) < 0)
3610 goto err;
3611 goto eol;
3612 }
3613 if (emsg)
3614 snprintf(emsg, elen, "Unrecognized token: '%s'", tok);
3615 goto err;
3616 eol:
3617 if ((tok = strtok_r(0, " \t", &ptr)) != NULL) {
3618 if (emsg)
3619 snprintf(emsg, elen, "Surplus token: '%s'",
3620 tok);
3621 goto err;
3622 }
3623 continue;
3624 }
3625
3626 free(linebuf);
3627
3628 if (bitmask_isallclear(cp->cpus) && !bitmask_isallclear(cp->mems))
3629 cpuset_localcpus(cp->mems, cp->cpus);
3630 else if (!bitmask_isallclear(cp->cpus) && bitmask_isallclear(cp->mems))
3631 cpuset_localmems(cp->cpus, cp->mems);
3632
3633 /*
3634 * All cpuset attributes are determined in an import.
3635 * Those that aren't explicitly specified are presumed
3636 * to be unchanged (zero, if it's a freshly allocated
3637 * struct cpuset.)
3638 */
3639
3640 cp->cpus_valid = 1;
3641 cp->mems_valid = 1;
3642 cp->cpu_exclusive_valid = 1;
3643 cp->mem_exclusive_valid = 1;
3644 cp->notify_on_release_valid = 1;
3645 cp->memory_migrate_valid = 1;
3646 cp->memory_pressure_enabled_valid = 1;
3647 cp->memory_spread_page_valid = 1;
3648 cp->memory_spread_slab_valid = 1;
3649
3650 return 0;
3651 err:
3652 if (elinenum)
3653 *elinenum = linenum;
3654 free(linebuf);
3655 return -1;
3656 }
3657
3658 /* Pin current task CPU (and memory) */
3659 int cpuset_pin(int relcpu)
3660 {
3661 struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
3662 int cpu, r;
3663
3664 if (check() < 0)
3665 return -1;
3666
3667 do {
3668 cpuset_free_placement(plc1);
3669 plc1 = cpuset_get_placement(0);
3670
3671 r = 0;
3672 if (cpuset_unpin() < 0)
3673 r = -1;
3674 cpu = cpuset_p_rel_to_sys_cpu(0, relcpu);
3675 if (cpuset_cpubind(cpu) < 0)
3676 r = -1;
3677
3678 cpuset_free_placement(plc2);
3679 plc2 = cpuset_get_placement(0);
3680 } while (!cpuset_equal_placement(plc1, plc2));
3681
3682 cpuset_free_placement(plc1);
3683 cpuset_free_placement(plc2);
3684 return r;
3685 }
3686
3687 /* Return number of CPUs in current task's cpuset */
3688 int cpuset_size()
3689 {
3690 struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
3691 int r;
3692
3693 if (check() < 0)
3694 return -1;
3695
3696 do {
3697 cpuset_free_placement(plc1);
3698 plc1 = cpuset_get_placement(0);
3699
3700 r = cpuset_cpus_weight(0);
3701
3702 cpuset_free_placement(plc2);
3703 plc2 = cpuset_get_placement(0);
3704 } while (!cpuset_equal_placement(plc1, plc2));
3705
3706 cpuset_free_placement(plc1);
3707 cpuset_free_placement(plc2);
3708 return r;
3709 }
3710
3711 /* Return relative CPU number, within current cpuset, last executed on */
3712 int cpuset_where()
3713 {
3714 struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
3715 int r;
3716
3717 if (check() < 0)
3718 return -1;
3719
3720 do {
3721 cpuset_free_placement(plc1);
3722 plc1 = cpuset_get_placement(0);
3723
3724 r = cpuset_p_sys_to_rel_cpu(0, cpuset_latestcpu(0));
3725
3726 cpuset_free_placement(plc2);
3727 plc2 = cpuset_get_placement(0);
3728 } while (!cpuset_equal_placement(plc1, plc2));
3729
3730 cpuset_free_placement(plc1);
3731 cpuset_free_placement(plc2);
3732 return r;
3733 }
3734
3735 /* Undo cpuset_pin - let current task have the run of all CPUs in its cpuset */
3736 int cpuset_unpin()
3737 {
3738 struct bitmask *cpus = NULL, *mems = NULL;
3739 int r = -1;
3740
3741 if (check() < 0)
3742 goto err;
3743
3744 /*
3745 	 * We don't need the cpuset_*_placement() guard against concurrent
3746 	 * cpuset migration, because none of the following depends
3747 	 * on the task's cpuset placement.
3748 */
3749
3750 if ((cpus = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
3751 goto err;
3752 bitmask_setall(cpus);
3753 if (sched_setaffinity(0, bitmask_nbytes(cpus), bitmask_mask(cpus)) < 0)
3754 goto err;
3755
3756 if ((mems = bitmask_alloc(cpuset_mems_nbits())) == NULL)
3757 goto err;
3758 #if HAVE_DECL_MPOL_DEFAULT
3759 if (set_mempolicy(MPOL_DEFAULT, bitmask_mask(mems),
3760 bitmask_nbits(mems) + 1) < 0)
3761 goto err;
3762 r = 0;
3763 #endif
3764 /* fall into ... */
3765 err:
3766 bitmask_free(cpus);
3767 bitmask_free(mems);
3768 return r;
3770 }

struct cpuset_function_list {
	const char *fname;
	void *func;
} flist[] = {
	{ "cpuset_version", cpuset_version },
	{ "cpuset_alloc", cpuset_alloc },
	{ "cpuset_free", cpuset_free },
	{ "cpuset_cpus_nbits", cpuset_cpus_nbits },
	{ "cpuset_mems_nbits", cpuset_mems_nbits },
	{ "cpuset_setcpus", cpuset_setcpus },
	{ "cpuset_setmems", cpuset_setmems },
	{ "cpuset_set_iopt", cpuset_set_iopt },
	{ "cpuset_set_sopt", cpuset_set_sopt },
	{ "cpuset_getcpus", cpuset_getcpus },
	{ "cpuset_getmems", cpuset_getmems },
	{ "cpuset_cpus_weight", cpuset_cpus_weight },
	{ "cpuset_mems_weight", cpuset_mems_weight },
	{ "cpuset_get_iopt", cpuset_get_iopt },
	{ "cpuset_get_sopt", cpuset_get_sopt },
	{ "cpuset_localcpus", cpuset_localcpus },
	{ "cpuset_localmems", cpuset_localmems },
	{ "cpuset_cpumemdist", cpuset_cpumemdist },
	{ "cpuset_cpu2node", cpuset_cpu2node },
	{ "cpuset_addr2node", cpuset_addr2node },
	{ "cpuset_create", cpuset_create },
	{ "cpuset_delete", cpuset_delete },
	{ "cpuset_query", cpuset_query },
	{ "cpuset_modify", cpuset_modify },
	{ "cpuset_getcpusetpath", cpuset_getcpusetpath },
	{ "cpuset_cpusetofpid", cpuset_cpusetofpid },
	{ "cpuset_mountpoint", cpuset_mountpoint },
	{ "cpuset_collides_exclusive", cpuset_collides_exclusive },
	{ "cpuset_nuke", cpuset_nuke },
	{ "cpuset_init_pidlist", cpuset_init_pidlist },
	{ "cpuset_pidlist_length", cpuset_pidlist_length },
	{ "cpuset_get_pidlist", cpuset_get_pidlist },
	{ "cpuset_freepidlist", cpuset_freepidlist },
	{ "cpuset_move", cpuset_move },
	{ "cpuset_move_all", cpuset_move_all },
	{ "cpuset_move_cpuset_tasks", cpuset_move_cpuset_tasks },
	{ "cpuset_migrate", cpuset_migrate },
	{ "cpuset_migrate_all", cpuset_migrate_all },
	{ "cpuset_reattach", cpuset_reattach },
	{ "cpuset_open_memory_pressure", cpuset_open_memory_pressure },
	{ "cpuset_read_memory_pressure", cpuset_read_memory_pressure },
	{ "cpuset_close_memory_pressure", cpuset_close_memory_pressure },
	{ "cpuset_c_rel_to_sys_cpu", cpuset_c_rel_to_sys_cpu },
	{ "cpuset_c_sys_to_rel_cpu", cpuset_c_sys_to_rel_cpu },
	{ "cpuset_c_rel_to_sys_mem", cpuset_c_rel_to_sys_mem },
	{ "cpuset_c_sys_to_rel_mem", cpuset_c_sys_to_rel_mem },
	{ "cpuset_p_rel_to_sys_cpu", cpuset_p_rel_to_sys_cpu },
	{ "cpuset_p_sys_to_rel_cpu", cpuset_p_sys_to_rel_cpu },
	{ "cpuset_p_rel_to_sys_mem", cpuset_p_rel_to_sys_mem },
	{ "cpuset_p_sys_to_rel_mem", cpuset_p_sys_to_rel_mem },
	{ "cpuset_get_placement", cpuset_get_placement },
	{ "cpuset_equal_placement", cpuset_equal_placement },
	{ "cpuset_free_placement", cpuset_free_placement },
	{ "cpuset_fts_open", cpuset_fts_open },
	{ "cpuset_fts_read", cpuset_fts_read },
	{ "cpuset_fts_reverse", cpuset_fts_reverse },
	{ "cpuset_fts_rewind", cpuset_fts_rewind },
	{ "cpuset_fts_get_path", cpuset_fts_get_path },
	{ "cpuset_fts_get_stat", cpuset_fts_get_stat },
	{ "cpuset_fts_get_cpuset", cpuset_fts_get_cpuset },
	{ "cpuset_fts_get_errno", cpuset_fts_get_errno },
	{ "cpuset_fts_get_info", cpuset_fts_get_info },
	{ "cpuset_fts_close", cpuset_fts_close },
	{ "cpuset_cpubind", cpuset_cpubind },
	{ "cpuset_latestcpu", cpuset_latestcpu },
	{ "cpuset_membind", cpuset_membind },
	{ "cpuset_export", cpuset_export },
	{ "cpuset_import", cpuset_import },
	{ "cpuset_function", cpuset_function },
	{ "cpuset_pin", cpuset_pin },
	{ "cpuset_size", cpuset_size },
	{ "cpuset_where", cpuset_where },
	{ "cpuset_unpin", cpuset_unpin },
};

/* Return pointer to a libcpuset.so function, or NULL */
void *cpuset_function(const char *function_name)
{
	unsigned int i;

	for (i = 0; i < sizeof(flist) / sizeof(flist[0]); i++)
		if (streq(function_name, flist[i].fname))
			return flist[i].func;
	return NULL;
}
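
/*
 * Example (a sketch, not part of the library): look up an entry point
 * by name at run time, so an application can probe for functions that
 * may be missing from an older library version:
 *
 *	int (*verfn)(void) =
 *		(int (*)(void))cpuset_function("cpuset_version");
 *
 *	if (verfn)
 *		printf("libcpuset API version %d\n", verfn());
 */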

/*
 * Fortran interface to basic cpuset routines.  Most Fortran compilers
 * append a trailing underscore to external names and pass arguments
 * by reference, hence these thin wrappers.
 */
int cpuset_pin_(int *ptr_relcpu)
{
	return cpuset_pin(*ptr_relcpu);
}

int cpuset_size_(void)
{
	return cpuset_size();
}

int cpuset_where_(void)
{
	return cpuset_where();
}

int cpuset_unpin_(void)
{
	return cpuset_unpin();
}

#endif /* HAVE_LINUX_MEMPOLICY_H */