1 /*
2 * cpuset user library implementation.
3 *
4 * Copyright (c) 2006-2007 Silicon Graphics, Inc. All rights reserved.
5 *
6 * Paul Jackson <pj@sgi.com>
7 */
8
9 /*
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU Lesser General Public License as published by
12 * the Free Software Foundation; either version 2.1 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 */
24
25 #define _XOPEN_SOURCE 500 /* need to see pread() */
26 #define _BSD_SOURCE 1 /* need to see syscall() */
27 #include <unistd.h>
28
29 #include <ctype.h>
30 #include <dirent.h>
31 #include <errno.h>
32 #include <fcntl.h>
33 #include <fts.h>
34 #include <limits.h>
35 #include <signal.h>
36 #include <stdint.h>
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <sys/stat.h>
41 #include <sys/syscall.h>
42 #include <sys/types.h>
43 #include <time.h>
44 #include <utime.h>
45 #include <sys/utsname.h> /* for cpuset_would_crash_kernel() */
46
47 #include "bitmask.h"
48 #include "cpuset.h"
49 #include "common.h"
50 #include "test.h"
51 #include "lapi/syscalls.h"
52 #include "config.h"
53
54 #if HAVE_LINUX_MEMPOLICY_H
55 #include <linux/mempolicy.h>
56
57 /* Bump version, and update Change History, when libcpuset API changes */
58 #define CPUSET_VERSION 3
59
60 /*
61 * For a history of what changed in each version, see the "Change
62 * History" section, at the end of the libcpuset master document.
63 */
64
int cpuset_version(void)
{
	/* See "Change History" in the libcpuset master document. */
	return CPUSET_VERSION;
}
69
struct cpuset {
	struct bitmask *cpus;	/* CPUs allowed to tasks in cpuset */
	struct bitmask *mems;	/* Memory Nodes allowed to tasks in cpuset */
	char cpu_exclusive;	/* CPUs not shared with sibling cpusets */
	char mem_exclusive;	/* Mems not shared with sibling cpusets */
	char mem_hardwall;	/* restrict kernel allocations to mems */
	char notify_on_release;	/* run release agent when cpuset empties */
	char memory_migrate;	/* migrate pages when mems changes */
	char memory_pressure_enabled;	/* collect memory_pressure (root only) */
	char memory_spread_page;	/* spread page cache over mems */
	char memory_spread_slab;	/* spread slab caches over mems */
	char sched_load_balance;	/* kernel load balances across cpus */
	int sched_relax_domain_level;	/* search range for migrating tasks */

	/*
	 * Each field 'x' above gets an 'x_valid' field below.
	 * The apply_cpuset_settings() will only set those fields whose
	 * corresponding *_valid flags are set. The cpuset_alloc()
	 * routine clears these flags as part of the clear in calloc(),
	 * and the various cpuset_set*() routines set these flags when
	 * setting the corresponding value.
	 *
	 * The purpose of these valid fields is to ensure that when
	 * we create a new cpuset, we don't accidentally overwrite
	 * some non-zero kernel default, such as an inherited
	 * memory_spread_* flag, just because the user application
	 * code didn't override the default zero settings resulting
	 * from the calloc() call in cpuset_alloc().
	 *
	 * The choice of 'char' for the type of the flags above,
	 * but a bitfield for the flags below, is somewhat capricious.
	 */
	unsigned cpus_valid:1;
	unsigned mems_valid:1;
	unsigned cpu_exclusive_valid:1;
	unsigned mem_exclusive_valid:1;
	unsigned mem_hardwall_valid:1;
	unsigned notify_on_release_valid:1;
	unsigned memory_migrate_valid:1;
	unsigned memory_pressure_enabled_valid:1;
	unsigned memory_spread_page_valid:1;
	unsigned memory_spread_slab_valid:1;
	unsigned sched_load_balance_valid:1;
	unsigned sched_relax_domain_level_valid:1;

	/*
	 * if the relative variable was modified, use following flags
	 * to put a mark
	 */
	unsigned cpus_dirty:1;
	unsigned mems_dirty:1;
	unsigned cpu_exclusive_dirty:1;
	unsigned mem_exclusive_dirty:1;
	unsigned mem_hardwall_dirty:1;
	unsigned notify_on_release_dirty:1;
	unsigned memory_migrate_dirty:1;
	unsigned memory_pressure_enabled_dirty:1;
	unsigned memory_spread_page_dirty:1;
	unsigned memory_spread_slab_dirty:1;
	unsigned sched_load_balance_dirty:1;
	unsigned sched_relax_domain_level_dirty:1;
};
132
133 /* Presumed cpuset file system mount point */
134 static const char *cpusetmnt = "/dev/cpuset";
135
136 /* Stashed copy of cpunodemap[], mapping each cpu to its node. */
137 static const char *mapfile = "/var/run/cpunodemap";
138
139 /* The primary source for the cpunodemap[] is available below here. */
140 static const char *sysdevices = "/sys/devices/system";
141
142 /* small buffer size - for reading boolean flags or map file (1 or 2 ints) */
143 #define SMALL_BUFSZ 16
144
145 /*
 * The 'mask_size_file' is used to ferret out the kernel cpumask_t
147 * and nodemask_t sizes. The lines in this file that begin with the
148 * strings 'cpumask_prefix' and 'nodemask_prefix' display a cpumask
149 * and nodemask string, respectively. The lengths of these strings
150 * reflect the kernel's internal cpumask_t and nodemask_t sizes,
151 * which sizes are needed to correctly call the sched_setaffinity
152 * and set_mempolicy system calls, and to size user level
153 * bitmasks to match the kernels.
154 */
155
156 static const char *mask_size_file = "/proc/self/status";
157 static const char *cpumask_prefix = "Cpus_allowed:\t";
158 static const char *nodemask_prefix = "Mems_allowed:\t";
159
160 /*
161 * Sizes of kernel cpumask_t and nodemask_t bitmaps, in bits.
162 *
163 * The first time we need these, we parse the Cpus_allowed and
164 * Mems_allowed lines from mask_size_file ("/proc/self/status").
165 */
166
167 static int cpumask_sz;
168 static int nodemask_sz;
169
170 /*
171 * These defaults only kick in if we fail to size the kernel
172 * cpumask and nodemask by reading the Cpus_allowed and
173 * Mems_allowed fields from the /proc/self/status file.
174 */
175
176 #define DEFCPUBITS (512)
177 #define DEFNODEBITS (DEFCPUBITS/2)
178
179 /*
180 * Arch-neutral API for obtaining NUMA distances between CPUs
181 * and Memory Nodes, via the files:
182 * /sys/devices/system/node/nodeN/distance
183 * which have lines such as:
184 * 46 66 10 20
185 * which say that for cpu on node N (from the path above), the
 * distance to nodes 0, 1, 2, and 3 are 46, 66, 10, and 20,
187 * respectively.
188 */
189
190 static const char *distance_directory = "/sys/devices/system/node";
191
192 /*
193 * Someday, we should disable, then later discard, the SN code
194 * marked ALTERNATE_SN_DISTMAP.
195 */
196
197 #define ALTERNATE_SN_DISTMAP 1
198 #ifdef ALTERNATE_SN_DISTMAP
199
200 /*
201 * Alternative SN (SGI ia64) architecture specific API for obtaining
202 * NUMA distances between CPUs and Memory Nodes is via the file
203 * /proc/sgi_sn/sn_topology, which has lines such as:
204 *
205 * node 2 001c14#0 local asic SHub_1.1, nasid 0x4, dist 46:66:10:20
206 *
207 * which says that for each CPU on node 2, the distance to nodes
208 * 0, 1, 2 and 3 are 46, 66, 10 and 20, respectively.
209 *
210 * This file has other lines as well, which start with other
211 * keywords than "node". Ignore these other lines.
212 */
213
214 static const char *sn_topology = "/proc/sgi_sn/sn_topology";
215 static const char *sn_top_node_prefix = "node ";
216
217 #endif
218
219 /*
220 * Check that cpusets supported, /dev/cpuset mounted.
221 * If ok, return 0.
222 * If not, return -1 and set errno:
223 * ENOSYS - kernel doesn't support cpusets
224 * ENODEV - /dev/cpuset not mounted
225 */
226
/* Result of the one-time support probe, cached across calls. */
static enum {
	check_notdone,
	check_enosys,
	check_enodev,
	check_ok
} check_state = check_notdone;

/*
 * Probe once (result cached in check_state) whether cpusets are
 * supported and mounted; see the ENOSYS/ENODEV contract above.
 * Fixed: '()' declared an unprototyped function; use '(void)'.
 */
static int check(void)
{
	if (check_state == check_notdone) {
		struct stat statbuf;

		/* No /proc/self/cpuset => kernel lacks cpuset support. */
		if (stat("/proc/self/cpuset", &statbuf) < 0) {
			check_state = check_enosys;
			goto done;
		}

		/* No /dev/cpuset/tasks => cpuset fs not mounted. */
		if (stat("/dev/cpuset/tasks", &statbuf) < 0) {
			check_state = check_enodev;
			goto done;
		}

		check_state = check_ok;
	}
done:
	switch (check_state) {
	case check_enosys:
		errno = ENOSYS;
		return -1;
	case check_enodev:
		errno = ENODEV;
		return -1;
	default:
		break;
	}
	return 0;
}
264
/* Strip any trailing newline / carriage-return characters from s. */
static void chomp(char *s)
{
	size_t len = strlen(s);

	while (len > 0 && (s[len - 1] == '\n' || s[len - 1] == '\r'))
		s[--len] = '\0';
}
276
277 /*
278 * Determine number of bytes in a seekable open file, without
279 * assuming that stat(2) on that file has a useful size.
 * Has the side effect of leaving the file rewound to the beginning.
281 */
static int filesize(FILE * fp)
{
	int nbytes;

	rewind(fp);
	for (nbytes = 0; fgetc(fp) != EOF; nbytes++)
		continue;	/* counting, one byte per fgetc() */
	rewind(fp);
	return nbytes;
}
291
292 /* Are strings s1 and s2 equal? */
/* Are strings s1 and s2 equal? */
static int streq(const char *s1, const char *s2)
{
	return !strcmp(s1, s2);
}
297
298 /* Is string 'pre' a prefix of string 's'? */
/* Is string 'pre' a prefix of string 's'? */
static int strprefix(const char *s, const char *pre)
{
	size_t prelen = strlen(pre);

	return strncmp(s, pre, prelen) == 0;
}
303
304 /*
305 * char *flgets(char *buf, int buflen, FILE *fp)
306 *
307 * Obtain one line from input file fp. Copy up to first
308 * buflen-1 chars of line into buffer buf, discarding any remainder
309 * of line. Stop reading at newline, discarding newline.
310 * Nul terminate result and return pointer to buffer buf
311 * on success, or NULL if nothing more to read or failure.
312 */
313
static char *flgets(char *buf, int buflen, FILE * fp)
{
	char *dst = buf;
	int c = -1;

	/* Copy at most buflen-1 chars, stopping at newline or EOF. */
	while (--buflen > 0) {
		c = getc(fp);
		if (c < 0)
			break;
		if (c == '\n')
			goto done;
		*dst++ = c;
	}

	/* EOF before anything was copied: nothing more to read. */
	if (c < 0 && dst == buf)
		return NULL;

	/* Line longer than buffer: discard through its newline. */
	if (c > 0)
		while ((c = getc(fp)) >= 0 && c != '\n')
			continue;

done:
	*dst = '\0';
	return buf;
}
339
340 /*
341 * sgetc(const char *inputbuf, int *offsetptr)
342 *
343 * Return next char from nul-terminated input buffer inputbuf,
344 * starting at offset *offsetptr. Increment *offsetptr.
345 * If next char would be nul ('\0'), return EOF and don't
346 * increment *offsetptr.
347 */
348
static int sgetc(const char *inputbuf, int *offsetptr)
{
	char ch = inputbuf[*offsetptr];

	if (ch == '\0')
		return EOF;	/* offset deliberately left unchanged */
	(*offsetptr)++;
	return ch;
}
360
361 /*
362 * char *slgets(char *buf, int buflen, const char *inputbuf, int *offsetptr)
363 *
364 * Obtain next line from nul-terminated input buffer 'inputbuf',
365 * starting at offset *offsetptr. Copy up to first buflen-1
366 * chars of line into output buffer buf, discarding any remainder
367 * of line. Stop reading at newline, discarding newline.
368 * Nul terminate result and return pointer to output buffer
369 * buf on success, or NULL if nothing more to read.
370 */
371
static char *slgets(char *buf, int buflen, const char *inputbuf, int *offsetptr)
{
	char *dst = buf;
	int c = -1;

	/* Copy at most buflen-1 chars, stopping at newline or end. */
	while (--buflen > 0) {
		c = sgetc(inputbuf, offsetptr);
		if (c < 0)
			break;
		if (c == '\n')
			goto done;
		*dst++ = c;
	}

	/* End of input before anything was copied: nothing to read. */
	if (c < 0 && dst == buf)
		return NULL;

	/* Line longer than buffer: skip through its newline. */
	if (c > 0)
		while ((c = sgetc(inputbuf, offsetptr)) >= 0 && c != '\n')
			continue;

done:
	*dst = '\0';
	return buf;
}
397
398 /*
399 * time_t get_mtime(char *path)
400 *
401 * Return modtime of file at location path, else return 0.
402 */
403
static time_t get_mtime(const char *path)
{
	struct stat sb;

	/* A path we cannot stat reports modtime 0. */
	return stat(path, &sb) == 0 ? sb.st_mtime : 0;
}
412
413 /*
414 * int set_mtime(const char *path, time_t mtime)
415 *
416 * Set modtime of file 'path' to 'mtime'. Return 0 on success,
417 * or -1 on error, setting errno.
418 */
419
static int set_mtime(const char *path, time_t mtime)
{
	struct utimbuf times;

	times.actime = mtime;
	times.modtime = mtime;
	/* was corrupted to "utime(path, ×)" - restore &times */
	return utime(path, &times);
}
428
429 /*
430 * True if two pathnames resolve to same file.
431 * False if either path can not be stat'd,
432 * or if the two paths resolve to a different file.
433 */
434
static int samefile(const char *path1, const char *path2)
{
	struct stat sb1, sb2;

	/* Either path unstat'able => not the same file. */
	if (stat(path1, &sb1) != 0 || stat(path2, &sb2) != 0)
		return 0;
	/* Same inode on the same device means same file. */
	return sb1.st_ino == sb2.st_ino && sb1.st_dev == sb2.st_dev;
}
445
#define slash(c) (*(c) == '/')
#define eocomp(c) (slash(c) || !*(c))
#define dot1(c) (*(c) == '.' && eocomp(c+1))

/* In place path compression. Remove extra dots and slashes. */
static char *pathcomp(char *p)
{
	char *a = p;	/* read cursor */
	char *b = p;	/* write cursor (b <= a, so in-place is safe) */

	if (!p || !*p)
		return p;
	/* Keep a single leading slash for absolute paths. */
	if (slash(p))
		*b++ = *a++;
	for (;;) {
		/* Collapse any run of slashes to a single separator. */
		if (slash(a))
			while (slash(++a))
				continue;
		if (!*a) {
			/* Whole path compressed away: result is ".". */
			if (b == p)
				*b++ = '.';
			*b = '\0';
			return (p);
		} else if (dot1(a)) {
			/* Drop a "." component. Note: ".." is kept. */
			a++;
		} else {
			/* Copy one component, preceded by one slash. */
			if ((b != p) && !slash(b - 1))
				*b++ = '/';
			while (!eocomp(a))
				*b++ = *a++;
		}
	}
}

#undef slash
#undef eocomp
#undef dot1
483
484 /*
485 * pathcat2(buf, buflen, name1, name2)
486 *
487 * Return buf, of length buflen, with name1/name2 stored in it.
488 */
489
static char *pathcat2(char *buf, int buflen, const char *name1,
		      const char *name2)
{
	snprintf(buf, buflen, "%s/%s", name1, name2);
	return pathcomp(buf);	/* hand back the compressed result */
}
496
497 /*
498 * pathcat3(buf, buflen, name1, name2, name3)
499 *
500 * Return buf, of length buflen, with name1/name2/name3 stored in it.
501 */
502
static char *pathcat3(char *buf, int buflen, const char *name1,
		      const char *name2, const char *name3)
{
	snprintf(buf, buflen, "%s/%s/%s", name1, name2, name3);
	return pathcomp(buf);	/* hand back the compressed result */
}
509
510 /*
511 * fullpath(buf, buflen, name)
512 *
513 * Put full path of cpuset 'name' in buffer 'buf'. If name
514 * starts with a slash (``/``) character, then this a path
515 * relative to ``/dev/cpuset``, otherwise it is relative to
516 * the current tasks cpuset. Return 0 on success, else
517 * -1 on error, setting errno.
518 */
519
fullpath(char * buf,int buflen,const char * name)520 static int fullpath(char *buf, int buflen, const char *name)
521 {
522 int len;
523
524 /* easy case */
525 if (*name == '/') {
526 pathcat2(buf, buflen, cpusetmnt, name);
527 pathcomp(buf);
528 return 0;
529 }
530
531 /* hard case */
532 snprintf(buf, buflen, "%s/", cpusetmnt);
533 len = strlen(buf);
534 if (cpuset_getcpusetpath(0, buf + len, buflen - len) == NULL)
535 return -1;
536 if (strlen(buf) >= buflen - 1 - strlen(name)) {
537 errno = E2BIG;
538 return -1;
539 }
540 strcat(buf, "/");
541 strcat(buf, name);
542 pathcomp(buf);
543 return 0;
544 }
545
546 /*
547 * fullpath2(buf, buflen, name1, name2)
548 *
549 * Like fullpath(), only concatenate two pathname components on end.
550 */
551
static int fullpath2(char *buf, int buflen, const char *name1,
		     const char *name2)
{
	if (fullpath(buf, buflen, name1) < 0)
		return -1;
	/*
	 * Unsigned-safe bound check (the old size_t arithmetic could
	 * wrap when strlen(name2) >= buflen and skip the check).
	 */
	if (strlen(buf) + 1 + strlen(name2) >= (size_t)buflen) {
		errno = E2BIG;
		return -1;
	}
	strcat(buf, "/");
	strcat(buf, name2);
	pathcomp(buf);
	return 0;
}
566
567 /*
568 * Convert the string length of an ascii hex mask to the number
569 * of bits represented by that mask.
570 *
571 * The cpumask and nodemask values in /proc/self/status are in an
572 * ascii format that uses 9 characters for each 32 bits of mask.
573 */
static int s2nbits(const char *s)
{
	/* 9 ascii chars ("xxxxxxxx," etc.) encode 32 bits of mask */
	size_t maskchars = strlen(s);

	return maskchars * 32 / 9;
}
578
update_mask_sizes()579 static void update_mask_sizes()
580 {
581 FILE *fp = NULL;
582 char *buf = NULL;
583 int fsize;
584
585 if ((fp = fopen(mask_size_file, "r")) == NULL)
586 goto done;
587 fsize = filesize(fp);
588 if ((buf = malloc(fsize)) == NULL)
589 goto done;
590
591 /*
592 * Beware: mask sizing arithmetic is fussy.
593 * The trailing newline left by fgets() is required.
594 */
595 while (fgets(buf, fsize, fp)) {
596 if (strprefix(buf, cpumask_prefix))
597 cpumask_sz = s2nbits(buf + strlen(cpumask_prefix));
598 if (strprefix(buf, nodemask_prefix))
599 nodemask_sz = s2nbits(buf + strlen(nodemask_prefix));
600 }
601 done:
602 free(buf);
603 if (fp != NULL)
604 fclose(fp);
605 if (cpumask_sz == 0)
606 cpumask_sz = DEFCPUBITS;
607 if (nodemask_sz == 0)
608 nodemask_sz = DEFNODEBITS;
609 }
610
611 /* Allocate a new struct cpuset */
cpuset_alloc()612 struct cpuset *cpuset_alloc()
613 {
614 struct cpuset *cp = NULL;
615 int nbits;
616
617 if ((cp = calloc(1, sizeof(struct cpuset))) == NULL)
618 goto err;
619
620 nbits = cpuset_cpus_nbits();
621 if ((cp->cpus = bitmask_alloc(nbits)) == NULL)
622 goto err;
623
624 nbits = cpuset_mems_nbits();
625 if ((cp->mems = bitmask_alloc(nbits)) == NULL)
626 goto err;
627
628 return cp;
629 err:
630 if (cp && cp->cpus)
631 bitmask_free(cp->cpus);
632 if (cp && cp->mems)
633 bitmask_free(cp->mems);
634 free(cp);
635 return NULL;
636 }
637
638 /* Free struct cpuset *cp */
cpuset_free(struct cpuset * cp)639 void cpuset_free(struct cpuset *cp)
640 {
641 if (!cp)
642 return;
643 if (cp->cpus)
644 bitmask_free(cp->cpus);
645 if (cp->mems)
646 bitmask_free(cp->mems);
647 free(cp);
648 }
649
650 /* Number of bits in a CPU bitmask on current system */
cpuset_cpus_nbits()651 int cpuset_cpus_nbits()
652 {
653 if (cpumask_sz == 0)
654 update_mask_sizes();
655 return cpumask_sz;
656 }
657
658 /* Number of bits in a Memory bitmask on current system */
cpuset_mems_nbits()659 int cpuset_mems_nbits()
660 {
661 if (nodemask_sz == 0)
662 update_mask_sizes();
663 return nodemask_sz;
664 }
665
666 /* Set CPUs in cpuset cp to bitmask cpus */
cpuset_setcpus(struct cpuset * cp,const struct bitmask * cpus)667 int cpuset_setcpus(struct cpuset *cp, const struct bitmask *cpus)
668 {
669 if (cp->cpus)
670 bitmask_free(cp->cpus);
671 cp->cpus = bitmask_alloc(bitmask_nbits(cpus));
672 if (cp->cpus == NULL)
673 return -1;
674 bitmask_copy(cp->cpus, cpus);
675 cp->cpus_valid = 1;
676 cp->cpus_dirty = 1;
677 return 0;
678 }
679
680 /* Set Memory Nodes in cpuset cp to bitmask mems */
cpuset_setmems(struct cpuset * cp,const struct bitmask * mems)681 int cpuset_setmems(struct cpuset *cp, const struct bitmask *mems)
682 {
683 if (cp->mems)
684 bitmask_free(cp->mems);
685 cp->mems = bitmask_alloc(bitmask_nbits(mems));
686 if (cp->mems == NULL)
687 return -1;
688 bitmask_copy(cp->mems, mems);
689 cp->mems_valid = 1;
690 cp->mems_dirty = 1;
691 return 0;
692 }
693
694 /* Set integer value optname of cpuset cp */
cpuset_set_iopt(struct cpuset * cp,const char * optionname,int value)695 int cpuset_set_iopt(struct cpuset *cp, const char *optionname, int value)
696 {
697 if (streq(optionname, "cpu_exclusive")) {
698 cp->cpu_exclusive = ! !value;
699 cp->cpu_exclusive_valid = 1;
700 cp->cpu_exclusive_dirty = 1;
701 } else if (streq(optionname, "mem_exclusive")) {
702 cp->mem_exclusive = ! !value;
703 cp->mem_exclusive_valid = 1;
704 cp->mem_exclusive_dirty = 1;
705 } else if (streq(optionname, "mem_hardwall")) {
706 cp->mem_hardwall = ! !value;
707 cp->mem_hardwall_valid = 1;
708 cp->mem_hardwall_dirty = 1;
709 } else if (streq(optionname, "notify_on_release")) {
710 cp->notify_on_release = ! !value;
711 cp->notify_on_release_valid = 1;
712 cp->notify_on_release_dirty = 1;
713 } else if (streq(optionname, "memory_pressure_enabled")) {
714 cp->memory_pressure_enabled = ! !value;
715 cp->memory_pressure_enabled_valid = 1;
716 cp->memory_pressure_enabled_dirty = 1;
717 } else if (streq(optionname, "memory_migrate")) {
718 cp->memory_migrate = ! !value;
719 cp->memory_migrate_valid = 1;
720 cp->memory_migrate_dirty = 1;
721 } else if (streq(optionname, "memory_spread_page")) {
722 cp->memory_spread_page = ! !value;
723 cp->memory_spread_page_valid = 1;
724 cp->memory_spread_page_dirty = 1;
725 } else if (streq(optionname, "memory_spread_slab")) {
726 cp->memory_spread_slab = ! !value;
727 cp->memory_spread_slab_valid = 1;
728 cp->memory_spread_slab_dirty = 1;
729 } else if (streq(optionname, "sched_load_balance")) {
730 cp->sched_load_balance = ! !value;
731 cp->sched_load_balance_valid = 1;
732 cp->sched_load_balance_dirty = 1;
733 } else if (streq(optionname, "sched_relax_domain_level")) {
734 cp->sched_relax_domain_level = value;
735 cp->sched_relax_domain_level_valid = 1;
736 cp->sched_relax_domain_level_dirty = 1;
737 } else
738 return -2; /* optionname not recognized */
739 return 0;
740 }
741
742 /* [optional] Set string value optname */
/* Placeholder: no string-valued options are defined yet. */
int cpuset_set_sopt(UNUSED struct cpuset *cp, UNUSED const char *optionname,
		    UNUSED const char *value)
{
	return -2;	/* For now, all string options unrecognized */
}
748
749 /* Return handle for reading memory_pressure. */
cpuset_open_memory_pressure(const char * cpusetpath)750 int cpuset_open_memory_pressure(const char *cpusetpath)
751 {
752 char buf[PATH_MAX];
753
754 fullpath2(buf, sizeof(buf), cpusetpath, "memory_pressure");
755 return open(buf, O_RDONLY);
756 }
757
758 /* Return current memory_pressure of cpuset. */
cpuset_read_memory_pressure(int han)759 int cpuset_read_memory_pressure(int han)
760 {
761 char buf[SMALL_BUFSZ];
762
763 if (pread(han, buf, sizeof(buf), 0L) < 0)
764 return -1;
765 return atoi(buf);
766 }
767
768 /* Close handle for reading memory pressure. */
void cpuset_close_memory_pressure(int han)
{
	/* NOTE(review): close() result ignored; callers get no error */
	close(han);
}
773
774 /*
775 * Resolve cpuset pointer (to that of current task if cp == NULL).
776 *
777 * If cp not NULL, just return it. If cp is NULL, return pointer
778 * to temporary cpuset for current task, and set *cp_tofree to
779 * pointer to that same temporary cpuset, to be freed later.
780 *
781 * Return NULL and set errno on error. Errors can occur when
782 * resolving the current tasks cpuset.
783 */
resolve_cp(const struct cpuset * cp,struct cpuset ** cp_tofree)784 static const struct cpuset *resolve_cp(const struct cpuset *cp,
785 struct cpuset **cp_tofree)
786 {
787 const struct cpuset *rcp;
788
789 if (cp) {
790 rcp = cp;
791 } else {
792 struct cpuset *cp1 = cpuset_alloc();
793 if (cp1 == NULL)
794 goto err;
795 if (cpuset_cpusetofpid(cp1, 0) < 0) {
796 cpuset_free(cp1);
797 goto err;
798 }
799 *cp_tofree = cp1;
800 rcp = cp1;
801 }
802 return rcp;
803 err:
804 return NULL;
805 }
806
807 /* Write CPUs in cpuset cp (current task if cp == NULL) to bitmask cpus */
cpuset_getcpus(const struct cpuset * cp,struct bitmask * cpus)808 int cpuset_getcpus(const struct cpuset *cp, struct bitmask *cpus)
809 {
810 struct cpuset *cp_tofree = NULL;
811 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
812
813 if (!cp1)
814 goto err;
815 if (cp1->cpus == NULL) {
816 errno = EINVAL;
817 goto err;
818 }
819 bitmask_copy(cpus, cp1->cpus);
820 cpuset_free(cp_tofree);
821 return 0;
822 err:
823 cpuset_free(cp_tofree);
824 return -1;
825 }
826
827 /* Write Memory Nodes in cp (current task if cp == NULL) to bitmask mems */
cpuset_getmems(const struct cpuset * cp,struct bitmask * mems)828 int cpuset_getmems(const struct cpuset *cp, struct bitmask *mems)
829 {
830 struct cpuset *cp_tofree = NULL;
831 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
832
833 if (!cp1)
834 goto err;
835 if (cp1->mems == NULL) {
836 errno = EINVAL;
837 goto err;
838 }
839 bitmask_copy(mems, cp1->mems);
840 cpuset_free(cp_tofree);
841 return 0;
842 err:
843 cpuset_free(cp_tofree);
844 return -1;
845 }
846
847 /* Return number of CPUs in cpuset cp (current task if cp == NULL) */
cpuset_cpus_weight(const struct cpuset * cp)848 int cpuset_cpus_weight(const struct cpuset *cp)
849 {
850 struct cpuset *cp_tofree = NULL;
851 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
852 int w = -1;
853
854 if (!cp1)
855 goto err;
856 if (cp1->cpus == NULL) {
857 errno = EINVAL;
858 goto err;
859 }
860 w = bitmask_weight(cp1->cpus);
861 /* fall into ... */
862 err:
863 cpuset_free(cp_tofree);
864 return w;
865 }
866
867 /* Return number of Memory Nodes in cpuset cp (current task if cp == NULL) */
cpuset_mems_weight(const struct cpuset * cp)868 int cpuset_mems_weight(const struct cpuset *cp)
869 {
870 struct cpuset *cp_tofree = NULL;
871 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
872 int w = -1;
873
874 if (!cp1)
875 goto err;
876 if (cp1->mems == NULL) {
877 errno = EINVAL;
878 goto err;
879 }
880 w = bitmask_weight(cp1->mems);
881 /* fall into ... */
882 err:
883 cpuset_free(cp_tofree);
884 return w;
885 }
886
887 /* Return integer value of option optname in cp */
cpuset_get_iopt(const struct cpuset * cp,const char * optionname)888 int cpuset_get_iopt(const struct cpuset *cp, const char *optionname)
889 {
890 if (streq(optionname, "cpu_exclusive"))
891 return cp->cpu_exclusive;
892 else if (streq(optionname, "mem_exclusive"))
893 return cp->mem_exclusive;
894 else if (streq(optionname, "mem_hardwall"))
895 return cp->mem_hardwall;
896 else if (streq(optionname, "notify_on_release"))
897 return cp->notify_on_release;
898 else if (streq(optionname, "memory_pressure_enabled"))
899 return cp->memory_pressure_enabled;
900 else if (streq(optionname, "memory_migrate"))
901 return cp->memory_migrate;
902 else if (streq(optionname, "memory_spread_page"))
903 return cp->memory_spread_page;
904 else if (streq(optionname, "memory_spread_slab"))
905 return cp->memory_spread_slab;
906 else if (streq(optionname, "sched_load_balance"))
907 return cp->sched_load_balance;
908 else if (streq(optionname, "sched_relax_domain_level"))
909 return cp->sched_relax_domain_level;
910 else
911 return -2; /* optionname not recognized */
912 }
913
914 /* [optional] Return string value of optname */
/* Placeholder: no string-valued options are defined yet. */
const char *cpuset_get_sopt(UNUSED const struct cpuset *cp,
			    UNUSED const char *optionname)
{
	return NULL;	/* For now, all string options unrecognized */
}
920
read_flag(const char * filepath,char * flagp)921 static int read_flag(const char *filepath, char *flagp)
922 {
923 char buf[SMALL_BUFSZ]; /* buffer a "0" or "1" flag line */
924 int fd = -1;
925
926 if ((fd = open(filepath, O_RDONLY)) < 0)
927 goto err;
928 if (read(fd, buf, sizeof(buf)) < 1)
929 goto err;
930 if (atoi(buf))
931 *flagp = 1;
932 else
933 *flagp = 0;
934 close(fd);
935 return 0;
936 err:
937 if (fd >= 0)
938 close(fd);
939 return -1;
940 }
941
load_flag(const char * path,char * flagp,const char * flag)942 static int load_flag(const char *path, char *flagp, const char *flag)
943 {
944 char buf[PATH_MAX];
945
946 pathcat2(buf, sizeof(buf), path, flag);
947 return read_flag(buf, flagp);
948 }
949
read_number(const char * filepath,int * numberp)950 static int read_number(const char *filepath, int *numberp)
951 {
952 char buf[SMALL_BUFSZ];
953 int fd = -1;
954
955 if ((fd = open(filepath, O_RDONLY)) < 0)
956 goto err;
957 if (read(fd, buf, sizeof(buf)) < 1)
958 goto err;
959 *numberp = atoi(buf);
960 close(fd);
961 return 0;
962 err:
963 if (fd >= 0)
964 close(fd);
965 return -1;
966 }
967
load_number(const char * path,int * numberp,const char * file)968 static int load_number(const char *path, int *numberp, const char *file)
969 {
970 char buf[PATH_MAX];
971
972 pathcat2(buf, sizeof(buf), path, file);
973 return read_number(buf, numberp);
974 }
975
/* Parse one list-format mask line from filepath into a new *bmpp. */
static int read_mask(const char *filepath, struct bitmask **bmpp, int nbits)
{
	FILE *fp = NULL;
	char *line = NULL;
	struct bitmask *bmp = NULL;
	int linelen;

	fp = fopen(filepath, "r");
	if (fp == NULL)
		goto fail;
	linelen = filesize(fp) + 1;	/* + 1 for nul term */
	line = malloc(linelen);
	if (line == NULL)
		goto fail;
	if (flgets(line, linelen, fp) == NULL)
		goto fail;
	fclose(fp);
	fp = NULL;

	bmp = bitmask_alloc(nbits);
	if (bmp == NULL)
		goto fail;
	/* an empty file means an empty (all clear) mask */
	if (*line != '\0' && bitmask_parselist(line, bmp) < 0)
		goto fail;
	if (*bmpp)
		bitmask_free(*bmpp);
	*bmpp = bmp;	/* hand ownership of bmp to the caller */
	free(line);
	return 0;
fail:
	free(line);
	if (fp != NULL)
		fclose(fp);
	if (bmp != NULL)
		bitmask_free(bmp);
	return -1;
}
1012
load_mask(const char * path,struct bitmask ** bmpp,int nbits,const char * mask)1013 static int load_mask(const char *path, struct bitmask **bmpp,
1014 int nbits, const char *mask)
1015 {
1016 char buf[PATH_MAX];
1017
1018 pathcat2(buf, sizeof(buf), path, mask);
1019 return read_mask(buf, bmpp, nbits);
1020 }
1021
/* Write string to file at given filepath. Creates file if missing (note: no O_TRUNC, so existing longer contents are not truncated). */
static int write_string_file(const char *filepath, const char *str)
{
	int fd = -1;
	ssize_t len = strlen(str);

	if ((fd = open(filepath, O_WRONLY | O_CREAT, 0644)) < 0)
		goto err;
	/*
	 * Treat a short write as an error too: the old "< 0" test
	 * reported success even when only part of str was written.
	 */
	if (write(fd, str, len) != len)
		goto err;
	close(fd);
	return 0;
err:
	if (fd >= 0)
		close(fd);
	return -1;
}
1038
1039 /* Size and allocate buffer. Write bitmask into it. Caller must free */
/* Size and allocate buffer. Write bitmask into it. Caller must free */
static char *sprint_mask_buf(const struct bitmask *bmp)
{
	char probe;
	char *buf;
	int len;

	/* First call with a 1-byte buffer just measures the length. */
	len = bitmask_displaylist(&probe, 1, bmp) + 1;	/* "+ 1" for nul */
	buf = malloc(len);
	if (buf == NULL)
		return NULL;
	bitmask_displaylist(buf, len, bmp);
	return buf;
}
1053
exists_flag(const char * path,const char * flag)1054 static int exists_flag(const char *path, const char *flag)
1055 {
1056 char buf[PATH_MAX];
1057 struct stat statbuf;
1058 int rc;
1059
1060 pathcat2(buf, sizeof(buf), path, flag);
1061 rc = (stat(buf, &statbuf) == 0);
1062 errno = 0;
1063 return rc;
1064 }
1065
store_flag(const char * path,const char * flag,int val)1066 static int store_flag(const char *path, const char *flag, int val)
1067 {
1068 char buf[PATH_MAX];
1069
1070 pathcat2(buf, sizeof(buf), path, flag);
1071 return write_string_file(buf, val ? "1" : "0");
1072 }
1073
store_number(const char * path,const char * file,int val)1074 static int store_number(const char *path, const char *file, int val)
1075 {
1076 char buf[PATH_MAX];
1077 char data[SMALL_BUFSZ];
1078
1079 memset(data, 0, sizeof(data));
1080 pathcat2(buf, sizeof(buf), path, file);
1081 snprintf(data, sizeof(data), "%d", val);
1082 return write_string_file(buf, data);
1083 }
1084
store_mask(const char * path,const char * mask,const struct bitmask * bmp)1085 static int store_mask(const char *path, const char *mask,
1086 const struct bitmask *bmp)
1087 {
1088 char maskpath[PATH_MAX];
1089 char *bp = NULL;
1090 int rc;
1091
1092 if (bmp == NULL)
1093 return 0;
1094 pathcat2(maskpath, sizeof(maskpath), path, mask);
1095 if ((bp = sprint_mask_buf(bmp)) == NULL)
1096 return -1;
1097 rc = write_string_file(maskpath, bp);
1098 free(bp);
1099 return rc;
1100 }
1101
1102 /*
1103 * Return 1 if 'cpu' is online, else 0 if offline. Tests the file
1104 * /sys/devices/system/cpu/cpuN/online file for 0 or 1 contents
 * where N == cpu number.
1106 */
1107
cpu_online(unsigned int cpu)1108 char cpu_online(unsigned int cpu)
1109 {
1110 char online;
1111 char cpupath[PATH_MAX];
1112
1113 (void)snprintf(cpupath, sizeof(cpupath),
1114 "/sys/devices/system/cpu/cpu%d/online", cpu);
1115 if (read_flag(cpupath, &online) < 0)
1116 return 0; /* oops - guess that cpu's not there */
1117 return online;
1118 }
1119
1120 /*
1121 * The cpunodemap maps each cpu in [0 ... cpuset_cpus_nbits()),
1122 * to the node on which that cpu resides or cpuset_mems_nbits().
1123 *
1124 * To avoid every user having to recalculate this relation
1125 * from various clues in the sysfs file system (below the
1126 * path /sys/devices/system) a copy of this map is kept at
1127 * /var/run/cpunodemap.
1128 *
1129 * The system automatically cleans out files below
1130 * /var/run on each system reboot (see the init script
1131 * /etc/rc.d/boot.d/S*boot.localnet), so we don't have to worry
1132 * about stale data in this file across reboots. If the file
1133 * is missing, let the first process that needs it, and has
1134 * permission to write in the /var/run directory, rebuild it.
1135 *
1136 * If using this cached data, remember the mtime of the mapfile
1137 * the last time we read it in case something like a hotplug
1138 * event results in the file being removed and rebuilt, so we
1139 * can detect if we're using a stale cache, and need to reload.
1140 *
1141 * The mtime of this file is set to the time when we did
1142 * the recalculation of the map, from the clues beneath
1143 * /sys/devices/system. This is done so that a program
1144 * won't see the mapfile it just wrote as being newer than what
1145 * it just wrote out (store_map) and read the same map back in
1146 * (load_file).
1147 */
1148
1149 /*
1150 * Hold flockfile(stdin) while using cpunodemap for posix thread safety.
1151 *
1152 * Note on locking and flockfile(FILE *):
1153 *
1154 * We use flockfile() and funlockfile() instead of directly
1155 * calling pthread_mutex_lock and pthread_mutex_unlock on
1156 * a pthread_mutex_t, because this avoids forcing the app
1157 * to link with libpthread. The glibc implementation of
1158 * flockfile/funlockfile will fall back to no-ops if libpthread
1159 * doesn't happen to be linked.
1160 *
1161 * Since flockfile already has the moderately convoluted
1162 * combination of weak and strong symbols required to accomplish
1163 * this, it is easier to use flockfile() on some handy FILE *
1164 * stream as a surrogate for pthread locking than it is to so
1165 * re-invent that wheel.
1166 *
1167 * Forcing all apps that use cpusets to link with libpthread
1168 * would force non-transparent initialization on apps that
1169 * might not be prepared to handle it.
1170 *
1171 * The application using libcpuset should never notice this
1172 * odd use of flockfile(), because we never return to the
1173 * application from any libcpuset call with any such lock held.
1174 * We just use this locking for guarding some non-atomic cached
1175 * data updates and accesses, internal to some libcpuset calls.
1176 * Also, flockfile() allows recursive nesting, so if the app
1177 * calls libcpuset holding such a file lock, we won't deadlock
1178 * if we go to acquire the same lock. We'll just get the lock
1179 * and increment its counter while we hold it.
1180 */
1181
1182 static struct cpunodemap {
1183 int *map; /* map[cpumask_sz]: maps cpu to its node */
1184 time_t mtime; /* modtime of mapfile when last read */
1185 } cpunodemap;
1186
1187 /*
1188 * rebuild_map() - Rebuild cpunodemap[] from scratch.
1189 *
1190 * Situation:
1191 * Neither our in-memory cpunodemap[] array nor the
1192 * cache of it in mapfile is current.
1193 * Action:
1194 * Rebuild it from first principles and the information
1195 * available below /sys/devices/system.
1196 */
1197
rebuild_map()1198 static void rebuild_map()
1199 {
1200 char buf[PATH_MAX];
1201 DIR *dir1, *dir2;
1202 struct dirent *dent1, *dent2;
1203 int ncpus = cpuset_cpus_nbits();
1204 int nmems = cpuset_mems_nbits();
1205 unsigned int cpu, mem;
1206
1207 for (cpu = 0; cpu < (unsigned int)ncpus; cpu++)
1208 cpunodemap.map[cpu] = -1;
1209 pathcat2(buf, sizeof(buf), sysdevices, "node");
1210 if ((dir1 = opendir(buf)) == NULL)
1211 return;
1212 while ((dent1 = readdir(dir1)) != NULL) {
1213 if (sscanf(dent1->d_name, "node%u", &mem) < 1)
1214 continue;
1215 pathcat3(buf, sizeof(buf), sysdevices, "node", dent1->d_name);
1216 if ((dir2 = opendir(buf)) == NULL)
1217 continue;
1218 while ((dent2 = readdir(dir2)) != NULL) {
1219 if (sscanf(dent2->d_name, "cpu%u", &cpu) < 1)
1220 continue;
1221 if (cpu >= (unsigned int)ncpus
1222 || mem >= (unsigned int)nmems)
1223 continue;
1224 cpunodemap.map[cpu] = mem;
1225 }
1226 closedir(dir2);
1227 }
1228 closedir(dir1);
1229 cpunodemap.mtime = time(0);
1230 }
1231
1232 /*
1233 * load_map() - Load cpunodemap[] from mapfile.
1234 *
1235 * Situation:
1236 * The cpunodemap in mapfile is more recent than
1237 * what we have in the cpunodemap[] array.
1238 * Action:
1239 * Reload the cpunodemap[] array from the file.
1240 */
1241
load_map()1242 static void load_map()
1243 {
1244 char buf[SMALL_BUFSZ]; /* buffer 1 line of mapfile */
1245 FILE *mapfp; /* File stream on mapfile */
1246 int ncpus = cpuset_cpus_nbits();
1247 int nmems = cpuset_mems_nbits();
1248 unsigned int cpu, mem;
1249
1250 if ((cpunodemap.map = calloc(ncpus, sizeof(int))) == NULL)
1251 return;
1252 cpunodemap.mtime = get_mtime(mapfile);
1253 if ((mapfp = fopen(mapfile, "r")) == NULL)
1254 return;
1255 for (cpu = 0; cpu < (unsigned int)ncpus; cpu++)
1256 cpunodemap.map[cpu] = nmems;
1257 while (flgets(buf, sizeof(buf), mapfp) != NULL) {
1258 if (sscanf(buf, "%u %u", &cpu, &mem) < 2)
1259 continue;
1260 if (cpu >= (unsigned int)ncpus || mem >= (unsigned int)nmems)
1261 continue;
1262 cpunodemap.map[cpu] = mem;
1263 }
1264 fclose(mapfp);
1265 }
1266
1267 /*
1268 * store_map() - Write cpunodemap[] out to mapfile.
1269 *
1270 * Situation:
1271 * The cpunodemap in the cpunodemap[] array is
1272 * more recent than the one in mapfile.
1273 * Action:
1274 * Write cpunodemap[] out to mapfile.
1275 */
1276
store_map()1277 static void store_map()
1278 {
1279 char buf[PATH_MAX];
1280 int fd = -1;
1281 FILE *mapfp = NULL;
1282 int ncpus = cpuset_cpus_nbits();
1283 int nmems = cpuset_mems_nbits();
1284 unsigned int cpu, mem;
1285
1286 snprintf(buf, sizeof(buf), "%s.%s", mapfile, "XXXXXX");
1287 if ((fd = mkstemp(buf)) < 0)
1288 goto err;
1289 if ((mapfp = fdopen(fd, "w")) == NULL)
1290 goto err;
1291 for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) {
1292 mem = cpunodemap.map[cpu];
1293 if (mem < (unsigned int)nmems)
1294 fprintf(mapfp, "%u %u\n", cpu, mem);
1295 }
1296 fclose(mapfp);
1297 set_mtime(buf, cpunodemap.mtime);
1298 if (rename(buf, mapfile) < 0)
1299 goto err;
1300 /* mkstemp() creates mode 0600 - change to world readable */
1301 (void)chmod(mapfile, 0444);
1302 return;
1303 err:
1304 if (mapfp != NULL) {
1305 fclose(mapfp);
1306 fd = -1;
1307 }
1308 if (fd >= 0)
1309 close(fd);
1310 (void)unlink(buf);
1311 }
1312
1313 /*
1314 * Load and gain thread safe access to the <cpu, node> map.
1315 *
1316 * Return 0 on success with flockfile(stdin) held.
1317 * Each successful get_map() call must be matched with a
1318 * following put_map() call to release the lock.
1319 *
1320 * On error, return -1 with errno set and no lock held.
1321 */
1322
get_map()1323 static int get_map()
1324 {
1325 time_t file_mtime;
1326
1327 flockfile(stdin);
1328
1329 if (cpunodemap.map == NULL) {
1330 cpunodemap.map = calloc(cpuset_cpus_nbits(), sizeof(int));
1331 if (cpunodemap.map == NULL)
1332 goto err;
1333 }
1334
1335 /* If no one has a good cpunodemap, rebuild from scratch */
1336 file_mtime = get_mtime(mapfile);
1337 if (cpunodemap.mtime == 0 && file_mtime == 0)
1338 rebuild_map();
1339
1340 /* If either cpunodemap[] or mapfile newer, update other with it */
1341 file_mtime = get_mtime(mapfile);
1342 if (cpunodemap.mtime < file_mtime)
1343 load_map();
1344 else if (cpunodemap.mtime > file_mtime)
1345 store_map();
1346 return 0;
1347 err:
1348 funlockfile(stdin);
1349 return -1;
1350 }
1351
/* Release the cpunodemap lock taken by a successful get_map() call. */
static void put_map()
{
	funlockfile(stdin);
}
1356
1357 /* Set cpus to those local to Memory Nodes mems */
cpuset_localcpus(const struct bitmask * mems,struct bitmask * cpus)1358 int cpuset_localcpus(const struct bitmask *mems, struct bitmask *cpus)
1359 {
1360 int ncpus = cpuset_cpus_nbits();
1361 unsigned int cpu;
1362
1363 if (check() < 0)
1364 return -1;
1365
1366 get_map();
1367 bitmask_clearall(cpus);
1368 for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) {
1369 if (bitmask_isbitset(mems, cpunodemap.map[cpu]))
1370 bitmask_setbit(cpus, cpu);
1371 }
1372 put_map();
1373 return 0;
1374 }
1375
1376 /* Set mems to those local to CPUs cpus */
cpuset_localmems(const struct bitmask * cpus,struct bitmask * mems)1377 int cpuset_localmems(const struct bitmask *cpus, struct bitmask *mems)
1378 {
1379 int ncpus = cpuset_cpus_nbits();
1380 unsigned int cpu;
1381
1382 if (check() < 0)
1383 return -1;
1384
1385 get_map();
1386 bitmask_clearall(mems);
1387 for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) {
1388 if (bitmask_isbitset(cpus, cpu))
1389 bitmask_setbit(mems, cpunodemap.map[cpu]);
1390 }
1391 put_map();
1392 return 0;
1393 }
1394
1395 /*
1396 * distmap[]
1397 *
1398 * Array of ints of size cpumask_sz by nodemask_sz.
1399 *
1400 * Element distmap[cpu][mem] is the distance between CPU cpu
1401 * and Memory Node mem. Distances are weighted to roughly
1402 * approximate the cost of memory references, and scaled so that
1403 * the distance from a CPU to its local Memory Node is ten (10).
1404 *
1405 * The first call to cpuset_cpumemdist() builds this map, from
1406 * whatever means the kernel provides to obtain these distances.
1407 *
1408 * These distances derive from ACPI SLIT table entries, which are
1409 * eight bits in size.
1410 *
1411 * Hold flockfile(stdout) while using distmap for posix thread safety.
1412 */
1413
1414 typedef unsigned char distmap_entry_t; /* type of distmap[] entries */
1415
1416 static distmap_entry_t *distmap; /* maps <cpu, mem> to distance */
1417
1418 #define DISTMAP_MAX UCHAR_MAX /* maximum value in distmap[] */
1419
1420 #define I(i,j) ((i) * nmems + (j)) /* 2-D array index simulation */
1421
1422 /*
1423 * Parse arch neutral lines from 'distance' files of form:
1424 *
1425 * 46 66 10 20
1426 *
1427 * The lines contain a space separated list of distances, which is parsed
1428 * into array dists[] of each nodes distance from the specified node.
1429 *
1430 * Result is placed in distmap[ncpus][nmems]:
1431 *
1432 * For each cpu c on node:
1433 * For each node position n in list of distances:
1434 * distmap[c][n] = dists[n]
1435 */
1436
/*
 * parse_distmap_line() - parse one arch-neutral 'distance' line
 * (format documented above) for node 'node' and fill in the
 * distmap[] row of every cpu local to that node.
 *
 * Returns 0 on success, -1 on allocation failure.
 */
static int parse_distmap_line(unsigned int node, char *buf)
{
	char *p, *q;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	unsigned int c, n;
	distmap_entry_t *dists = NULL;
	struct bitmask *cpus = NULL, *mems = NULL;
	int ret = -1;

	p = buf;
	if ((dists = calloc(nmems, sizeof(*dists))) == NULL)
		goto err;
	/* nodes not mentioned on the line default to "max distance" */
	for (n = 0; n < (unsigned int)nmems; n++)
		dists[n] = DISTMAP_MAX;

	/* walk the space-separated numbers; strpbrk() skips to the
	 * next digit, strtoul() consumes it and leaves q just past it */
	for (n = 0; n < (unsigned int)nmems && *p; n++, p = q) {
		unsigned int d;

		if ((p = strpbrk(p, "0123456789")) == NULL)
			break;
		d = strtoul(p, &q, 10);
		if (p == q)
			break;
		if (d < DISTMAP_MAX)
			dists[n] = (distmap_entry_t) d;
	}

	/* find the cpus local to 'node' ... */
	if ((mems = bitmask_alloc(nmems)) == NULL)
		goto err;
	bitmask_setbit(mems, node);

	if ((cpus = bitmask_alloc(ncpus)) == NULL)
		goto err;
	cpuset_localcpus(mems, cpus);

	/* ... and copy dists[] into each such cpu's distmap[] row */
	for (c = bitmask_first(cpus); c < (unsigned int)ncpus;
	     c = bitmask_next(cpus, c + 1))
		for (n = 0; n < (unsigned int)nmems; n++)
			distmap[I(c, n)] = dists[n];
	ret = 0;
	/* fall into ... */
err:
	bitmask_free(mems);
	bitmask_free(cpus);
	free(dists);
	return ret;
}
1485
/*
 * parse_distance_file() - read the first line of the 'distance' file
 * at 'path' and feed it to parse_distmap_line() for node 'node'.
 *
 * Returns 0 on success, -1 if the file can't be opened, read, or
 * parsed.
 */
static int parse_distance_file(unsigned int node, const char *path)
{
	FILE *fp = NULL;
	char *line = NULL;
	int len;
	int ret = -1;

	fp = fopen(path, "r");
	if (fp == NULL)
		goto out;
	len = filesize(fp);
	line = malloc(len);
	if (line == NULL)
		goto out;
	if (flgets(line, len, fp) == NULL)
		goto out;
	if (parse_distmap_line(node, line) < 0)
		goto out;
	ret = 0;
	/* fall through to common cleanup */
out:
	free(line);
	if (fp)
		fclose(fp);
	return ret;
}
1515
build_distmap()1516 static void build_distmap()
1517 {
1518 static int tried_before = 0;
1519 int ncpus = cpuset_cpus_nbits();
1520 int nmems = cpuset_mems_nbits();
1521 int c, m;
1522 DIR *dir = NULL;
1523 struct dirent *dent;
1524
1525 if (tried_before)
1526 goto err;
1527 tried_before = 1;
1528
1529 if ((distmap = calloc(ncpus * nmems, sizeof(*distmap))) == NULL)
1530 goto err;
1531
1532 for (c = 0; c < ncpus; c++)
1533 for (m = 0; m < nmems; m++)
1534 distmap[I(c, m)] = DISTMAP_MAX;
1535
1536 if ((dir = opendir(distance_directory)) == NULL)
1537 goto err;
1538 while ((dent = readdir(dir)) != NULL) {
1539 char buf[PATH_MAX];
1540 unsigned int node;
1541
1542 if (sscanf(dent->d_name, "node%u", &node) < 1)
1543 continue;
1544 pathcat3(buf, sizeof(buf), distance_directory, dent->d_name,
1545 "distance");
1546 if (parse_distance_file(node, buf) < 0)
1547 goto err;
1548 }
1549 closedir(dir);
1550 return;
1551 err:
1552 if (dir)
1553 closedir(dir);
1554 free(distmap);
1555 distmap = NULL;
1556 }
1557
1558 #ifdef ALTERNATE_SN_DISTMAP
1559
1560 /*
1561 * Parse SN architecture specific line of form:
1562 *
1563 * node 3 001c14#1 local asic SHub_1.1, nasid 0x6, dist 66:46:20:10
1564 *
1565 * Second field is node number. The "dist" field is the colon separated list
1566 * of distances, which is parsed into array dists[] of each nodes distance
1567 * from that node.
1568 *
1569 * Result is placed in distmap[ncpus][nmems]:
1570 *
1571 * For each cpu c on that node:
1572 * For each node position n in list of distances:
1573 * distmap[c][n] = dists[n]
1574 */
1575
/*
 * parse_distmap_line_sn() - parse one SN 'node ... dist a:b:c' line
 * (format documented above) and fill in the distmap[] row of every
 * cpu local to that node.  Malformed lines and allocation failures
 * are silently skipped (void return).
 */
static void parse_distmap_line_sn(char *buf)
{
	char *p, *pend, *q;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	unsigned long c, n, node;
	distmap_entry_t *dists = NULL;
	struct bitmask *cpus = NULL, *mems = NULL;

	/* second whitespace-separated field is the node number */
	if ((p = strchr(buf, ' ')) == NULL)
		goto err;
	if ((node = strtoul(p, &q, 10)) >= (unsigned int)nmems)
		goto err;
	/* isolate the colon-separated list following " dist " */
	if ((p = strstr(q, " dist ")) == NULL)
		goto err;
	p += strlen(" dist ");
	if ((pend = strchr(p, ' ')) != NULL)
		*pend = '\0';
	if ((dists = calloc(nmems, sizeof(*dists))) == NULL)
		goto err;
	/* nodes not mentioned in the list default to "max distance" */
	for (n = 0; n < (unsigned int)nmems; n++)
		dists[n] = DISTMAP_MAX;

	/* walk the numbers; strpbrk() skips the ':' separators */
	for (n = 0; n < (unsigned int)nmems && *p; n++, p = q) {
		unsigned long d;

		if ((p = strpbrk(p, "0123456789")) == NULL)
			break;
		d = strtoul(p, &q, 10);
		if (p == q)
			break;
		if (d < DISTMAP_MAX)
			dists[n] = (distmap_entry_t) d;
	}

	/* find the cpus local to 'node' ... */
	if ((mems = bitmask_alloc(nmems)) == NULL)
		goto err;
	bitmask_setbit(mems, node);

	if ((cpus = bitmask_alloc(ncpus)) == NULL)
		goto err;
	cpuset_localcpus(mems, cpus);

	/* ... and copy dists[] into each such cpu's distmap[] row */
	for (c = bitmask_first(cpus); c < (unsigned int)ncpus;
	     c = bitmask_next(cpus, c + 1))
		for (n = 0; n < (unsigned int)nmems; n++)
			distmap[I(c, n)] = dists[n];
	/* fall into ... */
err:
	bitmask_free(mems);
	bitmask_free(cpus);
	free(dists);
}
1629
/*
 * build_distmap_sn() - SN architecture fallback for build_distmap():
 * allocate and populate the global distmap[] from the sn_topology
 * file's "node ..." lines.
 *
 * Only ever attempted once per process (tried_before); on any
 * failure distmap is freed and left NULL.
 */
static void build_distmap_sn()
{
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	int c, m;
	static int tried_before = 0;
	FILE *fp = NULL;
	char *buf = NULL;
	int buflen;

	if (tried_before)
		goto err;
	tried_before = 1;

	if ((fp = fopen(sn_topology, "r")) == NULL)
		goto err;

	if ((distmap = calloc(ncpus * nmems, sizeof(*distmap))) == NULL)
		goto err;

	/* default every <cpu, mem> pair to "max distance" */
	for (c = 0; c < ncpus; c++)
		for (m = 0; m < nmems; m++)
			distmap[I(c, m)] = DISTMAP_MAX;

	buflen = filesize(fp);
	if ((buf = malloc(buflen)) == NULL)
		goto err;

	/* only "node ..." lines carry distance data */
	while (flgets(buf, buflen, fp) != NULL)
		if (strprefix(buf, sn_top_node_prefix))
			parse_distmap_line_sn(buf);

	free(buf);
	fclose(fp);
	return;
err:
	free(buf);
	free(distmap);
	distmap = NULL;
	if (fp)
		fclose(fp);
}
1672
1673 #endif
1674
1675 /* [optional] Hardware distance from CPU to Memory Node */
cpuset_cpumemdist(int cpu,int mem)1676 unsigned int cpuset_cpumemdist(int cpu, int mem)
1677 {
1678 int ncpus = cpuset_cpus_nbits();
1679 int nmems = cpuset_mems_nbits();
1680 distmap_entry_t r = DISTMAP_MAX;
1681
1682 flockfile(stdout);
1683
1684 if (check() < 0)
1685 goto err;
1686
1687 if (distmap == NULL)
1688 build_distmap();
1689
1690 #ifdef ALTERNATE_SN_DISTMAP
1691 if (distmap == NULL)
1692 build_distmap_sn();
1693 #endif
1694
1695 if (distmap == NULL)
1696 goto err;
1697
1698 if (cpu < 0 || cpu >= ncpus || mem < 0 || mem >= nmems)
1699 goto err;
1700
1701 r = distmap[I(cpu, mem)];
1702 /* fall into ... */
1703 err:
1704 funlockfile(stdout);
1705 return r;
1706 }
1707
1708 /* [optional] Return Memory Node closest to cpu */
cpuset_cpu2node(int cpu)1709 int cpuset_cpu2node(int cpu)
1710 {
1711 int ncpus = cpuset_cpus_nbits();
1712 int nmems = cpuset_mems_nbits();
1713 struct bitmask *cpus = NULL, *mems = NULL;
1714 int r = -1;
1715
1716 if (check() < 0)
1717 goto err;
1718
1719 if ((cpus = bitmask_alloc(ncpus)) == NULL)
1720 goto err;
1721 bitmask_setbit(cpus, cpu);
1722
1723 if ((mems = bitmask_alloc(nmems)) == NULL)
1724 goto err;
1725 cpuset_localmems(cpus, mems);
1726 r = bitmask_first(mems);
1727 /* fall into ... */
1728 err:
1729 bitmask_free(cpus);
1730 bitmask_free(mems);
1731 return r;
1732 }
1733
/*
 * apply_cpuset_settings() - write the dirty, valid fields of 'cp'
 * to the per-cpuset files below directory 'path'.
 *
 * Only fields that are both *_valid and *_dirty are written.
 * Optional kernel features (memory_migrate, memory_pressure_enabled,
 * memory_spread_*, sched_*) are written only if the corresponding
 * file exists.  The 'cpus' and 'mems' masks are written last, after
 * all flag files.
 *
 * Returns 0 on success, -1 on the first failed write (errno set by
 * the store_* callee).
 */
static int apply_cpuset_settings(const char *path, const struct cpuset *cp)
{
	if (cp->cpu_exclusive_valid && cp->cpu_exclusive_dirty) {
		if (store_flag(path, "cpu_exclusive", cp->cpu_exclusive) < 0)
			goto err;
	}

	if (cp->mem_exclusive_valid && cp->mem_exclusive_dirty) {
		if (store_flag(path, "mem_exclusive", cp->mem_exclusive) < 0)
			goto err;
	}

	if (cp->mem_hardwall_valid && cp->mem_hardwall_dirty) {
		if (store_flag(path, "mem_hardwall", cp->mem_hardwall) < 0)
			goto err;
	}

	if (cp->notify_on_release_valid && cp->notify_on_release_dirty) {
		if (store_flag(path, "notify_on_release", cp->notify_on_release)
		    < 0)
			goto err;
	}

	if (cp->memory_migrate_valid &&
	    cp->memory_migrate_dirty && exists_flag(path, "memory_migrate")) {
		if (store_flag(path, "memory_migrate", cp->memory_migrate) < 0)
			goto err;
	}

	if (cp->memory_pressure_enabled_valid &&
	    cp->memory_pressure_enabled_dirty &&
	    exists_flag(path, "memory_pressure_enabled")) {
		if (store_flag
		    (path, "memory_pressure_enabled",
		     cp->memory_pressure_enabled) < 0)
			goto err;
	}

	if (cp->memory_spread_page_valid &&
	    cp->memory_spread_page_dirty &&
	    exists_flag(path, "memory_spread_page")) {
		if (store_flag
		    (path, "memory_spread_page", cp->memory_spread_page) < 0)
			goto err;
	}

	if (cp->memory_spread_slab_valid &&
	    cp->memory_spread_slab_dirty &&
	    exists_flag(path, "memory_spread_slab")) {
		if (store_flag
		    (path, "memory_spread_slab", cp->memory_spread_slab) < 0)
			goto err;
	}

	if (cp->sched_load_balance_valid &&
	    cp->sched_load_balance_dirty &&
	    exists_flag(path, "sched_load_balance")) {
		if (store_flag
		    (path, "sched_load_balance", cp->sched_load_balance) < 0)
			goto err;
	}

	if (cp->sched_relax_domain_level_valid &&
	    cp->sched_relax_domain_level_dirty &&
	    exists_flag(path, "sched_relax_domain_level")) {
		if (store_number
		    (path, "sched_relax_domain_level",
		     cp->sched_relax_domain_level) < 0)
			goto err;
	}

	/* masks last, after the flags that constrain them */
	if (cp->cpus_valid && cp->cpus_dirty) {
		if (store_mask(path, "cpus", cp->cpus) < 0)
			goto err;
	}

	if (cp->mems_valid && cp->mems_dirty) {
		if (store_mask(path, "mems", cp->mems) < 0)
			goto err;
	}
	return 0;
err:
	return -1;
}
1818
1819 /*
1820 * get_siblings() - helper routine for cpuset_would_crash_kernel(), below.
1821 *
1822 * Extract max value of any 'siblings' field in /proc/cpuinfo.
1823 * Cache the result - only need to extract once in lifetime of task.
1824 *
1825 * The siblings field is the number of logical CPUs in a physical
1826 * processor package. It is equal to the product of the number of
1827 * cores in that package, times the number of hyper-threads per core.
1828 * The bug that cpuset_would_crash_kernel() is detecting arises
1829 * when a cpu_exclusive cpuset tries to include just some, not all,
1830 * of the sibling logical CPUs available in a processor package.
1831 *
1832 * In the improbable case that a system has mixed values of siblings
1833 * (some processor packages have more than others, perhaps due to
 * partially enabling Hyper-Threading), we take the worst-case value,
1835 * the largest siblings value. This might be overkill. I don't know
1836 * if this kernel bug considers each processor package's siblings
1837 * separately or not. But it sure is easier this way ...
1838 *
1839 * This routine takes about 0.7 msecs on a 4 CPU 2.8 MHz Xeon, from
1840 * open to close, the first time called.
1841 */
1842
/*
 * get_siblings() - return the largest 'siblings' value found in
 * /proc/cpuinfo; see the comment above for why.
 *
 * The result is cached in a function-static for the life of the
 * process.  If /proc/cpuinfo can't be opened, guesses 4; if no
 * siblings line is found (old kernels), returns 1.
 */
static int get_siblings()
{
	static int siblings;
	char line[32];		/* big enough for one 'siblings' line */
	FILE *fp;
	int s;

	if (siblings)
		return siblings;

	fp = fopen("/proc/cpuinfo", "r");
	if (fp == NULL)
		return 4;	/* wing it - /proc not mounted ? */

	/* keep the maximum siblings value seen on any line */
	while (flgets(line, sizeof(line), fp) != NULL)
		if (sscanf(line, "siblings : %d", &s) == 1 && s > siblings)
			siblings = s;
	fclose(fp);

	if (siblings == 0)
		siblings = 1;	/* old kernel, no siblings, default to 1 */
	return siblings;
}
1867
1868 /*
1869 * Some 2.6.16 and 2.6.17 kernel versions have a bug in the dynamic
1870 * scheduler domain code invoked for cpu_exclusive cpusets that causes
1871 * the kernel to freeze, requiring a hardware reset.
1872 *
1873 * On kernels built with CONFIG_SCHED_MC enabled, if a 'cpu_exclusive'
1874 * cpuset is defined where that cpusets 'cpus' are not on package
1875 * boundaries then the kernel will freeze, usually as soon as this
1876 * cpuset is created, requiring a hardware reset.
1877 *
1878 * A cpusets 'cpus' are not on package boundaries if the cpuset
1879 * includes a proper non-empty subset (some, but not all) of the
1880 * logical cpus on a processor package. This requires multiple
1881 * logical CPUs per package, available with either Hyper-Thread or
1882 * Multi-Core support. Without one of these features, there is only
1883 * one logical CPU per physical package, and it's not possible to
1884 * have a proper, non-empty subset of a set of cardinality one.
1885 *
1886 * SUSE SLES10 kernels, as first released, only enable CONFIG_SCHED_MC
1887 * on i386 and x86_64 arch's.
1888 *
1889 * The objective of this routine cpuset_would_crash_kernel() is to
1890 * determine if a proposed cpuset setting would crash the kernel due
1891 * to this bug, so that the caller can avoid the crash.
1892 *
1893 * Ideally we'd check for exactly these conditions here, but computing
1894 * the package (identified by the 'physical id' field of /proc/cpuinfo)
1895 * of each cpu in a cpuset is more effort than it's worth here.
1896 *
1897 * Also there is no obvious way to identify exactly whether the kernel
1898 * one is executing on has this bug, short of trying it, and seeing
1899 * if the kernel just crashed.
1900 *
1901 * So for now, we look for a simpler set of conditions, that meets
1902 * our immediate need - avoid this crash on SUSE SLES10 systems that
1903 * are susceptible to it. We look for the kernel version 2.6.16.*,
1904 * which is the base kernel of SUSE SLES10, and for i386 or x86_64
1905 * processors, which had CONFIG_SCHED_MC enabled.
1906 *
1907 * If these simpler conditions are met, we further simplify the check,
1908 * by presuming that the logical CPUs are numbered on processor
1909 * package boundaries. If each package has S siblings, we assume
1910 * that CPUs numbered N through N + S -1 are on the same package,
1911 * for any CPU N such that N mod S == 0.
1912 *
1913 * Yes, this is a hack, focused on avoiding kernel freezes on
1914 * susceptible SUSE SLES10 systems.
1915 */
1916
/*
 * cpuset_would_crash_kernel() - return 1 if applying cpuset 'cp'
 * could trigger the cpu_exclusive scheduler-domain kernel freeze
 * described in the comment above, else 0.
 *
 * Only cpu_exclusive cpusets on 2.6.16.* i386/x86_64 kernels are
 * suspect; the uname() probe result is cached for the life of the
 * process in 'susceptible_system'.  A failed uname() conservatively
 * reports "would crash".
 */
static int cpuset_would_crash_kernel(const struct cpuset *cp)
{
	static int susceptible_system = -1;

	if (!cp->cpu_exclusive)
		goto ok;

	if (susceptible_system == -1) {
		struct utsname u;
		int rel_2_6_16, arch_i386, arch_x86_64;

		if (uname(&u) < 0)
			goto fail;
		rel_2_6_16 = strprefix(u.release, "2.6.16.");
		arch_i386 = streq(u.machine, "i386");
		arch_x86_64 = streq(u.machine, "x86_64");
		susceptible_system = rel_2_6_16 && (arch_i386 || arch_x86_64);
	}

	if (susceptible_system) {
		int ncpus = cpuset_cpus_nbits();
		int siblings = get_siblings();
		unsigned int cpu;

		/* assume cpus N .. N+siblings-1 (N mod siblings == 0)
		 * share one package - see comment above */
		for (cpu = 0; cpu < (unsigned int)ncpus; cpu += siblings) {
			int s, num_set = 0;

			for (s = 0; s < siblings; s++) {
				if (bitmask_isbitset(cp->cpus, cpu + s))
					num_set++;
			}

			/* If none or all siblings set, we're still ok */
			if (num_set == 0 || num_set == siblings)
				continue;

			/* Found one that would crash kernel. Fail. */
			errno = ENXIO;
			goto fail;
		}
	}
	/* If not susceptible, or if all ok, fall into "ok" ... */
ok:
	return 0;		/* would not crash */
fail:
	return 1;		/* would crash */
}
1964
1965 /* compare two cpuset and mark the dirty variable */
mark_dirty_variable(struct cpuset * cp1,const struct cpuset * cp2)1966 static void mark_dirty_variable(struct cpuset *cp1, const struct cpuset *cp2)
1967 {
1968 if (cp1->cpu_exclusive_valid &&
1969 cp1->cpu_exclusive != cp2->cpu_exclusive)
1970 cp1->cpu_exclusive_dirty = 1;
1971
1972 if (cp1->mem_exclusive_valid &&
1973 cp1->mem_exclusive != cp2->mem_exclusive)
1974 cp1->mem_exclusive_dirty = 1;
1975
1976 if (cp1->mem_hardwall_valid && cp1->mem_hardwall != cp2->mem_hardwall)
1977 cp1->mem_hardwall_dirty = 1;
1978
1979 if (cp1->notify_on_release_valid &&
1980 cp1->notify_on_release != cp2->notify_on_release)
1981 cp1->notify_on_release_dirty = 1;
1982
1983 if (cp1->memory_migrate_valid &&
1984 cp1->memory_migrate != cp2->memory_migrate)
1985 cp1->memory_migrate_dirty = 1;
1986
1987 if (cp1->memory_pressure_enabled_valid &&
1988 cp1->memory_pressure_enabled != cp2->memory_pressure_enabled)
1989 cp1->memory_pressure_enabled_dirty = 1;
1990
1991 if (cp1->memory_spread_page_valid &&
1992 cp1->memory_spread_page != cp2->memory_spread_page)
1993 cp1->memory_spread_page_dirty = 1;
1994
1995 if (cp1->memory_spread_slab_valid &&
1996 cp1->memory_spread_slab != cp2->memory_spread_slab)
1997 cp1->memory_spread_slab_dirty = 1;
1998
1999 if (cp1->sched_load_balance_valid &&
2000 cp1->sched_load_balance != cp2->sched_load_balance)
2001 cp1->sched_load_balance_dirty = 1;
2002
2003 if (cp1->sched_relax_domain_level_valid &&
2004 cp1->sched_relax_domain_level != cp2->sched_relax_domain_level)
2005 cp1->sched_relax_domain_level_dirty = 1;
2006
2007 if (cp1->cpus_valid && !bitmask_equal(cp1->cpus, cp2->cpus))
2008 cp1->cpus_dirty = 1;
2009 if (cp1->mems_valid && !bitmask_equal(cp1->mems, cp2->mems))
2010 cp1->mems_dirty = 1;
2011 }
2012
2013 /* Create (if new set) or modify cpuset 'cp' at location 'relpath' */
/*
 * cr_or_mod() - create (new != 0) or modify (new == 0) the cpuset
 * at 'relpath' to the settings in 'cp'.
 *
 * On error the work is rolled back: a newly created directory is
 * removed, and previously applied settings are restored from cp_sav
 * (the pre-change state, with only the fields that differ from 'cp'
 * marked dirty so only those are rewritten).  errno from the
 * original failure is preserved across the rollback.
 *
 * Returns 0 on success, -1 on error with errno set.
 */
static int cr_or_mod(const char *relpath, const struct cpuset *cp, int new)
{
	char buf[PATH_MAX];
	int do_rmdir_on_err = 0;
	int do_restore_cp_sav_on_err = 0;
	struct cpuset *cp_sav = NULL;
	int sav_errno;

	if (check() < 0)
		goto err;

	/* refuse settings that would freeze susceptible kernels */
	if (cpuset_would_crash_kernel(cp))
		goto err;

	fullpath(buf, sizeof(buf), relpath);

	if (new) {
		if (mkdir(buf, 0755) < 0)
			goto err;
		/* we made it, so we should remove it on error */
		do_rmdir_on_err = 1;
	}

	if ((cp_sav = cpuset_alloc()) == NULL)
		goto err;
	if (cpuset_query(cp_sav, relpath) < 0)
		goto err;
	/* we have old settings to restore on error */
	do_restore_cp_sav_on_err = 1;

	/* check which variable need to restore on error */
	mark_dirty_variable(cp_sav, cp);

	if (apply_cpuset_settings(buf, cp) < 0)
		goto err;

	cpuset_free(cp_sav);
	return 0;
err:
	/* roll back in reverse order, preserving the original errno */
	sav_errno = errno;
	if (do_restore_cp_sav_on_err)
		(void)apply_cpuset_settings(buf, cp_sav);
	if (cp_sav)
		cpuset_free(cp_sav);
	if (do_rmdir_on_err)
		(void)rmdir(buf);
	errno = sav_errno;
	return -1;
}
2063
2064 /* Create cpuset 'cp' at location 'relpath' */
/* Returns 0 on success, -1 on error with errno set (see cr_or_mod()). */
int cpuset_create(const char *relpath, const struct cpuset *cp)
{
	return cr_or_mod(relpath, cp, 1);
}
2069
2070 /* Delete cpuset at location 'path' (if empty) */
cpuset_delete(const char * relpath)2071 int cpuset_delete(const char *relpath)
2072 {
2073 char buf[PATH_MAX];
2074
2075 if (check() < 0)
2076 goto err;
2077
2078 fullpath(buf, sizeof(buf), relpath);
2079 if (rmdir(buf) < 0)
2080 goto err;
2081
2082 return 0;
2083 err:
2084 return -1;
2085 }
2086
2087 /* Set cpuset cp to the cpuset at location 'path' */
/*
 * cpuset_query() - fill 'cp' from the cpuset at 'relpath'.
 *
 * Mandatory files (cpu_exclusive, mem_exclusive, notify_on_release,
 * cpus, mems) must load or the call fails.  Files for optional
 * kernel features are loaded only when present, setting the
 * corresponding *_valid flag in 'cp'.
 *
 * Returns 0 on success, -1 on error with errno set by the load_*
 * callees.
 */
int cpuset_query(struct cpuset *cp, const char *relpath)
{
	char buf[PATH_MAX];

	if (check() < 0)
		goto err;

	fullpath(buf, sizeof(buf), relpath);

	if (load_flag(buf, &cp->cpu_exclusive, "cpu_exclusive") < 0)
		goto err;
	cp->cpu_exclusive_valid = 1;

	if (load_flag(buf, &cp->mem_exclusive, "mem_exclusive") < 0)
		goto err;
	cp->mem_exclusive_valid = 1;

	if (load_flag(buf, &cp->notify_on_release, "notify_on_release") < 0)
		goto err;
	cp->notify_on_release_valid = 1;

	/* optional files below - only load the ones this kernel has */
	if (exists_flag(buf, "memory_migrate")) {
		if (load_flag(buf, &cp->memory_migrate, "memory_migrate") < 0)
			goto err;
		cp->memory_migrate_valid = 1;
	}

	if (exists_flag(buf, "mem_hardwall")) {
		if (load_flag(buf, &cp->mem_hardwall, "mem_hardwall") < 0)
			goto err;
		cp->mem_hardwall_valid = 1;
	}

	if (exists_flag(buf, "memory_pressure_enabled")) {
		if (load_flag
		    (buf, &cp->memory_pressure_enabled,
		     "memory_pressure_enabled") < 0)
			goto err;
		cp->memory_pressure_enabled_valid = 1;
	}

	if (exists_flag(buf, "memory_spread_page")) {
		if (load_flag
		    (buf, &cp->memory_spread_page, "memory_spread_page") < 0)
			goto err;
		cp->memory_spread_page_valid = 1;
	}

	if (exists_flag(buf, "memory_spread_slab")) {
		if (load_flag
		    (buf, &cp->memory_spread_slab, "memory_spread_slab") < 0)
			goto err;
		cp->memory_spread_slab_valid = 1;
	}

	if (exists_flag(buf, "sched_load_balance")) {
		if (load_flag
		    (buf, &cp->sched_load_balance, "sched_load_balance") < 0)
			goto err;
		cp->sched_load_balance_valid = 1;
	}

	if (exists_flag(buf, "sched_relax_domain_level")) {
		if (load_number
		    (buf, &cp->sched_relax_domain_level,
		     "sched_relax_domain_level") < 0)
			goto err;
		cp->sched_relax_domain_level_valid = 1;
	}

	if (load_mask(buf, &cp->cpus, cpuset_cpus_nbits(), "cpus") < 0)
		goto err;
	cp->cpus_valid = 1;

	if (load_mask(buf, &cp->mems, cpuset_mems_nbits(), "mems") < 0)
		goto err;
	cp->mems_valid = 1;

	return 0;
err:
	return -1;
}
2170
2171 /* Modify cpuset at location 'relpath' to values of 'cp' */
int cpuset_modify(const char *relpath, const struct cpuset *cp)
{
	/* Third arg 0: operate on an existing cpuset rather than create
	 * a new one (NOTE(review): inferred from cr_or_mod's name and the
	 * cpuset_create/cpuset_modify pairing - confirm against cr_or_mod) */
	return cr_or_mod(relpath, cp, 0);
}
2176
2177 /* Get cpuset path of pid into buf */
char *cpuset_getcpusetpath(pid_t pid, char *buf, size_t size)
{
	int fd;
	int n;

	if (check() < 0)
		return NULL;

	/* Borrow the caller's buf[] to build the /proc cpuset file name. */
	if (pid == 0)
		n = snprintf(buf, size, "/proc/self/cpuset");
	else
		n = snprintf(buf, size, "/proc/%d/cpuset", pid);
	if (n >= (int)size) {
		errno = E2BIG;
		return NULL;
	}

	fd = open(buf, O_RDONLY);
	if (fd < 0) {
		int sav_errno = errno;

		if (sav_errno == ENOENT)
			sav_errno = ESRCH;	/* no such pid */
		/* distinguish "no such pid" from "no cpuset support" */
		fd = open("/proc/self/cpuset", O_RDONLY);
		if (fd < 0)
			sav_errno = ENOSYS;
		else
			close(fd);
		errno = sav_errno;
		return NULL;
	}

	n = read(fd, buf, size);
	close(fd);
	if (n < 0)
		return NULL;
	if (n >= (int)size) {
		errno = E2BIG;	/* no room left for terminating NUL */
		return NULL;
	}
	buf[n] = '\0';
	chomp(buf);
	return buf;
}
2219
2220 /* Get cpuset 'cp' of pid */
int cpuset_cpusetofpid(struct cpuset *cp, pid_t pid)
{
	char path[PATH_MAX];

	/* resolve pid's cpuset path, then load that cpuset's settings */
	if (cpuset_getcpusetpath(pid, path, sizeof(path)) == NULL ||
	    cpuset_query(cp, path) < 0)
		return -1;
	return 0;
}
2231
2232 /* [optional] Return mountpoint of cpuset filesystem */
cpuset_mountpoint()2233 const char *cpuset_mountpoint()
2234 {
2235 if (check() < 0) {
2236 switch (errno) {
2237 case ENODEV:
2238 return "[cpuset filesystem not mounted]";
2239 default:
2240 return "[cpuset filesystem not supported]";
2241 }
2242 }
2243 return cpusetmnt;
2244 }
2245
2246 /* Return true if path is a directory. */
/* Return true if path is a directory. */
static int isdir(const char *path)
{
	struct stat sb;

	/* a failed stat() counts as "not a directory" */
	return stat(path, &sb) == 0 && S_ISDIR(sb.st_mode);
}
2255
2256 /*
2257 * [optional] cpuset_collides_exclusive() - True if would collide exclusive.
2258 *
2259 * Return true iff the specified cpuset would overlap with any
2260 * sibling cpusets in either cpus or mems, where either this
2261 * cpuset or the sibling is cpu_exclusive or mem_exclusive.
2262 *
2263 * cpuset_create() fails with errno == EINVAL if the requested cpuset
2264 * would overlap with any sibling, where either one is cpu_exclusive or
2265 * mem_exclusive. This is a common, and not obvious error. The
2266 * following routine checks for this particular case, so that code
2267 * creating cpusets can better identify the situation, perhaps to issue
2268 * a more informative error message.
2269 *
2270 * Can also be used to diagnose cpuset_modify failures. This
2271 * routine ignores any existing cpuset with the same path as the
2272 * given 'cpusetpath', and only looks for exclusive collisions with
2273 * sibling cpusets of that path.
2274 *
2275 * In case of any error, returns (0) -- does not collide. Presumably
2276 * any actual attempt to create or modify a cpuset will encounter the
2277 * same error, and report it usefully.
2278 *
2279 * This routine is not particularly efficient; most likely code creating or
2280 * modifying a cpuset will want to try the operation first, and then if that
2281 * fails with errno EINVAL, perhaps call this routine to determine if an
2282 * exclusive cpuset collision caused the error.
2283 */
2284
cpuset_collides_exclusive(const char * cpusetpath,const struct cpuset * cp1)2285 int cpuset_collides_exclusive(const char *cpusetpath, const struct cpuset *cp1)
2286 {
2287 char parent[PATH_MAX];
2288 char *p;
2289 char *pathcopy = NULL;
2290 char *base;
2291 DIR *dir = NULL;
2292 struct dirent *dent;
2293 struct cpuset *cp2 = NULL;
2294 struct bitmask *cpus1 = NULL, *cpus2 = NULL;
2295 struct bitmask *mems1 = NULL, *mems2 = NULL;
2296 int ret;
2297
2298 if (check() < 0)
2299 goto err;
2300
2301 fullpath(parent, sizeof(parent), cpusetpath);
2302 if (streq(parent, cpusetmnt))
2303 goto err; /* only one cpuset root - can't collide */
2304 pathcopy = strdup(parent);
2305 p = strrchr(parent, '/');
2306 if (!p)
2307 goto err; /* huh? - impossible - run and hide */
2308 *p = 0; /* now parent is dirname of fullpath */
2309
2310 p = strrchr(pathcopy, '/');
2311 base = p + 1; /* now base is basename of fullpath */
2312 if (!*base)
2313 goto err; /* this is also impossible - run away */
2314
2315 if ((dir = opendir(parent)) == NULL)
2316 goto err;
2317 if ((cp2 = cpuset_alloc()) == NULL)
2318 goto err;
2319 if ((cpus1 = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
2320 goto err;
2321 if ((cpus2 = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
2322 goto err;
2323 if ((mems1 = bitmask_alloc(cpuset_mems_nbits())) == NULL)
2324 goto err;
2325 if ((mems2 = bitmask_alloc(cpuset_mems_nbits())) == NULL)
2326 goto err;
2327
2328 while ((dent = readdir(dir)) != NULL) {
2329 char child[PATH_MAX];
2330
2331 if (streq(dent->d_name, ".") || streq(dent->d_name, ".."))
2332 continue;
2333 if (streq(dent->d_name, base))
2334 continue;
2335 pathcat2(child, sizeof(child), parent, dent->d_name);
2336 if (!isdir(child))
2337 continue;
2338 if (cpuset_query(cp2, child + strlen(cpusetmnt)) < 0)
2339 goto err;
2340 if (cp1->cpu_exclusive || cp2->cpu_exclusive) {
2341 cpuset_getcpus(cp1, cpus1);
2342 cpuset_getcpus(cp2, cpus2);
2343 if (bitmask_intersects(cpus1, cpus2))
2344 goto collides;
2345 }
2346 if (cp1->mem_exclusive || cp2->mem_exclusive) {
2347 cpuset_getmems(cp1, mems1);
2348 cpuset_getmems(cp2, mems2);
2349 if (bitmask_intersects(mems1, mems2))
2350 goto collides;
2351 }
2352 }
2353 err:
2354 /* error, or did not collide */
2355 ret = 0;
2356 goto done;
2357 collides:
2358 /* collides */
2359 ret = 1;
2360 /* fall into ... */
2361 done:
2362 if (dir)
2363 closedir(dir);
2364 cpuset_free(cp2);
2365 free(pathcopy);
2366 bitmask_free(cpus1);
2367 bitmask_free(cpus2);
2368 bitmask_free(mems1);
2369 bitmask_free(mems2);
2370 return ret;
2371 }
2372
2373 /*
 * [optional] cpuset_nuke() - Remove cpuset any way possible
2375 *
2376 * Remove a cpuset, including killing tasks in it, and
2377 * removing any descendent cpusets and killing their tasks.
2378 *
2379 * Tasks can take a long time (minutes on some configurations)
2380 * to exit. Loop up to 'seconds' seconds, trying to kill them.
2381 *
2382 * How we do it:
2383 * 1) First, kill all the pids, looping until there are
2384 * no more pids in this cpuset or below, or until the
2385 * 'seconds' timeout limit is exceeded.
2386 * 2) Then depth first recursively rmdir the cpuset directories.
2387 * 3) If by this point the original cpuset is gone, we succeeded.
2388 *
2389 * If the timeout is exceeded, and tasks still exist, fail with
2390 * errno == ETIME.
2391 *
2392 * We sleep a variable amount of time. After the first attempt to
2393 * kill all the tasks in the cpuset or its descendents, we sleep 1
2394 * second, the next time 2 seconds, increasing 1 second each loop
2395 * up to a max of 10 seconds. If more loops past 10 are required
2396 * to kill all the tasks, we sleep 10 seconds each subsequent loop.
2397 * In any case, before the last loop, we sleep however many seconds
2398 * remain of the original timeout 'seconds' requested. The total
2399 * time of all sleeps will be no more than the requested 'seconds'.
2400 *
2401 * If the cpuset started out empty of any tasks, or if the passed in
2402 * 'seconds' was zero, then this routine will return quickly, having
2403 * not slept at all. Otherwise, this routine will at a minimum send
2404 * a SIGKILL to all the tasks in this cpuset subtree, then sleep one
2405 * second, before looking to see if any tasks remain. If tasks remain
2406 * in the cpuset subtree, and a longer 'seconds' timeout was requested
2407 * (more than one), it will continue to kill remaining tasks and sleep,
2408 * in a loop, for as long as time and tasks remain.
2409 *
2410 * The signal sent for the kill is hardcoded to SIGKILL (9). If some
2411 * other signal should be sent first, use a separate code loop,
2412 * perhaps based on cpuset_init_pidlist and cpuset_get_pidlist, to
2413 * scan the task pids in a cpuset. If SIGKILL should -not- be sent,
2414 * this cpuset_nuke() routine can still be called to recursively
2415 * remove a cpuset subtree, by specifying a timeout of zero 'seconds'.
2416 *
2417 * On success, returns 0 with errno == 0.
2418 *
2419 * On failure, returns -1, with errno possibly one of:
2420 * EACCES - search permission denied on intervening directory
2421 * ETIME - timed out - tasks remain after 'seconds' timeout
2422 * EMFILE - too many open files
2423 * ENODEV - /dev/cpuset not mounted
2424 * ENOENT - component of cpuset path doesn't exist
2425 * ENOMEM - out of memory
2426 * ENOSYS - kernel doesn't support cpusets
2427 * ENOTDIR - component of cpuset path is not a directory
2428 * EPERM - lacked permission to kill a task
2429 * EPERM - lacked permission to read cpusets or files therein
2430 */
2431
2432 void cpuset_fts_reverse(struct cpuset_fts_tree *cs_tree);
2433
cpuset_nuke(const char * relpath,unsigned int seconds)2434 int cpuset_nuke(const char *relpath, unsigned int seconds)
2435 {
2436 unsigned int secs_left = seconds; /* total sleep seconds left */
2437 unsigned int secs_loop = 1; /* how much sleep next loop */
2438 unsigned int secs_slept; /* seconds slept in sleep() */
2439 struct cpuset_pidlist *pl = NULL; /* pids in cpuset subtree */
2440 struct cpuset_fts_tree *cs_tree;
2441 const struct cpuset_fts_entry *cs_entry;
2442 int ret, sav_errno = 0;
2443
2444 if (check() < 0)
2445 return -1;
2446
2447 if (seconds == 0)
2448 goto rmdir_cpusets;
2449
2450 while (1) {
2451 int plen, j;
2452
2453 if ((pl = cpuset_init_pidlist(relpath, 1)) == NULL) {
2454 /* missing cpuset is as good as if already nuked */
2455 if (errno == ENOENT) {
2456 ret = 0;
2457 goto no_more_cpuset;
2458 }
2459
2460 /* other problems reading cpuset are bad news */
2461 sav_errno = errno;
2462 goto failed;
2463 }
2464
2465 if ((plen = cpuset_pidlist_length(pl)) == 0)
2466 goto rmdir_cpusets;
2467
2468 for (j = 0; j < plen; j++) {
2469 pid_t pid;
2470
2471 if ((pid = cpuset_get_pidlist(pl, j)) > 1) {
2472 if (kill(pid, SIGKILL) < 0 && errno != ESRCH) {
2473 sav_errno = errno;
2474 goto failed;
2475 }
2476 }
2477 }
2478
2479 if (secs_left == 0)
2480 goto took_too_long;
2481
2482 cpuset_freepidlist(pl);
2483 pl = NULL;
2484
2485 secs_slept = secs_loop - sleep(secs_loop);
2486
2487 /* Ensure forward progress */
2488 if (secs_slept == 0)
2489 secs_slept = 1;
2490
2491 /* Ensure sane sleep() return (unnecessary?) */
2492 if (secs_slept > secs_loop)
2493 secs_slept = secs_loop;
2494
2495 secs_left -= secs_slept;
2496
2497 if (secs_loop < 10)
2498 secs_loop++;
2499
2500 secs_loop = MIN(secs_left, secs_loop);
2501 }
2502
2503 took_too_long:
2504 sav_errno = ETIME;
2505 /* fall into ... */
2506 failed:
2507 cpuset_freepidlist(pl);
2508 errno = sav_errno;
2509 return -1;
2510
2511 rmdir_cpusets:
2512 /* Let's try removing cpuset(s) now. */
2513 cpuset_freepidlist(pl);
2514
2515 if ((cs_tree = cpuset_fts_open(relpath)) == NULL && errno != ENOENT)
2516 return -1;
2517 ret = 0;
2518 cpuset_fts_reverse(cs_tree); /* rmdir's must be done bottom up */
2519 while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL) {
2520 char buf[PATH_MAX];
2521
2522 fullpath(buf, sizeof(buf), cpuset_fts_get_path(cs_entry));
2523 if (rmdir(buf) < 0 && errno != ENOENT) {
2524 sav_errno = errno;
2525 ret = -1;
2526 }
2527 }
2528 cpuset_fts_close(cs_tree);
2529 /* fall into ... */
2530 no_more_cpuset:
2531 if (ret == 0)
2532 errno = 0;
2533 else
2534 errno = sav_errno;
2535 return ret;
2536 }
2537
2538 /*
2539 * When recursively reading all the tasks files from a subtree,
2540 * chain together the read results, one pidblock per tasks file,
2541 * containing the raw unprocessed ascii as read(2) in. After
2542 * we gather up this raw data, we then go back to count how
2543 * many pid's there are in total, allocate an array of pid_t
2544 * of that size, and transform the raw ascii data into this
2545 * array of pid_t's.
2546 */
2547
struct pidblock {
	char *buf;		/* raw ascii contents of one tasks file */
	int buflen;		/* number of bytes in buf */
	struct pidblock *next;	/* chain link; one pidblock per tasks file */
};
2553
2554 /*
2555 * Chain the raw contents of a file onto the pbhead list.
2556 *
2557 * We malloc "+ 1" extra byte for a nul-terminator, so that
2558 * the strtoul() loop in pid_transform() won't scan past
2559 * the end of pb->buf[] and accidentally find more pids.
2560 */
add_pidblock(const char * file,struct pidblock ** ppbhead)2561 static void add_pidblock(const char *file, struct pidblock **ppbhead)
2562 {
2563 FILE *fp = NULL;
2564 struct pidblock *pb = NULL;
2565 int fsz;
2566
2567 if ((fp = fopen(file, "r")) == NULL)
2568 goto err;
2569 fsz = filesize(fp);
2570 if (fsz == 0)
2571 goto err;
2572 if ((pb = calloc(1, sizeof(*pb))) == NULL)
2573 goto err;
2574 pb->buflen = fsz;
2575 if ((pb->buf = malloc(pb->buflen + 1)) == NULL)
2576 goto err;
2577 if (fread(pb->buf, 1, pb->buflen, fp) > 0) {
2578 pb->buf[pb->buflen] = '\0';
2579 pb->next = *ppbhead;
2580 *ppbhead = pb;
2581 }
2582 fclose(fp);
2583 return;
2584 err:
2585 if (fp)
2586 fclose(fp);
2587 free(pb);
2588 }
2589
read_task_file(const char * relpath,struct pidblock ** ppbhead)2590 static void read_task_file(const char *relpath, struct pidblock **ppbhead)
2591 {
2592 char buf[PATH_MAX];
2593
2594 fullpath2(buf, sizeof(buf), relpath, "tasks");
2595 add_pidblock(buf, ppbhead);
2596 }
2597
struct cpuset_pidlist {
	pid_t *pids;		/* array of pids, sorted ascending */
	int npids;		/* number of entries in pids[] */
};
2602
2603 /* Count how many pids in buf (one per line - just count newlines) */
/* Count how many pids in buf (one per line - just count newlines) */
static int pidcount(const char *buf, int buflen)
{
	int count = 0;
	const char *p = buf;
	const char *end = buf + buflen;

	while ((p = memchr(p, '\n', end - p)) != NULL) {
		count++;
		p++;	/* resume scan just past this newline */
	}
	return count;
}
2615
2616 /* Transform one-per-line ascii pids in pb to pid_t entries in pl */
pid_transform(struct pidblock * pb,struct cpuset_pidlist * pl,int n)2617 static int pid_transform(struct pidblock *pb, struct cpuset_pidlist *pl, int n)
2618 {
2619 char *a, *b;
2620
2621 for (a = pb->buf; a < pb->buf + pb->buflen; a = b) {
2622 pid_t p = strtoul(a, &b, 10);
2623 if (a == b)
2624 break;
2625 pl->pids[n++] = p;
2626 }
2627 return n;
2628 }
2629
free_pidblocks(struct pidblock * pbhead)2630 static void free_pidblocks(struct pidblock *pbhead)
2631 {
2632 struct pidblock *pb, *nextpb;
2633
2634 for (pb = pbhead; pb; pb = nextpb) {
2635 nextpb = pb->next;
2636 free(pb->buf);
2637 free(pb);
2638 }
2639 }
2640
2641 /* numeric comparison routine for qsort */
/* numeric comparison routine for qsort */
static int numericsort(const void *m1, const void *m2)
{
	pid_t p1 = *(const pid_t *)m1;
	pid_t p2 = *(const pid_t *)m2;

	/*
	 * (a > b) - (a < b) yields -1/0/1 without the signed-overflow
	 * hazard of the previous "p1 - p2" subtraction idiom.
	 */
	return (p1 > p2) - (p1 < p2);
}
2649
2650 /* Return list pids in cpuset 'path' */
cpuset_init_pidlist(const char * relpath,int recursiveflag)2651 struct cpuset_pidlist *cpuset_init_pidlist(const char *relpath,
2652 int recursiveflag)
2653 {
2654 struct pidblock *pb = NULL;
2655 struct cpuset_pidlist *pl = NULL;
2656 struct pidblock *pbhead = NULL;
2657 int n;
2658
2659 if (check() < 0)
2660 goto err;
2661
2662 if (recursiveflag) {
2663 struct cpuset_fts_tree *cs_tree;
2664 const struct cpuset_fts_entry *cs_entry;
2665
2666 if ((cs_tree = cpuset_fts_open(relpath)) == NULL)
2667 goto err;
2668 while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL) {
2669 if (cpuset_fts_get_info(cs_entry) != CPUSET_FTS_CPUSET)
2670 continue;
2671 read_task_file(cpuset_fts_get_path(cs_entry), &pbhead);
2672 }
2673 cpuset_fts_close(cs_tree);
2674 } else {
2675 read_task_file(relpath, &pbhead);
2676 }
2677
2678 if ((pl = calloc(1, sizeof(*pl))) == NULL)
2679 goto err;
2680 pl->npids = 0;
2681 for (pb = pbhead; pb; pb = pb->next)
2682 pl->npids += pidcount(pb->buf, pb->buflen);
2683 if ((pl->pids = calloc(pl->npids, sizeof(pid_t))) == NULL)
2684 goto err;
2685 n = 0;
2686 for (pb = pbhead; pb; pb = pb->next)
2687 n = pid_transform(pb, pl, n);
2688 free_pidblocks(pbhead);
2689 qsort(pl->pids, pl->npids, sizeof(pid_t), numericsort);
2690 return pl;
2691 err:
2692 cpuset_freepidlist(pl);
2693 free_pidblocks(pbhead);
2694 return NULL;
2695 }
2696
2697 /* Return number of elements in pidlist */
cpuset_pidlist_length(const struct cpuset_pidlist * pl)2698 int cpuset_pidlist_length(const struct cpuset_pidlist *pl)
2699 {
2700 if (pl)
2701 return pl->npids;
2702 else
2703 return 0;
2704 }
2705
2706 /* Return i'th element of pidlist */
cpuset_get_pidlist(const struct cpuset_pidlist * pl,int i)2707 pid_t cpuset_get_pidlist(const struct cpuset_pidlist * pl, int i)
2708 {
2709 if (pl && i >= 0 && i < pl->npids)
2710 return pl->pids[i];
2711 else
2712 return (pid_t) - 1;
2713 }
2714
2715 /* Free pidlist */
cpuset_freepidlist(struct cpuset_pidlist * pl)2716 void cpuset_freepidlist(struct cpuset_pidlist *pl)
2717 {
2718 if (pl && pl->pids)
2719 free(pl->pids);
2720 free(pl);
2721 }
2722
/* Write pid into the tasks file at 'path', attaching it to that cpuset. */
static int __cpuset_move(pid_t pid, const char *path)
{
	char buf[SMALL_BUFSZ];

	/*
	 * Fix: pid_t is a signed type; passing it for "%u" was a
	 * mismatched format specifier.  Widen to long and print signed.
	 */
	snprintf(buf, sizeof(buf), "%ld", (long)pid);
	return write_string_file(path, buf);
}
2730
2731 /* Move task (pid == 0 for current) to a cpuset */
/* Move task (pid == 0 for current) to a cpuset */
int cpuset_move(pid_t pid, const char *relpath)
{
	char taskfile[PATH_MAX];

	if (check() < 0)
		return -1;

	/* pid 0 is shorthand for the calling process */
	if (pid == 0)
		pid = getpid();

	fullpath2(taskfile, sizeof(taskfile), relpath, "tasks");
	return __cpuset_move(pid, taskfile);
}
2745
2746 /* Move all tasks in pidlist to a cpuset */
cpuset_move_all(struct cpuset_pidlist * pl,const char * relpath)2747 int cpuset_move_all(struct cpuset_pidlist *pl, const char *relpath)
2748 {
2749 int i;
2750 char buf[PATH_MAX];
2751 int ret;
2752
2753 if (check() < 0)
2754 return -1;
2755
2756 fullpath2(buf, sizeof(buf), relpath, "tasks");
2757
2758 ret = 0;
2759 for (i = 0; i < pl->npids; i++)
2760 if (__cpuset_move(pl->pids[i], buf) < 0)
2761 ret = -1;
2762 return ret;
2763 }
2764
2765 /*
2766 * [optional] cpuset_move_cpuset_tasks() - Move all tasks in a
2767 * cpuset to another cpuset
2768 *
2769 * Move all tasks in cpuset fromrelpath to cpuset torelpath. This may
2770 * race with tasks being added to or forking into fromrelpath. Loop
2771 * repeatedly, reading the tasks file of cpuset fromrelpath and writing
2772 * any task pid's found there to the tasks file of cpuset torelpath,
2773 * up to ten attempts, or until the tasks file of cpuset fromrelpath
2774 * is empty, or until fromrelpath is no longer present.
2775 *
2776 * Returns 0 with errno == 0 if able to empty the tasks file of cpuset
2777 * fromrelpath. Of course it is still possible that some independent
2778 * task could add another task to cpuset fromrelpath at the same time
2779 * that such a successful result is being returned, so there can be
2780 * no guarantee that a successful return means that fromrelpath is
2781 * still empty of tasks.
2782 *
2783 * We are careful to allow for the possibility that the cpuset
2784 * fromrelpath might disappear out from under us, perhaps because it
2785 * has notify_on_release set and gets automatically removed as soon
2786 * as we detach its last task from it. Consider a missing fromrelpath
2787 * to be a successful move.
2788 *
2789 * If called with fromrelpath and torelpath pathnames that evaluate to
2790 * the same cpuset, then treat that as if cpuset_reattach() was called,
2791 * rebinding each task in this cpuset one time, and return success or
2792 * failure depending on the return of that cpuset_reattach() call.
2793 *
2794 * On failure, returns -1, with errno possibly one of:
2795 * EACCES - search permission denied on intervening directory
2796 * ENOTEMPTY - tasks remain after multiple attempts to move them
2797 * EMFILE - too many open files
2798 * ENODEV - /dev/cpuset not mounted
2799 * ENOENT - component of cpuset path doesn't exist
2800 * ENOMEM - out of memory
2801 * ENOSYS - kernel doesn't support cpusets
2802 * ENOTDIR - component of cpuset path is not a directory
2803 * EPERM - lacked permission to kill a task
2804 * EPERM - lacked permission to read cpusets or files therein
2805 *
2806 * This is an [optional] function. Use cpuset_function to invoke it.
2807 */
2808
2809 #define NUMBER_MOVE_TASK_ATTEMPTS 10
2810
cpuset_move_cpuset_tasks(const char * fromrelpath,const char * torelpath)2811 int cpuset_move_cpuset_tasks(const char *fromrelpath, const char *torelpath)
2812 {
2813 char fromfullpath[PATH_MAX];
2814 char tofullpath[PATH_MAX];
2815 int i;
2816 struct cpuset_pidlist *pl = NULL;
2817 int sav_errno;
2818
2819 fullpath(fromfullpath, sizeof(fromfullpath), fromrelpath);
2820 fullpath(tofullpath, sizeof(tofullpath), torelpath);
2821
2822 if (samefile(fromfullpath, tofullpath))
2823 return cpuset_reattach(fromrelpath);
2824
2825 for (i = 0; i < NUMBER_MOVE_TASK_ATTEMPTS; i++) {
2826 int plen, j;
2827
2828 if ((pl = cpuset_init_pidlist(fromrelpath, 0)) == NULL) {
2829 /* missing cpuset is as good as if all moved */
2830 if (errno == ENOENT)
2831 goto no_more_cpuset;
2832
2833 /* other problems reading cpuset are bad news */
2834 sav_errno = errno;
2835 goto failed;
2836 }
2837
2838 if ((plen = cpuset_pidlist_length(pl)) == 0)
2839 goto no_more_pids;
2840
2841 for (j = 0; j < plen; j++) {
2842 pid_t pid;
2843
2844 pid = cpuset_get_pidlist(pl, j);
2845 if (cpuset_move(pid, torelpath) < 0) {
2846 /* missing task is as good as if moved */
2847 if (errno == ESRCH)
2848 continue;
2849
2850 /* other per-task errors are bad news */
2851 sav_errno = errno;
2852 goto failed;
2853 }
2854 }
2855
2856 cpuset_freepidlist(pl);
2857 pl = NULL;
2858 }
2859
2860 sav_errno = ENOTEMPTY;
2861 /* fall into ... */
2862 failed:
2863 cpuset_freepidlist(pl);
2864 errno = sav_errno;
2865 return -1;
2866
2867 no_more_pids:
2868 no_more_cpuset:
2869 /* Success - all tasks (or entire cpuset ;) gone. */
2870 cpuset_freepidlist(pl);
2871 errno = 0;
2872 return 0;
2873 }
2874
2875 /* Migrate task (pid == 0 for current) to a cpuset (moves task and memory) */
/* Migrate task (pid == 0 for current) to a cpuset (moves task and memory) */
int cpuset_migrate(pid_t pid, const char *relpath)
{
	char buf[PATH_MAX];
	char buf2[PATH_MAX];
	char memory_migrate_flag;
	int r;
	int sav_errno;

	if (check() < 0)
		return -1;

	if (pid == 0)
		pid = getpid();

	fullpath(buf2, sizeof(buf2), relpath);

	/* temporarily force memory_migrate on, remembering its old value */
	if (load_flag(buf2, &memory_migrate_flag, "memory_migrate") < 0)
		return -1;
	if (store_flag(buf2, "memory_migrate", 1) < 0)
		return -1;

	fullpath2(buf, sizeof(buf), relpath, "tasks");

	r = __cpuset_move(pid, buf);

	/*
	 * Fix: restoring the flag is best-effort, but it could clobber
	 * the errno describing why the move failed; preserve it.
	 */
	sav_errno = errno;
	store_flag(buf2, "memory_migrate", memory_migrate_flag);
	errno = sav_errno;
	return r;
}
2903
2904 /* Migrate all tasks in pidlist to a cpuset (moves task and memory) */
cpuset_migrate_all(struct cpuset_pidlist * pl,const char * relpath)2905 int cpuset_migrate_all(struct cpuset_pidlist *pl, const char *relpath)
2906 {
2907 int i;
2908 char buf[PATH_MAX];
2909 char buf2[PATH_MAX];
2910 char memory_migrate_flag;
2911 int ret;
2912
2913 if (check() < 0)
2914 return -1;
2915
2916 fullpath(buf2, sizeof(buf2), relpath);
2917
2918 if (load_flag(buf2, &memory_migrate_flag, "memory_migrate") < 0)
2919 return -1;
2920 if (store_flag(buf2, "memory_migrate", 1) < 0)
2921 return -1;
2922
2923 fullpath2(buf, sizeof(buf), relpath, "tasks");
2924
2925 ret = 0;
2926 for (i = 0; i < pl->npids; i++)
2927 if (__cpuset_move(pl->pids[i], buf) < 0)
2928 ret = -1;
2929
2930 if (store_flag(buf2, "memory_migrate", memory_migrate_flag) < 0)
2931 ret = -1;
2932 return ret;
2933 }
2934
2935 /* Rebind cpus_allowed of each task in cpuset 'path' */
cpuset_reattach(const char * relpath)2936 int cpuset_reattach(const char *relpath)
2937 {
2938 struct cpuset_pidlist *pl;
2939 int rc;
2940
2941 if ((pl = cpuset_init_pidlist(relpath, 0)) == NULL)
2942 return -1;
2943 rc = cpuset_move_all(pl, relpath);
2944 cpuset_freepidlist(pl);
2945 return rc;
2946 }
2947
2948 /* Map cpuset relative cpu number to system wide cpu number */
cpuset_c_rel_to_sys_cpu(const struct cpuset * cp,int cpu)2949 int cpuset_c_rel_to_sys_cpu(const struct cpuset *cp, int cpu)
2950 {
2951 struct cpuset *cp_tofree = NULL;
2952 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
2953 int pos = -1;
2954
2955 if (!cp1)
2956 goto err;
2957 pos = bitmask_rel_to_abs_pos(cp1->cpus, cpu);
2958 /* fall into ... */
2959 err:
2960 cpuset_free(cp_tofree);
2961 return pos;
2962 }
2963
2964 /* Map system wide cpu number to cpuset relative cpu number */
cpuset_c_sys_to_rel_cpu(const struct cpuset * cp,int cpu)2965 int cpuset_c_sys_to_rel_cpu(const struct cpuset *cp, int cpu)
2966 {
2967 struct cpuset *cp_tofree = NULL;
2968 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
2969 int pos = -1;
2970
2971 if (!cp1)
2972 goto err;
2973 pos = bitmask_abs_to_rel_pos(cp1->cpus, cpu);
2974 /* fall into ... */
2975 err:
2976 cpuset_free(cp_tofree);
2977 return pos;
2978 }
2979
2980 /* Map cpuset relative mem number to system wide mem number */
cpuset_c_rel_to_sys_mem(const struct cpuset * cp,int mem)2981 int cpuset_c_rel_to_sys_mem(const struct cpuset *cp, int mem)
2982 {
2983 struct cpuset *cp_tofree = NULL;
2984 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
2985 int pos = -1;
2986
2987 if (!cp1)
2988 goto err;
2989 pos = bitmask_rel_to_abs_pos(cp1->mems, mem);
2990 /* fall into ... */
2991 err:
2992 cpuset_free(cp_tofree);
2993 return pos;
2994 }
2995
2996 /* Map system wide mem number to cpuset relative mem number */
cpuset_c_sys_to_rel_mem(const struct cpuset * cp,int mem)2997 int cpuset_c_sys_to_rel_mem(const struct cpuset *cp, int mem)
2998 {
2999 struct cpuset *cp_tofree = NULL;
3000 const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
3001 int pos = -1;
3002
3003 if (!cp1)
3004 goto err;
3005 pos = bitmask_abs_to_rel_pos(cp1->mems, mem);
3006 /* fall into ... */
3007 err:
3008 cpuset_free(cp_tofree);
3009 return pos;
3010 }
3011
3012 /* Map pid's cpuset relative cpu number to system wide cpu number */
cpuset_p_rel_to_sys_cpu(pid_t pid,int cpu)3013 int cpuset_p_rel_to_sys_cpu(pid_t pid, int cpu)
3014 {
3015 struct cpuset *cp;
3016 int rc = -1;
3017
3018 if ((cp = cpuset_alloc()) == NULL)
3019 goto done;
3020 if (cpuset_cpusetofpid(cp, pid) < 0)
3021 goto done;
3022 rc = cpuset_c_rel_to_sys_cpu(cp, cpu);
3023 done:
3024 cpuset_free(cp);
3025 return rc;
3026 }
3027
3028 /* Map system wide cpu number to pid's cpuset relative cpu number */
cpuset_p_sys_to_rel_cpu(pid_t pid,int cpu)3029 int cpuset_p_sys_to_rel_cpu(pid_t pid, int cpu)
3030 {
3031 struct cpuset *cp;
3032 int rc = -1;
3033
3034 if ((cp = cpuset_alloc()) == NULL)
3035 goto done;
3036 if (cpuset_cpusetofpid(cp, pid) < 0)
3037 goto done;
3038 rc = cpuset_c_sys_to_rel_cpu(cp, cpu);
3039 done:
3040 cpuset_free(cp);
3041 return rc;
3042 }
3043
3044 /* Map pid's cpuset relative mem number to system wide mem number */
cpuset_p_rel_to_sys_mem(pid_t pid,int mem)3045 int cpuset_p_rel_to_sys_mem(pid_t pid, int mem)
3046 {
3047 struct cpuset *cp;
3048 int rc = -1;
3049
3050 if ((cp = cpuset_alloc()) == NULL)
3051 goto done;
3052 if (cpuset_cpusetofpid(cp, pid) < 0)
3053 goto done;
3054 rc = cpuset_c_rel_to_sys_mem(cp, mem);
3055 done:
3056 cpuset_free(cp);
3057 return rc;
3058 }
3059
3060 /* Map system wide mem number to pid's cpuset relative mem number */
cpuset_p_sys_to_rel_mem(pid_t pid,int mem)3061 int cpuset_p_sys_to_rel_mem(pid_t pid, int mem)
3062 {
3063 struct cpuset *cp;
3064 int rc = -1;
3065
3066 if ((cp = cpuset_alloc()) == NULL)
3067 goto done;
3068 if (cpuset_cpusetofpid(cp, pid) < 0)
3069 goto done;
3070 rc = cpuset_c_sys_to_rel_mem(cp, mem);
3071 done:
3072 cpuset_free(cp);
3073 return rc;
3074 }
3075
3076 /*
3077 * Override glibc's calls for get/set affinity - they have
3078 * something using cpu_set_t that will die when NR_CPUS > 1024.
3079 * Go directly to the 'real' system calls. Also override calls
3080 * for get_mempolicy and set_mempolicy. None of these
3081 * calls are yet (July 2004) guaranteed to be in all glibc versions
3082 * that we care about.
3083 */
3084
/*
 * Raw sched_setaffinity(2) syscall, shadowing glibc's wrapper so the
 * mask length is not limited by glibc's fixed-size cpu_set_t
 * (see the overview comment above).
 */
static int sched_setaffinity(pid_t pid, unsigned len, unsigned long *mask)
{
	return ltp_syscall(__NR_sched_setaffinity, pid, len, mask);
}
3089
3090 #if HAVE_DECL_MPOL_F_ADDR && HAVE_DECL_MPOL_F_NODE
/*
 * Raw get_mempolicy(2) syscall - not guaranteed to be wrapped by all
 * glibc versions this library cares about (see overview comment above).
 */
static int get_mempolicy(int *policy, unsigned long *nmask,
			 unsigned long maxnode, void *addr, int flags)
{
	return ltp_syscall(__NR_get_mempolicy, policy, nmask, maxnode,
			   addr, flags);
}
3097 #endif
3098
3099 #if HAVE_DECL_MPOL_BIND || HAVE_DECL_MPOL_DEFAULT
/*
 * Raw set_mempolicy(2) syscall - not guaranteed to be wrapped by all
 * glibc versions this library cares about (see overview comment above).
 */
static int set_mempolicy(int mode, unsigned long *nmask, unsigned long maxnode)
{
	return ltp_syscall(__NR_set_mempolicy, mode, nmask, maxnode);
}
3104 #endif
3105
struct cpuset_placement {
	struct bitmask *cpus;	/* copy of the cpuset's 'cpus' mask */
	struct bitmask *mems;	/* copy of the cpuset's 'mems' mask */
	char *path;		/* cpuset path the pid belonged to */
};
3111
/* Allocate and fill in a placement struct - captures current placement */
struct cpuset_placement *cpuset_get_placement(pid_t pid)
{
	struct cpuset_placement *plc;
	struct cpuset *cp = NULL;
	char buf[PATH_MAX];
	int nbits;

	if ((plc = calloc(1, sizeof(*plc))) == NULL)
		goto err;

	nbits = cpuset_cpus_nbits();
	if ((plc->cpus = bitmask_alloc(nbits)) == NULL)
		goto err;

	nbits = cpuset_mems_nbits();
	if ((plc->mems = bitmask_alloc(nbits)) == NULL)
		goto err;

	if ((cp = cpuset_alloc()) == NULL)
		goto err;
	if (cpuset_getcpusetpath(pid, buf, sizeof(buf)) == NULL)
		goto err;
	if (cpuset_query(cp, buf) < 0)
		goto err;

	bitmask_copy(plc->cpus, cp->cpus);
	bitmask_copy(plc->mems, cp->mems);
	/*
	 * Fix: the strdup result was previously unchecked; a NULL
	 * plc->path would later crash streq() in cpuset_equal_placement.
	 */
	if ((plc->path = strdup(buf)) == NULL)
		goto err;

	cpuset_free(cp);
	return plc;
err:
	cpuset_free(cp);
	cpuset_free_placement(plc);
	return NULL;
}
3149
3150 /* Compare two placement structs - use to detect changes in placement */
cpuset_equal_placement(const struct cpuset_placement * plc1,const struct cpuset_placement * plc2)3151 int cpuset_equal_placement(const struct cpuset_placement *plc1,
3152 const struct cpuset_placement *plc2)
3153 {
3154 return bitmask_equal(plc1->cpus, plc2->cpus) &&
3155 bitmask_equal(plc1->mems, plc2->mems) &&
3156 streq(plc1->path, plc2->path);
3157 }
3158
3159 /* Free a placement struct */
cpuset_free_placement(struct cpuset_placement * plc)3160 void cpuset_free_placement(struct cpuset_placement *plc)
3161 {
3162 if (!plc)
3163 return;
3164 bitmask_free(plc->cpus);
3165 bitmask_free(plc->mems);
3166 free(plc->path);
3167 free(plc);
3168 }
3169
3170 /*
3171 * A cpuset_fts_open() call constructs a linked list of entries
3172 * called a "cpuset_fts_tree", with one entry per cpuset below
3173 * the specified path. The cpuset_fts_read() routine returns the
3174 * next entry on this list. The various cpuset_fts_get_*() calls
3175 * return attributes of the specified entry. The cpuset_fts_close()
3176 * call frees the linked list and all associated data. All cpuset
3177 * entries and attributes for the cpuset_fts_tree returned from a
3178 * given cpuset_fts_open() call remain allocated and unchanged until
3179 * that cpuset_fts_tree is closed by a cpuset_fts_close() call. Any
3180 * subsequent changes to the cpuset filesystem will go unnoticed
3181 * (not affect open cpuset_fts_tree's.)
3182 */
3183
struct cpuset_fts_entry;
void cpuset_fts_rewind(struct cpuset_fts_tree *cs_tree);

struct cpuset_fts_tree {
	struct cpuset_fts_entry *head;	/* head of linked entry list */
	struct cpuset_fts_entry *next;	/* cpuset_fts_read() offset */
};

struct cpuset_fts_entry {
	struct cpuset_fts_entry *next;	/* linked entry list chain */
	struct cpuset *cpuset;	/* queried settings; NULL if dir unreadable */
	struct stat *stat;	/* stat(2) of cpuset dir; NULL on DNR entry */
	char *path;		/* cpuset path relative to cpuset mount */
	int info;		/* CPUSET_FTS_* status code for this entry */
	int err;		/* errno of failed operation, else 0 */
};
3200
3201 /* Open a handle on a cpuset hierarchy. All the real work is done here. */
cpuset_fts_open(const char * cpusetpath)3202 struct cpuset_fts_tree *cpuset_fts_open(const char *cpusetpath)
3203 {
3204 FTS *fts = NULL;
3205 FTSENT *ftsent;
3206 char *path_argv[2];
3207 char buf[PATH_MAX];
3208 struct cpuset_fts_tree *cs_tree = NULL;
3209 struct cpuset_fts_entry *ep; /* the latest new list entry */
3210 struct cpuset_fts_entry **pnlep; /* ptr to next list entry ptr */
3211 char *relpath;
3212 int fts_flags;
3213
3214 fullpath(buf, sizeof(buf), cpusetpath);
3215 path_argv[0] = buf;
3216 path_argv[1] = NULL;
3217
3218 fts_flags = FTS_PHYSICAL | FTS_NOCHDIR | FTS_NOSTAT | FTS_XDEV;
3219 fts = fts_open(path_argv, fts_flags, NULL);
3220 if (fts == NULL)
3221 goto err;
3222
3223 cs_tree = malloc(sizeof(*cs_tree));
3224 if (cs_tree == NULL)
3225 goto err;
3226 pnlep = &cs_tree->head;
3227 *pnlep = NULL;
3228
3229 while ((ftsent = fts_read(fts)) != NULL) {
3230 if (ftsent->fts_info != FTS_D && ftsent->fts_info != FTS_DNR)
3231 continue;
3232
3233 /* ftsent is a directory (perhaps unreadable) ==> cpuset */
3234 ep = calloc(1, sizeof(*ep));
3235 if (ep == NULL)
3236 goto err;
3237 *pnlep = ep;
3238 pnlep = &ep->next;
3239
3240 /* Set entry's path, and if DNR, error */
3241 relpath = ftsent->fts_path + strlen(cpusetmnt);
3242 if (strlen(relpath) == 0)
3243 relpath = "/";
3244 ep->path = strdup(relpath);
3245 if (ep->path == NULL)
3246 goto err;
3247 if (ftsent->fts_info == FTS_DNR) {
3248 ep->info = CPUSET_FTS_ERR_DNR;
3249 ep->err = ftsent->fts_errno;
3250 continue;
3251 }
3252
3253 /* ftsent is a -readable- cpuset: set entry's stat, etc */
3254 ep->stat = calloc(1, sizeof(struct stat));
3255 if (ep->stat == NULL)
3256 goto err;
3257 if (stat(ftsent->fts_path, ep->stat) < 0) {
3258 ep->info = CPUSET_FTS_ERR_STAT;
3259 ep->err = ftsent->fts_errno;
3260 continue;
3261 }
3262
3263 ep->cpuset = calloc(1, sizeof(struct cpuset));
3264 if (ep->cpuset == NULL)
3265 goto err;
3266 if (cpuset_query(ep->cpuset, relpath) < 0) {
3267 ep->info = CPUSET_FTS_ERR_CPUSET;
3268 ep->err = errno;
3269 continue;
3270 }
3271 ep->info = CPUSET_FTS_CPUSET;
3272 }
3273
3274 (void)fts_close(fts);
3275 cpuset_fts_rewind(cs_tree);
3276 return cs_tree;
3277
3278 err:
3279 if (cs_tree)
3280 cpuset_fts_close(cs_tree);
3281 if (fts)
3282 (void)fts_close(fts);
3283 return NULL;
3284 }
3285
3286 /* Return pointer to next cpuset entry in hierarchy */
cpuset_fts_read(struct cpuset_fts_tree * cs_tree)3287 const struct cpuset_fts_entry *cpuset_fts_read(struct cpuset_fts_tree *cs_tree)
3288 {
3289 const struct cpuset_fts_entry *cs_entry = cs_tree->next;
3290 if (cs_tree->next != NULL) /* seek to next entry */
3291 cs_tree->next = cs_tree->next->next;
3292 return cs_entry;
3293 }
3294
3295 /* Reverse list of cpusets, in place. Simulates pre-order/post-order flip. */
cpuset_fts_reverse(struct cpuset_fts_tree * cs_tree)3296 void cpuset_fts_reverse(struct cpuset_fts_tree *cs_tree)
3297 {
3298 struct cpuset_fts_entry *cs1, *cs2, *cs3;
3299
3300 /*
3301 * At each step, cs1 < cs2 < cs3 and the cs2->next pointer
3302 * is redirected from cs3 to cs1.
3303 */
3304
3305 cs1 = cs2 = NULL;
3306 cs3 = cs_tree->head;
3307 while (cs3) {
3308 cs1 = cs2;
3309 cs2 = cs3;
3310 cs3 = cs3->next;
3311 cs2->next = cs1;
3312 }
3313 cs_tree->head = cs2;
3314 cpuset_fts_rewind(cs_tree);
3315 }
3316
3317 /* Rewind cpuset list to beginning */
cpuset_fts_rewind(struct cpuset_fts_tree * cs_tree)3318 void cpuset_fts_rewind(struct cpuset_fts_tree *cs_tree)
3319 {
3320 cs_tree->next = cs_tree->head;
3321 }
3322
3323 /* Return pointer to nul-terminated cpuset path of entry in hierarchy */
cpuset_fts_get_path(const struct cpuset_fts_entry * cs_entry)3324 const char *cpuset_fts_get_path(const struct cpuset_fts_entry *cs_entry)
3325 {
3326 return cs_entry->path;
3327 }
3328
3329 /* Return pointer to stat(2) structure of a cpuset entry's directory */
cpuset_fts_get_stat(const struct cpuset_fts_entry * cs_entry)3330 const struct stat *cpuset_fts_get_stat(const struct cpuset_fts_entry *cs_entry)
3331 {
3332 return cs_entry->stat;
3333 }
3334
3335 /* Return pointer to cpuset structure of a cpuset entry */
cpuset_fts_get_cpuset(const struct cpuset_fts_entry * cs_entry)3336 const struct cpuset *cpuset_fts_get_cpuset(const struct cpuset_fts_entry
3337 *cs_entry)
3338 {
3339 return cs_entry->cpuset;
3340 }
3341
3342 /* Return value of errno (0 if no error) on attempted cpuset operations */
cpuset_fts_get_errno(const struct cpuset_fts_entry * cs_entry)3343 int cpuset_fts_get_errno(const struct cpuset_fts_entry *cs_entry)
3344 {
3345 return cs_entry->err;
3346 }
3347
3348 /* Return operation identity causing error */
cpuset_fts_get_info(const struct cpuset_fts_entry * cs_entry)3349 int cpuset_fts_get_info(const struct cpuset_fts_entry *cs_entry)
3350 {
3351 return cs_entry->info;
3352 }
3353
3354 /* Close a cpuset hierarchy handle (free's all associated memory) */
cpuset_fts_close(struct cpuset_fts_tree * cs_tree)3355 void cpuset_fts_close(struct cpuset_fts_tree *cs_tree)
3356 {
3357 struct cpuset_fts_entry *cs_entry = cs_tree->head;
3358
3359 while (cs_entry) {
3360 struct cpuset_fts_entry *ep = cs_entry;
3361
3362 cs_entry = cs_entry->next;
3363 free(ep->path);
3364 free(ep->stat);
3365 cpuset_free(ep->cpuset);
3366 free(ep);
3367 }
3368 free(cs_tree);
3369 }
3370
/* Bind current task to cpu (uses sched_setaffinity(2)) */
int cpuset_cpubind(int cpu)
{
	struct bitmask *mask;
	int rc = -1;

	mask = bitmask_alloc(cpuset_cpus_nbits());
	if (mask != NULL) {
		bitmask_setbit(mask, cpu);
		rc = sched_setaffinity(0, bitmask_nbytes(mask),
				       bitmask_mask(mask));
		bitmask_free(mask);
	}
	return rc;
}
3384
3385 /*
3386 * int cpuset_latestcpu(pid_t pid)
3387 *
3388 * Return most recent CPU on which task pid executed. If pid == 0,
3389 * examine current task.
3390 *
3391 * The last used CPU is visible for a given pid as field #39 (starting
3392 * with #1) in the file /proc/pid/stat. Currently this file has 41
3393 * fields, in which case this is the 3rd to the last field.
3394 *
3395 * Unfortunately field #2 is a command name and might have embedded
3396 * whitespace. So we can't just count white space separated fields.
3397 * Fortunately, this command name is surrounded by parentheses, as
3398 * for example "(sh)", and that closing parenthesis is the last ')'
3399 * character in the line. No remaining fields can have embedded
3400 * whitespace or parentheses. So instead of looking for the 39th
3401 * white space separated field, we can look for the 37th white space
3402 * separated field past the last ')' character on the line.
3403 */
3404
3405 /* Return most recent CPU on which task pid executed */
cpuset_latestcpu(pid_t pid)3406 int cpuset_latestcpu(pid_t pid)
3407 {
3408 char buf[PATH_MAX];
3409 char *bp;
3410 int fd = -1;
3411 int cpu = -1;
3412
3413 if (pid == 0)
3414 snprintf(buf, sizeof(buf), "/proc/self/stat");
3415 else
3416 snprintf(buf, sizeof(buf), "/proc/%d/stat", pid);
3417
3418 if ((fd = open(buf, O_RDONLY)) < 0)
3419 goto err;
3420 if (read(fd, buf, sizeof(buf)) < 1)
3421 goto err;
3422 close(fd);
3423
3424 bp = strrchr(buf, ')');
3425 if (bp)
3426 sscanf(bp + 1, "%*s %*u %*u %*u %*u %*u %*u %*u " "%*u %*u %*u %*u %*u %*u %*u %*u %*u %*u " "%*u %*u %*u %*u %*u %*u %*u %*u %*u %*u " "%*u %*u %*u %*u %*u %*u %*u %*u %u", /* 37th field past ')' */
3427 &cpu);
3428 if (cpu < 0)
3429 errno = EINVAL;
3430 return cpu;
3431 err:
3432 if (fd >= 0)
3433 close(fd);
3434 return -1;
3435 }
3436
3437 /* Bind current task to memory (uses set_mempolicy(2)) */
cpuset_membind(int mem)3438 int cpuset_membind(int mem)
3439 {
3440 struct bitmask *bmp;
3441 int r;
3442
3443 if ((bmp = bitmask_alloc(cpuset_mems_nbits())) == NULL)
3444 return -1;
3445 bitmask_setbit(bmp, mem);
3446 #if HAVE_DECL_MPOL_BIND
3447 r = set_mempolicy(MPOL_BIND, bitmask_mask(bmp), bitmask_nbits(bmp) + 1);
3448 #else
3449 r = -1;
3450 errno = ENOSYS;
3451 #endif
3452 bitmask_free(bmp);
3453 return r;
3454 }
3455
/* [optional] Return Memory Node holding page at specified addr, else -1 */
int cpuset_addr2node(void *addr)
{
	int node = -1;

#if HAVE_DECL_MPOL_F_ADDR && HAVE_DECL_MPOL_F_NODE
	if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR) != 0)
		node = -1;	/* force the documented failure value */
#endif
	return node;
}
3470
3471 /*
3472 * Transform cpuset into Text Format Representation in buffer 'buf',
3473 * of length 'buflen', nul-terminated if space allows. Return number
3474 * of characters that would have been written, if enough space had
3475 * been available, in the same way that snprintf() does.
3476 */
3477
3478 /* Export cpuset settings to a regular file */
cpuset_export(const struct cpuset * cp,char * buf,int buflen)3479 int cpuset_export(const struct cpuset *cp, char *buf, int buflen)
3480 {
3481 char *tmp = NULL;
3482 int n = 0;
3483
3484 if (cp->cpu_exclusive)
3485 n += snprintf(buf + n, MAX(buflen - n, 0), "cpu_exclusive\n");
3486
3487 if (cp->mem_exclusive)
3488 n += snprintf(buf + n, MAX(buflen - n, 0), "mem_exclusive\n");
3489
3490 if (cp->notify_on_release)
3491 n += snprintf(buf + n, MAX(buflen - n, 0),
3492 "notify_on_release\n");
3493
3494 if (cp->memory_pressure_enabled)
3495 n += snprintf(buf + n, MAX(buflen - n, 0),
3496 "memory_pressure_enabled\n");
3497
3498 if (cp->memory_migrate)
3499 n += snprintf(buf + n, MAX(buflen - n, 0), "memory_migrate\n");
3500
3501 if (cp->memory_spread_page)
3502 n += snprintf(buf + n, MAX(buflen - n, 0),
3503 "memory_spread_page\n");
3504
3505 if (cp->memory_spread_slab)
3506 n += snprintf(buf + n, MAX(buflen - n, 0),
3507 "memory_spread_slab\n");
3508
3509 if ((tmp = sprint_mask_buf(cp->cpus)) == NULL)
3510 return -1;
3511 n += snprintf(buf + n, MAX(buflen - n, 0), "cpus %s\n", tmp);
3512 free(tmp);
3513 tmp = NULL;
3514
3515 if ((tmp = sprint_mask_buf(cp->mems)) == NULL)
3516 return -1;
3517 n += snprintf(buf + n, MAX(buflen - n, 0), "mems %s\n", tmp);
3518 free(tmp);
3519 tmp = NULL;
3520
3521 return n;
3522 }
3523
/*
 * Parse a cpus/mems list argument 'arg' into bitmask 'bmp'.
 * A missing argument (arg == NULL, e.g. a bare "cpus" line) is
 * rejected as invalid input rather than passed to
 * bitmask_parselist(), which must not see a NULL string.
 * On error, writes a message into emsg (at most elen bytes) when
 * emsg is non-NULL, and returns -1.
 */
static int import_list(UNUSED const char *tok, const char *arg,
		       struct bitmask *bmp, char *emsg, int elen)
{
	if (arg == NULL || bitmask_parselist(arg, bmp) < 0) {
		if (emsg)
			snprintf(emsg, elen, "Invalid list format: %s",
				 arg ? arg : "(missing)");
		return -1;
	}
	return 0;
}
3534
/* Lower-case a nul-terminated string in place. */
static void stolower(char *s)
{
	for (; *s != '\0'; s++) {
		/* cast to unsigned char: tolower() on a negative char is UB */
		*s = tolower((unsigned char)*s);
	}
}
3543
/* Import cpuset settings from a regular file */
/*
 * Parses the Text Format Representation produced by cpuset_export():
 * one token per line, optionally followed by one argument, with '#'
 * introducing a comment.  On success returns 0 and marks every cpuset
 * attribute valid.  On failure returns -1, and if the caller provided
 * them, sets *elinenum to the failing line number and writes a message
 * into emsg (at most elen bytes).
 */
int cpuset_import(struct cpuset *cp, const char *buf, int *elinenum,
		  char *emsg, int elen)
{
	char *linebuf = NULL;
	int linebuflen;
	int linenum = 0;
	int offset = 0;		/* slgets() read cursor into buf */

	/* A single line can never be longer than the entire input. */
	linebuflen = strlen(buf) + 1;
	if ((linebuf = malloc(linebuflen)) == NULL) {
		if (emsg)
			snprintf(emsg, elen, "Insufficient memory");
		goto err;
	}

	while (slgets(linebuf, linebuflen, buf, &offset)) {
		char *tok, *arg;
		char *ptr;	/* for strtok_r */

		linenum++;
		/* Truncate the line at a '#' comment, if present. */
		if ((tok = strchr(linebuf, '#')) != NULL)
			*tok = 0;
		/* Skip blank (or comment-only) lines. */
		if ((tok = strtok_r(linebuf, " \t", &ptr)) == NULL)
			continue;
		stolower(tok);

		/* Optional single argument following the token. */
		arg = strtok_r(0, " \t", &ptr);

		if (streq(tok, "cpu_exclusive")) {
			cp->cpu_exclusive = 1;
			goto eol;
		}
		if (streq(tok, "mem_exclusive")) {
			cp->mem_exclusive = 1;
			goto eol;
		}
		if (streq(tok, "notify_on_release")) {
			cp->notify_on_release = 1;
			goto eol;
		}
		if (streq(tok, "memory_pressure_enabled")) {
			cp->memory_pressure_enabled = 1;
			goto eol;
		}
		if (streq(tok, "memory_migrate")) {
			cp->memory_migrate = 1;
			goto eol;
		}
		if (streq(tok, "memory_spread_page")) {
			cp->memory_spread_page = 1;
			goto eol;
		}
		if (streq(tok, "memory_spread_slab")) {
			cp->memory_spread_slab = 1;
			goto eol;
		}
		if (streq(tok, "cpu") || streq(tok, "cpus")) {
			if (import_list(tok, arg, cp->cpus, emsg, elen) < 0)
				goto err;
			goto eol;
		}
		if (streq(tok, "mem") || streq(tok, "mems")) {
			if (import_list(tok, arg, cp->mems, emsg, elen) < 0)
				goto err;
			goto eol;
		}
		if (emsg)
			snprintf(emsg, elen, "Unrecognized token: '%s'", tok);
		goto err;
eol:
		/* Nothing may follow a recognized token (+ argument). */
		if ((tok = strtok_r(0, " \t", &ptr)) != NULL) {
			if (emsg)
				snprintf(emsg, elen, "Surplus token: '%s'",
					 tok);
			goto err;
		}
		continue;
	}

	free(linebuf);

	/* Only one of cpus/mems given: fill in the other via cpuset_local*(). */
	if (bitmask_isallclear(cp->cpus) && !bitmask_isallclear(cp->mems))
		cpuset_localcpus(cp->mems, cp->cpus);
	else if (!bitmask_isallclear(cp->cpus) && bitmask_isallclear(cp->mems))
		cpuset_localmems(cp->cpus, cp->mems);

	/*
	 * All cpuset attributes are determined in an import.
	 * Those that aren't explicitly specified are presumed
	 * to be unchanged (zero, if it's a freshly allocated
	 * struct cpuset.)
	 */

	cp->cpus_valid = 1;
	cp->mems_valid = 1;
	cp->cpu_exclusive_valid = 1;
	cp->mem_exclusive_valid = 1;
	cp->notify_on_release_valid = 1;
	cp->memory_migrate_valid = 1;
	cp->memory_pressure_enabled_valid = 1;
	cp->memory_spread_page_valid = 1;
	cp->memory_spread_slab_valid = 1;

	return 0;
err:
	if (elinenum)
		*elinenum = linenum;
	free(linebuf);
	return -1;
}
3655
3656 /* Pin current task CPU (and memory) */
cpuset_pin(int relcpu)3657 int cpuset_pin(int relcpu)
3658 {
3659 struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
3660 int cpu, r;
3661
3662 if (check() < 0)
3663 return -1;
3664
3665 do {
3666 cpuset_free_placement(plc1);
3667 plc1 = cpuset_get_placement(0);
3668
3669 r = 0;
3670 if (cpuset_unpin() < 0)
3671 r = -1;
3672 cpu = cpuset_p_rel_to_sys_cpu(0, relcpu);
3673 if (cpuset_cpubind(cpu) < 0)
3674 r = -1;
3675
3676 cpuset_free_placement(plc2);
3677 plc2 = cpuset_get_placement(0);
3678 } while (!cpuset_equal_placement(plc1, plc2));
3679
3680 cpuset_free_placement(plc1);
3681 cpuset_free_placement(plc2);
3682 return r;
3683 }
3684
3685 /* Return number CPUs in current tasks cpuset */
cpuset_size()3686 int cpuset_size()
3687 {
3688 struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
3689 int r;
3690
3691 if (check() < 0)
3692 return -1;
3693
3694 do {
3695 cpuset_free_placement(plc1);
3696 plc1 = cpuset_get_placement(0);
3697
3698 r = cpuset_cpus_weight(0);
3699
3700 cpuset_free_placement(plc2);
3701 plc2 = cpuset_get_placement(0);
3702 } while (!cpuset_equal_placement(plc1, plc2));
3703
3704 cpuset_free_placement(plc1);
3705 cpuset_free_placement(plc2);
3706 return r;
3707 }
3708
3709 /* Return relative CPU number, within current cpuset, last executed on */
cpuset_where()3710 int cpuset_where()
3711 {
3712 struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
3713 int r;
3714
3715 if (check() < 0)
3716 return -1;
3717
3718 do {
3719 cpuset_free_placement(plc1);
3720 plc1 = cpuset_get_placement(0);
3721
3722 r = cpuset_p_sys_to_rel_cpu(0, cpuset_latestcpu(0));
3723
3724 cpuset_free_placement(plc2);
3725 plc2 = cpuset_get_placement(0);
3726 } while (!cpuset_equal_placement(plc1, plc2));
3727
3728 cpuset_free_placement(plc1);
3729 cpuset_free_placement(plc2);
3730 return r;
3731 }
3732
/*
 * Undo cpuset_pin - let current task have the run of all CPUs in its
 * cpuset.  Returns 0 on success, -1 (with errno set) on failure.
 */
int cpuset_unpin()
{
	struct bitmask *cpus = NULL, *mems = NULL;
	int r = -1;

	if (check() < 0)
		goto err;

	/*
	 * Don't need cpuset_*_placement() guard against concurrent
	 * cpuset migration, because none of the following depends
	 * on the tasks cpuset placement.
	 */

	if ((cpus = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
		goto err;
	bitmask_setall(cpus);
	if (sched_setaffinity(0, bitmask_nbytes(cpus), bitmask_mask(cpus)) < 0)
		goto err;

	if ((mems = bitmask_alloc(cpuset_mems_nbits())) == NULL)
		goto err;
#if HAVE_DECL_MPOL_DEFAULT
	if (set_mempolicy(MPOL_DEFAULT, bitmask_mask(mems),
			  bitmask_nbits(mems) + 1) < 0)
		goto err;
	r = 0;
#else
	/*
	 * No MPOL_DEFAULT: report the failure the same way
	 * cpuset_membind() does, instead of leaving errno stale.
	 */
	errno = ENOSYS;
#endif
	/* fall into ... */
err:
	bitmask_free(cpus);
	bitmask_free(mems);
	return r;

}
3769
3770 struct cpuset_function_list {
3771 const char *fname;
3772 void *func;
3773 } flist[] = {
3774 {
3775 "cpuset_version", cpuset_version}, {
3776 "cpuset_alloc", cpuset_alloc}, {
3777 "cpuset_free", cpuset_free}, {
3778 "cpuset_cpus_nbits", cpuset_cpus_nbits}, {
3779 "cpuset_mems_nbits", cpuset_mems_nbits}, {
3780 "cpuset_setcpus", cpuset_setcpus}, {
3781 "cpuset_setmems", cpuset_setmems}, {
3782 "cpuset_set_iopt", cpuset_set_iopt}, {
3783 "cpuset_set_sopt", cpuset_set_sopt}, {
3784 "cpuset_getcpus", cpuset_getcpus}, {
3785 "cpuset_getmems", cpuset_getmems}, {
3786 "cpuset_cpus_weight", cpuset_cpus_weight}, {
3787 "cpuset_mems_weight", cpuset_mems_weight}, {
3788 "cpuset_get_iopt", cpuset_get_iopt}, {
3789 "cpuset_get_sopt", cpuset_get_sopt}, {
3790 "cpuset_localcpus", cpuset_localcpus}, {
3791 "cpuset_localmems", cpuset_localmems}, {
3792 "cpuset_cpumemdist", cpuset_cpumemdist}, {
3793 "cpuset_cpu2node", cpuset_cpu2node}, {
3794 "cpuset_addr2node", cpuset_addr2node}, {
3795 "cpuset_create", cpuset_create}, {
3796 "cpuset_delete", cpuset_delete}, {
3797 "cpuset_query", cpuset_query}, {
3798 "cpuset_modify", cpuset_modify}, {
3799 "cpuset_getcpusetpath", cpuset_getcpusetpath}, {
3800 "cpuset_cpusetofpid", cpuset_cpusetofpid}, {
3801 "cpuset_mountpoint", cpuset_mountpoint}, {
3802 "cpuset_collides_exclusive", cpuset_collides_exclusive}, {
3803 "cpuset_nuke", cpuset_nuke}, {
3804 "cpuset_init_pidlist", cpuset_init_pidlist}, {
3805 "cpuset_pidlist_length", cpuset_pidlist_length}, {
3806 "cpuset_get_pidlist", cpuset_get_pidlist}, {
3807 "cpuset_freepidlist", cpuset_freepidlist}, {
3808 "cpuset_move", cpuset_move}, {
3809 "cpuset_move_all", cpuset_move_all}, {
3810 "cpuset_move_cpuset_tasks", cpuset_move_cpuset_tasks}, {
3811 "cpuset_migrate", cpuset_migrate}, {
3812 "cpuset_migrate_all", cpuset_migrate_all}, {
3813 "cpuset_reattach", cpuset_reattach}, {
3814 "cpuset_open_memory_pressure", cpuset_open_memory_pressure}, {
3815 "cpuset_read_memory_pressure", cpuset_read_memory_pressure}, {
3816 "cpuset_close_memory_pressure", cpuset_close_memory_pressure}, {
3817 "cpuset_c_rel_to_sys_cpu", cpuset_c_rel_to_sys_cpu}, {
3818 "cpuset_c_sys_to_rel_cpu", cpuset_c_sys_to_rel_cpu}, {
3819 "cpuset_c_rel_to_sys_mem", cpuset_c_rel_to_sys_mem}, {
3820 "cpuset_c_sys_to_rel_mem", cpuset_c_sys_to_rel_mem}, {
3821 "cpuset_p_rel_to_sys_cpu", cpuset_p_rel_to_sys_cpu}, {
3822 "cpuset_p_sys_to_rel_cpu", cpuset_p_sys_to_rel_cpu}, {
3823 "cpuset_p_rel_to_sys_mem", cpuset_p_rel_to_sys_mem}, {
3824 "cpuset_p_sys_to_rel_mem", cpuset_p_sys_to_rel_mem}, {
3825 "cpuset_get_placement", cpuset_get_placement}, {
3826 "cpuset_equal_placement", cpuset_equal_placement}, {
3827 "cpuset_free_placement", cpuset_free_placement}, {
3828 "cpuset_fts_open", cpuset_fts_open}, {
3829 "cpuset_fts_read", cpuset_fts_read}, {
3830 "cpuset_fts_reverse", cpuset_fts_reverse}, {
3831 "cpuset_fts_rewind", cpuset_fts_rewind}, {
3832 "cpuset_fts_get_path", cpuset_fts_get_path}, {
3833 "cpuset_fts_get_stat", cpuset_fts_get_stat}, {
3834 "cpuset_fts_get_cpuset", cpuset_fts_get_cpuset}, {
3835 "cpuset_fts_get_errno", cpuset_fts_get_errno}, {
3836 "cpuset_fts_get_info", cpuset_fts_get_info}, {
3837 "cpuset_fts_close", cpuset_fts_close}, {
3838 "cpuset_cpubind", cpuset_cpubind}, {
3839 "cpuset_latestcpu", cpuset_latestcpu}, {
3840 "cpuset_membind", cpuset_membind}, {
3841 "cpuset_export", cpuset_export}, {
3842 "cpuset_import", cpuset_import}, {
3843 "cpuset_function", cpuset_function}, {
3844 "cpuset_pin", cpuset_pin}, {
3845 "cpuset_size", cpuset_size}, {
3846 "cpuset_where", cpuset_where}, {
3847 "cpuset_unpin", cpuset_unpin},};
3848
3849 /* Return pointer to a libcpuset.so function, or NULL */
cpuset_function(const char * function_name)3850 void *cpuset_function(const char *function_name)
3851 {
3852 unsigned int i;
3853
3854 for (i = 0; i < sizeof(flist) / sizeof(flist[0]); i++)
3855 if (streq(function_name, flist[i].fname))
3856 return flist[i].func;
3857 return NULL;
3858 }
3859
/* Fortran interface to basic cpuset routines */
int cpuset_pin_(int *ptr_relcpu)
{
	/* Fortran passes arguments by reference; dereference for C. */
	return cpuset_pin(*ptr_relcpu);
}
3865
/* Fortran wrapper for cpuset_size() */
int cpuset_size_(void)
{
	return cpuset_size();
}
3870
/* Fortran wrapper for cpuset_where() */
int cpuset_where_(void)
{
	return cpuset_where();
}
3875
/* Fortran wrapper for cpuset_unpin() */
int cpuset_unpin_(void)
{
	return cpuset_unpin();
}
3880
3881 #endif /* HAVE_LINUX_MEMPOLICY_H */
3882