/*
 * cpuset user library implementation.
 *
 * Copyright (c) 2006-2007 Silicon Graphics, Inc. All rights reserved.
 *
 * Paul Jackson <pj@sgi.com>
 */

/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#define _GNU_SOURCE	/* need to see pread() and syscall() */
#include <unistd.h>

#include <ctype.h>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <fts.h>
#include <limits.h>
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <time.h>
#include <utime.h>
#include <sys/utsname.h>	/* for cpuset_would_crash_kernel() */

#include "bitmask.h"
#include "cpuset.h"
#include "common.h"
#include "test.h"
#include "lapi/syscalls.h"
#include "config.h"

#if HAVE_LINUX_MEMPOLICY_H
#include <linux/mempolicy.h>

/* Bump version, and update Change History, when libcpuset API changes */
#define CPUSET_VERSION 3

/*
 * For a history of what changed in each version, see the "Change
 * History" section, at the end of the libcpuset master document.
 */

int cpuset_version(void)
{
	return CPUSET_VERSION;
}
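
/*
 * Example (illustrative only, not part of the library): a caller can
 * verify at run time that it is linked against a new enough library:
 *
 *	if (cpuset_version() < 3)
 *		fprintf(stderr, "libcpuset too old\n");
 */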

struct cpuset {
	struct bitmask *cpus;
	struct bitmask *mems;
	char cpu_exclusive;
	char mem_exclusive;
	char mem_hardwall;
	char notify_on_release;
	char memory_migrate;
	char memory_pressure_enabled;
	char memory_spread_page;
	char memory_spread_slab;
	char sched_load_balance;
	int sched_relax_domain_level;

	/*
	 * Each field 'x' above gets an 'x_valid' field below.
	 * The apply_cpuset_settings() routine will only set those
	 * fields whose corresponding *_valid flags are set.  The
	 * cpuset_alloc() routine clears these flags as part of the
	 * clear in calloc(), and the various cpuset_set*() routines
	 * set these flags when setting the corresponding value.
	 *
	 * The purpose of these valid fields is to ensure that when
	 * we create a new cpuset, we don't accidentally overwrite
	 * some non-zero kernel default, such as an inherited
	 * memory_spread_* flag, just because the user application
	 * code didn't override the default zero settings resulting
	 * from the calloc() call in cpuset_alloc().
	 *
	 * The choice of 'char' for the type of the flags above,
	 * but a bitfield for the flags below, is somewhat capricious.
	 */
	unsigned cpus_valid:1;
	unsigned mems_valid:1;
	unsigned cpu_exclusive_valid:1;
	unsigned mem_exclusive_valid:1;
	unsigned mem_hardwall_valid:1;
	unsigned notify_on_release_valid:1;
	unsigned memory_migrate_valid:1;
	unsigned memory_pressure_enabled_valid:1;
	unsigned memory_spread_page_valid:1;
	unsigned memory_spread_slab_valid:1;
	unsigned sched_load_balance_valid:1;
	unsigned sched_relax_domain_level_valid:1;

	/*
	 * If the corresponding variable above was modified, the
	 * following flags mark it dirty.
	 */
	unsigned cpus_dirty:1;
	unsigned mems_dirty:1;
	unsigned cpu_exclusive_dirty:1;
	unsigned mem_exclusive_dirty:1;
	unsigned mem_hardwall_dirty:1;
	unsigned notify_on_release_dirty:1;
	unsigned memory_migrate_dirty:1;
	unsigned memory_pressure_enabled_dirty:1;
	unsigned memory_spread_page_dirty:1;
	unsigned memory_spread_slab_dirty:1;
	unsigned sched_load_balance_dirty:1;
	unsigned sched_relax_domain_level_dirty:1;
};

/* Presumed cpuset file system mount point */
static const char *cpusetmnt = "/dev/cpuset";

/* Stashed copy of cpunodemap[], mapping each cpu to its node. */
static const char *mapfile = "/var/run/cpunodemap";

/* The primary source for the cpunodemap[] is available below here. */
static const char *sysdevices = "/sys/devices/system";

/* small buffer size - for reading boolean flags or map file (1 or 2 ints) */
#define SMALL_BUFSZ 16

/*
 * The 'mask_size_file' is used to ferret out the kernel cpumask_t
 * and nodemask_t sizes.  The lines in this file that begin with the
 * strings 'cpumask_prefix' and 'nodemask_prefix' display a cpumask
 * and nodemask string, respectively.  The lengths of these strings
 * reflect the kernel's internal cpumask_t and nodemask_t sizes,
 * which sizes are needed to correctly call the sched_setaffinity
 * and set_mempolicy system calls, and to size user level
 * bitmasks to match the kernel's.
 */

static const char *mask_size_file = "/proc/self/status";
static const char *cpumask_prefix = "Cpus_allowed:\t";
static const char *nodemask_prefix = "Mems_allowed:\t";

/*
 * Sizes of kernel cpumask_t and nodemask_t bitmaps, in bits.
 *
 * The first time we need these, we parse the Cpus_allowed and
 * Mems_allowed lines from mask_size_file ("/proc/self/status").
 */

static int cpumask_sz;
static int nodemask_sz;

/*
 * These defaults only kick in if we fail to size the kernel
 * cpumask and nodemask by reading the Cpus_allowed and
 * Mems_allowed fields from the /proc/self/status file.
 */

#define DEFCPUBITS (512)
#define DEFNODEBITS (DEFCPUBITS/2)

/*
 * Arch-neutral API for obtaining NUMA distances between CPUs
 * and Memory Nodes, via the files:
 *	/sys/devices/system/node/nodeN/distance
 * which have lines such as:
 *	46 66 10 20
 * which say that for a cpu on node N (from the path above), the
 * distances to nodes 0, 1, 2, and 3 are 46, 66, 10, and 20,
 * respectively.
 */

static const char *distance_directory = "/sys/devices/system/node";

/*
 * Someday, we should disable, then later discard, the SN code
 * marked ALTERNATE_SN_DISTMAP.
 */

#define ALTERNATE_SN_DISTMAP 1
#ifdef ALTERNATE_SN_DISTMAP

/*
 * Alternative SN (SGI ia64) architecture specific API for obtaining
 * NUMA distances between CPUs and Memory Nodes is via the file
 * /proc/sgi_sn/sn_topology, which has lines such as:
 *
 *	node 2 001c14#0 local asic SHub_1.1, nasid 0x4, dist 46:66:10:20
 *
 * which says that for each CPU on node 2, the distances to nodes
 * 0, 1, 2 and 3 are 46, 66, 10 and 20, respectively.
 *
 * This file has other lines as well, which start with keywords
 * other than "node".  Ignore these other lines.
 */

static const char *sn_topology = "/proc/sgi_sn/sn_topology";
static const char *sn_top_node_prefix = "node ";

#endif

/*
 * Check that cpusets are supported and /dev/cpuset is mounted.
 * If ok, return 0.
 * If not, return -1 and set errno:
 *	ENOSYS - kernel doesn't support cpusets
 *	ENODEV - /dev/cpuset not mounted
 */

static enum {
	check_notdone,
	check_enosys,
	check_enodev,
	check_ok
} check_state = check_notdone;

static int check(void)
{
	if (check_state == check_notdone) {
		struct stat statbuf;

		if (stat("/proc/self/cpuset", &statbuf) < 0) {
			check_state = check_enosys;
			goto done;
		}

		if (stat("/dev/cpuset/tasks", &statbuf) < 0) {
			check_state = check_enodev;
			goto done;
		}

		check_state = check_ok;
	}
done:
	switch (check_state) {
	case check_enosys:
		errno = ENOSYS;
		return -1;
	case check_enodev:
		errno = ENODEV;
		return -1;
	default:
		break;
	}
	return 0;
}

static void chomp(char *s)
{
	char *t;

	for (t = s + strlen(s) - 1; t >= s; t--) {
		if (*t == '\n' || *t == '\r')
			*t = '\0';
		else
			break;
	}
}

/*
 * Determine number of bytes in a seekable open file, without
 * assuming that stat(2) on that file has a useful size.
 * Has the side effect of leaving the file rewound to the beginning.
 */
static int filesize(FILE * fp)
{
	int sz = 0;
	rewind(fp);
	while (fgetc(fp) != EOF)
		sz++;
	rewind(fp);
	return sz;
}

/* Are strings s1 and s2 equal? */
static int streq(const char *s1, const char *s2)
{
	return strcmp(s1, s2) == 0;
}

/* Is string 'pre' a prefix of string 's'? */
static int strprefix(const char *s, const char *pre)
{
	return strncmp(s, pre, strlen(pre)) == 0;
}

/*
 * char *flgets(char *buf, int buflen, FILE *fp)
 *
 * Obtain one line from input file fp.  Copy up to first
 * buflen-1 chars of line into buffer buf, discarding any remainder
 * of line.  Stop reading at newline, discarding newline.
 * Nul terminate result and return pointer to buffer buf
 * on success, or NULL if nothing more to read or failure.
 */

static char *flgets(char *buf, int buflen, FILE * fp)
{
	int c = -1;
	char *bp;

	bp = buf;
	while ((--buflen > 0) && ((c = getc(fp)) >= 0)) {
		if (c == '\n')
			goto newline;
		*bp++ = c;
	}
	if ((c < 0) && (bp == buf))
		return NULL;

	if (c > 0) {
		while ((c = getc(fp)) >= 0) {
			if (c == '\n')
				break;
		}
	}

newline:
	*bp++ = '\0';
	return buf;
}
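
/*
 * Example (illustrative only): unlike fgets(), flgets() never stores
 * the newline and always consumes the remainder of an over-long line,
 * so the next call starts on the next line:
 *
 *	char line[8];
 *	while (flgets(line, sizeof(line), fp) != NULL)
 *		puts(line);	// at most 7 chars, newline stripped
 */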

/*
 * sgetc(const char *inputbuf, int *offsetptr)
 *
 * Return next char from nul-terminated input buffer inputbuf,
 * starting at offset *offsetptr.  Increment *offsetptr.
 * If next char would be nul ('\0'), return EOF and don't
 * increment *offsetptr.
 */

static int sgetc(const char *inputbuf, int *offsetptr)
{
	char c;

	if ((c = inputbuf[*offsetptr]) != 0) {
		*offsetptr = *offsetptr + 1;
		return c;
	} else {
		return EOF;
	}
}

/*
 * char *slgets(char *buf, int buflen, const char *inputbuf, int *offsetptr)
 *
 * Obtain next line from nul-terminated input buffer 'inputbuf',
 * starting at offset *offsetptr.  Copy up to first buflen-1
 * chars of line into output buffer buf, discarding any remainder
 * of line.  Stop reading at newline, discarding newline.
 * Nul terminate result and return pointer to output buffer
 * buf on success, or NULL if nothing more to read.
 */

static char *slgets(char *buf, int buflen, const char *inputbuf, int *offsetptr)
{
	int c = -1;
	char *bp;

	bp = buf;
	while ((--buflen > 0) && ((c = sgetc(inputbuf, offsetptr)) >= 0)) {
		if (c == '\n')
			goto newline;
		*bp++ = c;
	}
	if ((c < 0) && (bp == buf))
		return NULL;

	if (c > 0) {
		while ((c = sgetc(inputbuf, offsetptr)) >= 0) {
			if (c == '\n')
				break;
		}
	}

newline:
	*bp++ = '\0';
	return buf;
}

/*
 * time_t get_mtime(const char *path)
 *
 * Return modtime of file at location path, else return 0.
 */

static time_t get_mtime(const char *path)
{
	struct stat statbuf;

	if (stat(path, &statbuf) != 0)
		return 0;
	return statbuf.st_mtime;
}

/*
 * int set_mtime(const char *path, time_t mtime)
 *
 * Set modtime of file 'path' to 'mtime'.  Return 0 on success,
 * or -1 on error, setting errno.
 */

static int set_mtime(const char *path, time_t mtime)
{
	struct utimbuf times;

	times.actime = mtime;
	times.modtime = mtime;
	return utime(path, &times);
}

/*
 * True if two pathnames resolve to same file.
 * False if either path can not be stat'd,
 * or if the two paths resolve to a different file.
 */

static int samefile(const char *path1, const char *path2)
{
	struct stat sb1, sb2;

	if (stat(path1, &sb1) != 0)
		return 0;
	if (stat(path2, &sb2) != 0)
		return 0;
	return sb1.st_ino == sb2.st_ino && sb1.st_dev == sb2.st_dev;
}

#define slash(c) (*(c) == '/')
#define eocomp(c) (slash(c) || !*(c))
#define dot1(c) (*(c) == '.' && eocomp(c+1))

/* In place path compression.  Remove extra dots and slashes. */
static char *pathcomp(char *p)
{
	char *a = p;
	char *b = p;

	if (!p || !*p)
		return p;
	if (slash(p))
		*b++ = *a++;
	for (;;) {
		if (slash(a))
			while (slash(++a))
				continue;
		if (!*a) {
			if (b == p)
				*b++ = '.';
			*b = '\0';
			return (p);
		} else if (dot1(a)) {
			a++;
		} else {
			if ((b != p) && !slash(b - 1))
				*b++ = '/';
			while (!eocomp(a))
				*b++ = *a++;
		}
	}
}
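
/*
 * Example (illustrative only) of the in-place compression that
 * pathcomp() performs:
 *
 *	char p[] = "//dev//cpuset/./foo/";
 *	pathcomp(p);	// p now holds "/dev/cpuset/foo"
 */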

#undef slash
#undef eocomp
#undef dot1

/*
 * pathcat2(buf, buflen, name1, name2)
 *
 * Return buf, of length buflen, with name1/name2 stored in it.
 */

static char *pathcat2(char *buf, int buflen, const char *name1,
		      const char *name2)
{
	(void)snprintf(buf, buflen, "%s/%s", name1, name2);
	return pathcomp(buf);
}

/*
 * pathcat3(buf, buflen, name1, name2, name3)
 *
 * Return buf, of length buflen, with name1/name2/name3 stored in it.
 */

static char *pathcat3(char *buf, int buflen, const char *name1,
		      const char *name2, const char *name3)
{
	(void)snprintf(buf, buflen, "%s/%s/%s", name1, name2, name3);
	return pathcomp(buf);
}

/*
 * fullpath(buf, buflen, name)
 *
 * Put the full path of cpuset 'name' in buffer 'buf'.  If 'name'
 * starts with a slash ('/') character, then this is a path relative
 * to /dev/cpuset, otherwise it is relative to the current task's
 * cpuset.  Return 0 on success, else -1 on error, setting errno.
 */

static int fullpath(char *buf, int buflen, const char *name)
{
	int len;

	/* easy case */
	if (*name == '/') {
		pathcat2(buf, buflen, cpusetmnt, name);
		pathcomp(buf);
		return 0;
	}

	/* hard case */
	snprintf(buf, buflen, "%s/", cpusetmnt);
	len = strlen(buf);
	if (cpuset_getcpusetpath(0, buf + len, buflen - len) == NULL)
		return -1;
	if (strlen(buf) >= buflen - 1 - strlen(name)) {
		errno = E2BIG;
		return -1;
	}
	strcat(buf, "/");
	strcat(buf, name);
	pathcomp(buf);
	return 0;
}

/*
 * fullpath2(buf, buflen, name1, name2)
 *
 * Like fullpath(), only concatenate two pathname components on end.
 */

static int fullpath2(char *buf, int buflen, const char *name1,
		     const char *name2)
{
	if (fullpath(buf, buflen, name1) < 0)
		return -1;
	if (strlen(buf) >= buflen - 1 - strlen(name2)) {
		errno = E2BIG;
		return -1;
	}
	strcat(buf, "/");
	strcat(buf, name2);
	pathcomp(buf);
	return 0;
}

/*
 * Convert the string length of an ascii hex mask to the number
 * of bits represented by that mask.
 *
 * The cpumask and nodemask values in /proc/self/status are in an
 * ascii format that uses 9 characters for each 32 bits of mask.
 */
static int s2nbits(const char *s)
{
	return strlen(s) * 32 / 9;
}
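
/*
 * Example (illustrative only): on a kernel with a 64-bit cpumask,
 * the status line value "ffffffff,ffffffff\n" is two 8-char hex
 * words, one comma and one newline - 18 characters - and
 * 18 * 32 / 9 == 64 bits.
 */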

static void update_mask_sizes(void)
{
	FILE *fp = NULL;
	char *buf = NULL;
	int fsize;

	if ((fp = fopen(mask_size_file, "r")) == NULL)
		goto done;
	fsize = filesize(fp);
	if ((buf = malloc(fsize)) == NULL)
		goto done;

	/*
	 * Beware: mask sizing arithmetic is fussy.
	 * The trailing newline left by fgets() is required.
	 */
	while (fgets(buf, fsize, fp)) {
		if (strprefix(buf, cpumask_prefix))
			cpumask_sz = s2nbits(buf + strlen(cpumask_prefix));
		if (strprefix(buf, nodemask_prefix))
			nodemask_sz = s2nbits(buf + strlen(nodemask_prefix));
	}
done:
	free(buf);
	if (fp != NULL)
		fclose(fp);
	if (cpumask_sz == 0)
		cpumask_sz = DEFCPUBITS;
	if (nodemask_sz == 0)
		nodemask_sz = DEFNODEBITS;
}

/* Allocate a new struct cpuset */
struct cpuset *cpuset_alloc(void)
{
	struct cpuset *cp = NULL;
	int nbits;

	if ((cp = calloc(1, sizeof(struct cpuset))) == NULL)
		goto err;

	nbits = cpuset_cpus_nbits();
	if ((cp->cpus = bitmask_alloc(nbits)) == NULL)
		goto err;

	nbits = cpuset_mems_nbits();
	if ((cp->mems = bitmask_alloc(nbits)) == NULL)
		goto err;

	return cp;
err:
	if (cp && cp->cpus)
		bitmask_free(cp->cpus);
	if (cp && cp->mems)
		bitmask_free(cp->mems);
	free(cp);
	return NULL;
}

/* Free struct cpuset *cp */
void cpuset_free(struct cpuset *cp)
{
	if (!cp)
		return;
	if (cp->cpus)
		bitmask_free(cp->cpus);
	if (cp->mems)
		bitmask_free(cp->mems);
	free(cp);
}
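
/*
 * Example (illustrative only) of the allocate/load/free life cycle
 * of a struct cpuset, with error handling elided:
 *
 *	struct cpuset *cp = cpuset_alloc();
 *	if (cp != NULL) {
 *		cpuset_cpusetofpid(cp, 0);	// current task's cpuset
 *		cpuset_free(cp);
 *	}
 */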

/* Number of bits in a CPU bitmask on current system */
int cpuset_cpus_nbits(void)
{
	if (cpumask_sz == 0)
		update_mask_sizes();
	return cpumask_sz;
}

/* Number of bits in a Memory bitmask on current system */
int cpuset_mems_nbits(void)
{
	if (nodemask_sz == 0)
		update_mask_sizes();
	return nodemask_sz;
}

/* Set CPUs in cpuset cp to bitmask cpus */
int cpuset_setcpus(struct cpuset *cp, const struct bitmask *cpus)
{
	if (cp->cpus)
		bitmask_free(cp->cpus);
	cp->cpus = bitmask_alloc(bitmask_nbits(cpus));
	if (cp->cpus == NULL)
		return -1;
	bitmask_copy(cp->cpus, cpus);
	cp->cpus_valid = 1;
	cp->cpus_dirty = 1;
	return 0;
}

/* Set Memory Nodes in cpuset cp to bitmask mems */
int cpuset_setmems(struct cpuset *cp, const struct bitmask *mems)
{
	if (cp->mems)
		bitmask_free(cp->mems);
	cp->mems = bitmask_alloc(bitmask_nbits(mems));
	if (cp->mems == NULL)
		return -1;
	bitmask_copy(cp->mems, mems);
	cp->mems_valid = 1;
	cp->mems_dirty = 1;
	return 0;
}

/* Set integer value optname of cpuset cp */
int cpuset_set_iopt(struct cpuset *cp, const char *optionname, int value)
{
	if (streq(optionname, "cpu_exclusive")) {
		cp->cpu_exclusive = !!value;
		cp->cpu_exclusive_valid = 1;
		cp->cpu_exclusive_dirty = 1;
	} else if (streq(optionname, "mem_exclusive")) {
		cp->mem_exclusive = !!value;
		cp->mem_exclusive_valid = 1;
		cp->mem_exclusive_dirty = 1;
	} else if (streq(optionname, "mem_hardwall")) {
		cp->mem_hardwall = !!value;
		cp->mem_hardwall_valid = 1;
		cp->mem_hardwall_dirty = 1;
	} else if (streq(optionname, "notify_on_release")) {
		cp->notify_on_release = !!value;
		cp->notify_on_release_valid = 1;
		cp->notify_on_release_dirty = 1;
	} else if (streq(optionname, "memory_pressure_enabled")) {
		cp->memory_pressure_enabled = !!value;
		cp->memory_pressure_enabled_valid = 1;
		cp->memory_pressure_enabled_dirty = 1;
	} else if (streq(optionname, "memory_migrate")) {
		cp->memory_migrate = !!value;
		cp->memory_migrate_valid = 1;
		cp->memory_migrate_dirty = 1;
	} else if (streq(optionname, "memory_spread_page")) {
		cp->memory_spread_page = !!value;
		cp->memory_spread_page_valid = 1;
		cp->memory_spread_page_dirty = 1;
	} else if (streq(optionname, "memory_spread_slab")) {
		cp->memory_spread_slab = !!value;
		cp->memory_spread_slab_valid = 1;
		cp->memory_spread_slab_dirty = 1;
	} else if (streq(optionname, "sched_load_balance")) {
		cp->sched_load_balance = !!value;
		cp->sched_load_balance_valid = 1;
		cp->sched_load_balance_dirty = 1;
	} else if (streq(optionname, "sched_relax_domain_level")) {
		cp->sched_relax_domain_level = value;
		cp->sched_relax_domain_level_valid = 1;
		cp->sched_relax_domain_level_dirty = 1;
	} else
		return -2;	/* optionname not recognized */
	return 0;
}
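
/*
 * Example (illustrative only): queue up boolean settings on an
 * in-memory cpuset before applying it; unrecognized option names
 * return -2:
 *
 *	cpuset_set_iopt(cp, "memory_spread_page", 1);
 *	cpuset_set_iopt(cp, "memory_spread_slab", 1);
 */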

/* [optional] Set string value optname */
int cpuset_set_sopt(UNUSED struct cpuset *cp, UNUSED const char *optionname,
		    UNUSED const char *value)
{
	return -2;		/* For now, all string options unrecognized */
}

/* Return handle for reading memory_pressure. */
int cpuset_open_memory_pressure(const char *cpusetpath)
{
	char buf[PATH_MAX];

	fullpath2(buf, sizeof(buf), cpusetpath, "memory_pressure");
	return open(buf, O_RDONLY);
}

/* Return current memory_pressure of cpuset. */
int cpuset_read_memory_pressure(int han)
{
	char buf[SMALL_BUFSZ];

	if (pread(han, buf, sizeof(buf), 0L) < 0)
		return -1;
	return atoi(buf);
}

/* Close handle for reading memory pressure. */
void cpuset_close_memory_pressure(int han)
{
	close(han);
}
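
/*
 * Example (illustrative only) of the open/read/close memory_pressure
 * cycle, polling the root cpuset once:
 *
 *	int han = cpuset_open_memory_pressure("/");
 *	if (han >= 0) {
 *		int p = cpuset_read_memory_pressure(han);
 *		cpuset_close_memory_pressure(han);
 *	}
 */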

/*
 * Resolve cpuset pointer (to that of current task if cp == NULL).
 *
 * If cp not NULL, just return it.  If cp is NULL, return pointer
 * to temporary cpuset for current task, and set *cp_tofree to
 * pointer to that same temporary cpuset, to be freed later.
 *
 * Return NULL and set errno on error.  Errors can occur when
 * resolving the current task's cpuset.
 */
static const struct cpuset *resolve_cp(const struct cpuset *cp,
				       struct cpuset **cp_tofree)
{
	const struct cpuset *rcp;

	if (cp) {
		rcp = cp;
	} else {
		struct cpuset *cp1 = cpuset_alloc();
		if (cp1 == NULL)
			goto err;
		if (cpuset_cpusetofpid(cp1, 0) < 0) {
			cpuset_free(cp1);
			goto err;
		}
		*cp_tofree = cp1;
		rcp = cp1;
	}
	return rcp;
err:
	return NULL;
}

/* Write CPUs in cpuset cp (current task if cp == NULL) to bitmask cpus */
int cpuset_getcpus(const struct cpuset *cp, struct bitmask *cpus)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);

	if (!cp1)
		goto err;
	if (cp1->cpus == NULL) {
		errno = EINVAL;
		goto err;
	}
	bitmask_copy(cpus, cp1->cpus);
	cpuset_free(cp_tofree);
	return 0;
err:
	cpuset_free(cp_tofree);
	return -1;
}

/* Write Memory Nodes in cp (current task if cp == NULL) to bitmask mems */
int cpuset_getmems(const struct cpuset *cp, struct bitmask *mems)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);

	if (!cp1)
		goto err;
	if (cp1->mems == NULL) {
		errno = EINVAL;
		goto err;
	}
	bitmask_copy(mems, cp1->mems);
	cpuset_free(cp_tofree);
	return 0;
err:
	cpuset_free(cp_tofree);
	return -1;
}

/* Return number of CPUs in cpuset cp (current task if cp == NULL) */
int cpuset_cpus_weight(const struct cpuset *cp)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
	int w = -1;

	if (!cp1)
		goto err;
	if (cp1->cpus == NULL) {
		errno = EINVAL;
		goto err;
	}
	w = bitmask_weight(cp1->cpus);
	/* fall into ... */
err:
	cpuset_free(cp_tofree);
	return w;
}

/* Return number of Memory Nodes in cpuset cp (current task if cp == NULL) */
int cpuset_mems_weight(const struct cpuset *cp)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
	int w = -1;

	if (!cp1)
		goto err;
	if (cp1->mems == NULL) {
		errno = EINVAL;
		goto err;
	}
	w = bitmask_weight(cp1->mems);
	/* fall into ... */
err:
	cpuset_free(cp_tofree);
	return w;
}

/* Return integer value of option optname in cp */
int cpuset_get_iopt(const struct cpuset *cp, const char *optionname)
{
	if (streq(optionname, "cpu_exclusive"))
		return cp->cpu_exclusive;
	else if (streq(optionname, "mem_exclusive"))
		return cp->mem_exclusive;
	else if (streq(optionname, "mem_hardwall"))
		return cp->mem_hardwall;
	else if (streq(optionname, "notify_on_release"))
		return cp->notify_on_release;
	else if (streq(optionname, "memory_pressure_enabled"))
		return cp->memory_pressure_enabled;
	else if (streq(optionname, "memory_migrate"))
		return cp->memory_migrate;
	else if (streq(optionname, "memory_spread_page"))
		return cp->memory_spread_page;
	else if (streq(optionname, "memory_spread_slab"))
		return cp->memory_spread_slab;
	else if (streq(optionname, "sched_load_balance"))
		return cp->sched_load_balance;
	else if (streq(optionname, "sched_relax_domain_level"))
		return cp->sched_relax_domain_level;
	else
		return -2;	/* optionname not recognized */
}

/* [optional] Return string value of optname */
const char *cpuset_get_sopt(UNUSED const struct cpuset *cp,
			    UNUSED const char *optionname)
{
	return NULL;		/* For now, all string options unrecognized */
}

static int read_flag(const char *filepath, char *flagp)
{
	char buf[SMALL_BUFSZ];	/* buffer a "0" or "1" flag line */
	int fd = -1;

	if ((fd = open(filepath, O_RDONLY)) < 0)
		goto err;
	if (read(fd, buf, sizeof(buf)) < 1)
		goto err;
	if (atoi(buf))
		*flagp = 1;
	else
		*flagp = 0;
	close(fd);
	return 0;
err:
	if (fd >= 0)
		close(fd);
	return -1;
}

static int load_flag(const char *path, char *flagp, const char *flag)
{
	char buf[PATH_MAX];

	pathcat2(buf, sizeof(buf), path, flag);
	return read_flag(buf, flagp);
}

static int read_number(const char *filepath, int *numberp)
{
	char buf[SMALL_BUFSZ];
	int fd = -1;

	if ((fd = open(filepath, O_RDONLY)) < 0)
		goto err;
	if (read(fd, buf, sizeof(buf)) < 1)
		goto err;
	*numberp = atoi(buf);
	close(fd);
	return 0;
err:
	if (fd >= 0)
		close(fd);
	return -1;
}

static int load_number(const char *path, int *numberp, const char *file)
{
	char buf[PATH_MAX];

	pathcat2(buf, sizeof(buf), path, file);
	return read_number(buf, numberp);
}

static int read_mask(const char *filepath, struct bitmask **bmpp, int nbits)
{
	FILE *fp = NULL;
	char *buf = NULL;
	int buflen;
	struct bitmask *bmp = NULL;

	if ((fp = fopen(filepath, "r")) == NULL)
		goto err;
	buflen = filesize(fp) + 1;	/* + 1 for nul term */
	if ((buf = malloc(buflen)) == NULL)
		goto err;
	if (flgets(buf, buflen, fp) == NULL)
		goto err;
	fclose(fp);
	fp = NULL;

	if ((bmp = bitmask_alloc(nbits)) == NULL)
		goto err;
	if (*buf && bitmask_parselist(buf, bmp) < 0)
		goto err;
	if (*bmpp)
		bitmask_free(*bmpp);
	*bmpp = bmp;
	free(buf);
	buf = NULL;
	return 0;
err:
	if (buf != NULL)
		free(buf);
	if (fp != NULL)
		fclose(fp);
	if (bmp != NULL)
		bitmask_free(bmp);
	return -1;
}

static int load_mask(const char *path, struct bitmask **bmpp,
		     int nbits, const char *mask)
{
	char buf[PATH_MAX];

	pathcat2(buf, sizeof(buf), path, mask);
	return read_mask(buf, bmpp, nbits);
}

/* Write string to file at given filepath.  Create or truncate file. */
static int write_string_file(const char *filepath, const char *str)
{
	int fd = -1;

	if ((fd = open(filepath, O_WRONLY | O_CREAT, 0644)) < 0)
		goto err;
	if (write(fd, str, strlen(str)) < 0)
		goto err;
	close(fd);
	return 0;
err:
	if (fd >= 0)
		close(fd);
	return -1;
}

/* Size and allocate buffer.  Write bitmask into it.  Caller must free */
static char *sprint_mask_buf(const struct bitmask *bmp)
{
	char *buf = NULL;
	int buflen;
	char c;

	/* First bitmask_displaylist() call just to get the length */
	buflen = bitmask_displaylist(&c, 1, bmp) + 1;	/* "+ 1" for nul */
	if ((buf = malloc(buflen)) == NULL)
		return NULL;
	bitmask_displaylist(buf, buflen, bmp);
	return buf;
}
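
/*
 * Example (illustrative only): the two-pass pattern above sizes the
 * buffer with one bitmask_displaylist() call, then fills it with a
 * second:
 *
 *	char *s = sprint_mask_buf(bmp);	// e.g. "0-3,8"
 *	if (s != NULL) {
 *		puts(s);
 *		free(s);
 *	}
 */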

static int exists_flag(const char *path, const char *flag)
{
	char buf[PATH_MAX];
	struct stat statbuf;
	int rc;

	pathcat2(buf, sizeof(buf), path, flag);
	rc = (stat(buf, &statbuf) == 0);
	errno = 0;
	return rc;
}

static int store_flag(const char *path, const char *flag, int val)
{
	char buf[PATH_MAX];

	pathcat2(buf, sizeof(buf), path, flag);
	return write_string_file(buf, val ? "1" : "0");
}

static int store_number(const char *path, const char *file, int val)
{
	char buf[PATH_MAX];
	char data[SMALL_BUFSZ];

	memset(data, 0, sizeof(data));
	pathcat2(buf, sizeof(buf), path, file);
	snprintf(data, sizeof(data), "%d", val);
	return write_string_file(buf, data);
}

static int store_mask(const char *path, const char *mask,
		      const struct bitmask *bmp)
{
	char maskpath[PATH_MAX];
	char *bp = NULL;
	int rc;

	if (bmp == NULL)
		return 0;
	pathcat2(maskpath, sizeof(maskpath), path, mask);
	if ((bp = sprint_mask_buf(bmp)) == NULL)
		return -1;
	rc = write_string_file(maskpath, bp);
	free(bp);
	return rc;
}

/*
 * Return 1 if 'cpu' is online, else 0 if offline.  Tests the file
 * /sys/devices/system/cpu/cpuN/online for contents 0 or 1, where
 * N == cpu number.
 */

char cpu_online(unsigned int cpu)
{
	char online;
	char cpupath[PATH_MAX];

	(void)snprintf(cpupath, sizeof(cpupath),
		       "/sys/devices/system/cpu/cpu%d/online", cpu);
	if (read_flag(cpupath, &online) < 0)
		return 0;	/* oops - guess that cpu's not there */
	return online;
}
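
/*
 * Example (illustrative only): count how many of the possible CPUs
 * are currently online:
 *
 *	int cpu, n = 0, ncpus = cpuset_cpus_nbits();
 *	for (cpu = 0; cpu < ncpus; cpu++)
 *		if (cpu_online(cpu))
 *			n++;
 */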

/*
 * The cpunodemap maps each cpu in [0 ... cpuset_cpus_nbits()) to
 * the node on which that cpu resides, or to cpuset_mems_nbits() if
 * that node is not known.
 *
 * To avoid every user having to recalculate this relation
 * from various clues in the sysfs file system (below the
 * path /sys/devices/system) a copy of this map is kept at
 * /var/run/cpunodemap.
 *
 * The system automatically cleans out files below
 * /var/run on each system reboot (see the init script
 * /etc/rc.d/boot.d/S*boot.localnet), so we don't have to worry
 * about stale data in this file across reboots.  If the file
 * is missing, let the first process that needs it, and has
 * permission to write in the /var/run directory, rebuild it.
 *
 * If using this cached data, remember the mtime of the mapfile
 * the last time we read it in case something like a hotplug
 * event results in the file being removed and rebuilt, so we
 * can detect if we're using a stale cache, and need to reload.
 *
 * The mtime of this file is set to the time when we did
 * the recalculation of the map, from the clues beneath
 * /sys/devices/system.  This is done so that a program
 * won't see the mapfile it just wrote as being newer than what
 * it just wrote out (store_map) and read the same map back in
 * (load_map).
 */

/*
 * Hold flockfile(stdin) while using cpunodemap for posix thread safety.
 *
 * Note on locking and flockfile(FILE *):
 *
 * We use flockfile() and funlockfile() instead of directly
 * calling pthread_mutex_lock and pthread_mutex_unlock on
 * a pthread_mutex_t, because this avoids forcing the app
 * to link with libpthread.  The glibc implementation of
 * flockfile/funlockfile will fall back to no-ops if libpthread
 * doesn't happen to be linked.
 *
 * Since flockfile already has the moderately convoluted
 * combination of weak and strong symbols required to accomplish
 * this, it is easier to use flockfile() on some handy FILE *
 * stream as a surrogate for pthread locking than it is to so
 * re-invent that wheel.
 *
 * Forcing all apps that use cpusets to link with libpthread
 * would force non-transparent initialization on apps that
 * might not be prepared to handle it.
 *
 * The application using libcpuset should never notice this
 * odd use of flockfile(), because we never return to the
 * application from any libcpuset call with any such lock held.
 * We just use this locking for guarding some non-atomic cached
 * data updates and accesses, internal to some libcpuset calls.
 * Also, flockfile() allows recursive nesting, so if the app
 * calls libcpuset holding such a file lock, we won't deadlock
 * if we go to acquire the same lock.  We'll just get the lock
 * and increment its counter while we hold it.
 */

static struct cpunodemap {
	int *map;		/* map[cpumask_sz]: maps cpu to its node */
	time_t mtime;		/* modtime of mapfile when last read */
} cpunodemap;

/*
 * rebuild_map() - Rebuild cpunodemap[] from scratch.
 *
 * Situation:
 *	Neither our in-memory cpunodemap[] array nor the
 *	cache of it in mapfile is current.
 * Action:
 *	Rebuild it from first principles and the information
 *	available below /sys/devices/system.
 */

static void rebuild_map(void)
{
	char buf[PATH_MAX];
	DIR *dir1, *dir2;
	struct dirent *dent1, *dent2;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	unsigned int cpu, mem;

	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++)
		cpunodemap.map[cpu] = -1;
	pathcat2(buf, sizeof(buf), sysdevices, "node");
	if ((dir1 = opendir(buf)) == NULL)
		return;
	while ((dent1 = readdir(dir1)) != NULL) {
		if (sscanf(dent1->d_name, "node%u", &mem) < 1)
			continue;
		pathcat3(buf, sizeof(buf), sysdevices, "node", dent1->d_name);
		if ((dir2 = opendir(buf)) == NULL)
			continue;
		while ((dent2 = readdir(dir2)) != NULL) {
			if (sscanf(dent2->d_name, "cpu%u", &cpu) < 1)
				continue;
			if (cpu >= (unsigned int)ncpus
			    || mem >= (unsigned int)nmems)
				continue;
			cpunodemap.map[cpu] = mem;
		}
		closedir(dir2);
	}
	closedir(dir1);
	cpunodemap.mtime = time(0);
}

/*
 * load_map() - Load cpunodemap[] from mapfile.
 *
 * Situation:
 *	The cpunodemap in mapfile is more recent than
 *	what we have in the cpunodemap[] array.
 * Action:
 *	Reload the cpunodemap[] array from the file.
 */

static void load_map(void)
{
	char buf[SMALL_BUFSZ];	/* buffer 1 line of mapfile */
	FILE *mapfp;		/* File stream on mapfile */
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	unsigned int cpu, mem;

	/* get_map() normally allocates the map; don't leak that here */
	if (cpunodemap.map == NULL &&
	    (cpunodemap.map = calloc(ncpus, sizeof(int))) == NULL)
		return;
	cpunodemap.mtime = get_mtime(mapfile);
	if ((mapfp = fopen(mapfile, "r")) == NULL)
		return;
	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++)
		cpunodemap.map[cpu] = nmems;
	while (flgets(buf, sizeof(buf), mapfp) != NULL) {
		if (sscanf(buf, "%u %u", &cpu, &mem) < 2)
			continue;
		if (cpu >= (unsigned int)ncpus || mem >= (unsigned int)nmems)
			continue;
		cpunodemap.map[cpu] = mem;
	}
	fclose(mapfp);
}

/*
 * store_map() - Write cpunodemap[] out to mapfile.
 *
 * Situation:
 *	The cpunodemap in the cpunodemap[] array is
 *	more recent than the one in mapfile.
 * Action:
 *	Write cpunodemap[] out to mapfile.
 */

static void store_map(void)
{
	char buf[PATH_MAX];
	int fd = -1;
	FILE *mapfp = NULL;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	unsigned int cpu, mem;

	snprintf(buf, sizeof(buf), "%s.%s", mapfile, "XXXXXX");
	if ((fd = mkstemp(buf)) < 0)
		goto err;
	if ((mapfp = fdopen(fd, "w")) == NULL)
		goto err;
	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) {
		mem = cpunodemap.map[cpu];
		if (mem < (unsigned int)nmems)
			fprintf(mapfp, "%u %u\n", cpu, mem);
	}
	fclose(mapfp);
	set_mtime(buf, cpunodemap.mtime);
	if (rename(buf, mapfile) < 0)
		goto err;
	/* mkstemp() creates mode 0600 - change to world readable */
	(void)chmod(mapfile, 0444);
	return;
err:
	if (mapfp != NULL) {
		fclose(mapfp);
		fd = -1;
	}
	if (fd >= 0)
		close(fd);
	(void)unlink(buf);
}

/*
 * Load and gain thread safe access to the <cpu, node> map.
 *
 * Return 0 on success with flockfile(stdin) held.
 * Each successful get_map() call must be matched with a
 * following put_map() call to release the lock.
 *
 * On error, return -1 with errno set and no lock held.
 */

static int get_map(void)
{
	time_t file_mtime;

	flockfile(stdin);

	if (cpunodemap.map == NULL) {
		cpunodemap.map = calloc(cpuset_cpus_nbits(), sizeof(int));
		if (cpunodemap.map == NULL)
			goto err;
	}

	/* If no one has a good cpunodemap, rebuild from scratch */
	file_mtime = get_mtime(mapfile);
	if (cpunodemap.mtime == 0 && file_mtime == 0)
		rebuild_map();

	/* If either cpunodemap[] or mapfile newer, update other with it */
	file_mtime = get_mtime(mapfile);
	if (cpunodemap.mtime < file_mtime)
		load_map();
	else if (cpunodemap.mtime > file_mtime)
		store_map();
	return 0;
err:
	funlockfile(stdin);
	return -1;
}

static void put_map(void)
{
	funlockfile(stdin);
}
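
/*
 * Example (illustrative only) of the required get_map()/put_map()
 * pairing around any direct access to cpunodemap:
 *
 *	if (get_map() == 0) {
 *		int node = cpunodemap.map[0];	// node of cpu 0
 *		put_map();
 *	}
 */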

/* Set cpus to those local to Memory Nodes mems */
int cpuset_localcpus(const struct bitmask *mems, struct bitmask *cpus)
{
	int ncpus = cpuset_cpus_nbits();
	unsigned int cpu;

	if (check() < 0)
		return -1;

	get_map();
	bitmask_clearall(cpus);
	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) {
		if (bitmask_isbitset(mems, cpunodemap.map[cpu]))
			bitmask_setbit(cpus, cpu);
	}
	put_map();
	return 0;
}

/* Set mems to those local to CPUs cpus */
int cpuset_localmems(const struct bitmask *cpus, struct bitmask *mems)
{
	int ncpus = cpuset_cpus_nbits();
	unsigned int cpu;

	if (check() < 0)
		return -1;

	get_map();
	bitmask_clearall(mems);
	for (cpu = 0; cpu < (unsigned int)ncpus; cpu++) {
		if (bitmask_isbitset(cpus, cpu))
			bitmask_setbit(mems, cpunodemap.map[cpu]);
	}
	put_map();
	return 0;
}
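
/*
 * Example (illustrative only): expand a set of CPUs to all CPUs
 * sharing the same node(s), by composing the two helpers above:
 *
 *	struct bitmask *mems = bitmask_alloc(cpuset_mems_nbits());
 *	struct bitmask *near = bitmask_alloc(cpuset_cpus_nbits());
 *	cpuset_localmems(cpus, mems);	// cpus -> their nodes
 *	cpuset_localcpus(mems, near);	// nodes -> all their cpus
 */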

/*
 * distmap[]
 *
 * Array of ints of size cpumask_sz by nodemask_sz.
 *
 * Element distmap[cpu][mem] is the distance between CPU cpu
 * and Memory Node mem.  Distances are weighted to roughly
 * approximate the cost of memory references, and scaled so that
 * the distance from a CPU to its local Memory Node is ten (10).
 *
 * The first call to cpuset_cpumemdist() builds this map, from
 * whatever means the kernel provides to obtain these distances.
 *
 * These distances derive from ACPI SLIT table entries, which are
 * eight bits in size.
 *
 * Hold flockfile(stdout) while using distmap for posix thread safety.
 */

typedef unsigned char distmap_entry_t;	/* type of distmap[] entries */

static distmap_entry_t *distmap;	/* maps <cpu, mem> to distance */

#define DISTMAP_MAX UCHAR_MAX	/* maximum value in distmap[] */

#define I(i,j) ((i) * nmems + (j))	/* 2-D array index simulation */
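
/*
 * Example (illustrative only): with nmems == 4, I(2, 3) == 11, the
 * row-major cell for <cpu 2, mem 3> in the flat distmap[] array of
 * ncpus * nmems entries.
 */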

/*
 * Parse arch neutral lines from 'distance' files of form:
 *
 *	46 66 10 20
 *
 * The lines contain a space separated list of distances, which is parsed
 * into array dists[] of each node's distance from the specified node.
 *
 * Result is placed in distmap[ncpus][nmems]:
 *
 * For each cpu c on node:
 *	For each node position n in list of distances:
 *		distmap[c][n] = dists[n]
 */

static int parse_distmap_line(unsigned int node, char *buf)
{
	char *p, *q;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	unsigned int c, n;
	distmap_entry_t *dists = NULL;
	struct bitmask *cpus = NULL, *mems = NULL;
	int ret = -1;

	p = buf;
	if ((dists = calloc(nmems, sizeof(*dists))) == NULL)
		goto err;
	for (n = 0; n < (unsigned int)nmems; n++)
		dists[n] = DISTMAP_MAX;

	for (n = 0; n < (unsigned int)nmems && *p; n++, p = q) {
		unsigned int d;

		if ((p = strpbrk(p, "0123456789")) == NULL)
			break;
		d = strtoul(p, &q, 10);
		if (p == q)
			break;
		if (d < DISTMAP_MAX)
			dists[n] = (distmap_entry_t) d;
	}

	if ((mems = bitmask_alloc(nmems)) == NULL)
		goto err;
	bitmask_setbit(mems, node);

	if ((cpus = bitmask_alloc(ncpus)) == NULL)
		goto err;
	cpuset_localcpus(mems, cpus);

	for (c = bitmask_first(cpus); c < (unsigned int)ncpus;
	     c = bitmask_next(cpus, c + 1))
		for (n = 0; n < (unsigned int)nmems; n++)
			distmap[I(c, n)] = dists[n];
	ret = 0;
	/* fall into ... */
err:
	bitmask_free(mems);
	bitmask_free(cpus);
	free(dists);
	return ret;
}

static int parse_distance_file(unsigned int node, const char *path)
{
	FILE *fp;
	char *buf = NULL;
	int buflen;

	if ((fp = fopen(path, "r")) == NULL)
		goto err;

	buflen = filesize(fp);

	if ((buf = malloc(buflen)) == NULL)
		goto err;

	if (flgets(buf, buflen, fp) == NULL)
		goto err;

	if (parse_distmap_line(node, buf) < 0)
		goto err;

	free(buf);
	fclose(fp);
	return 0;
err:
	free(buf);
	if (fp)
		fclose(fp);
	return -1;
}

static void build_distmap(void)
{
	static int tried_before = 0;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	int c, m;
	DIR *dir = NULL;
	struct dirent *dent;

	if (tried_before)
		goto err;
	tried_before = 1;

	if ((distmap = calloc(ncpus * nmems, sizeof(*distmap))) == NULL)
		goto err;

	for (c = 0; c < ncpus; c++)
		for (m = 0; m < nmems; m++)
			distmap[I(c, m)] = DISTMAP_MAX;

	if ((dir = opendir(distance_directory)) == NULL)
		goto err;
	while ((dent = readdir(dir)) != NULL) {
		char buf[PATH_MAX];
		unsigned int node;

		if (sscanf(dent->d_name, "node%u", &node) < 1)
			continue;
		pathcat3(buf, sizeof(buf), distance_directory, dent->d_name,
			 "distance");
		if (parse_distance_file(node, buf) < 0)
			goto err;
	}
	closedir(dir);
	return;
err:
	if (dir)
		closedir(dir);
	free(distmap);
	distmap = NULL;
}

#ifdef ALTERNATE_SN_DISTMAP

/*
 * Parse SN architecture specific line of form:
 *
 *	node 3 001c14#1 local asic SHub_1.1, nasid 0x6, dist 66:46:20:10
 *
 * Second field is node number.  The "dist" field is the colon separated
 * list of distances, which is parsed into array dists[] of each node's
 * distance from that node.
 *
 * Result is placed in distmap[ncpus][nmems]:
 *
 * For each cpu c on that node:
 *	For each node position n in list of distances:
 *		distmap[c][n] = dists[n]
 */

static void parse_distmap_line_sn(char *buf)
{
	char *p, *pend, *q;
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	unsigned long c, n, node;
	distmap_entry_t *dists = NULL;
	struct bitmask *cpus = NULL, *mems = NULL;

	if ((p = strchr(buf, ' ')) == NULL)
		goto err;
	if ((node = strtoul(p, &q, 10)) >= (unsigned int)nmems)
		goto err;
	if ((p = strstr(q, " dist ")) == NULL)
		goto err;
	p += strlen(" dist ");
	if ((pend = strchr(p, ' ')) != NULL)
		*pend = '\0';
	if ((dists = calloc(nmems, sizeof(*dists))) == NULL)
		goto err;
	for (n = 0; n < (unsigned int)nmems; n++)
		dists[n] = DISTMAP_MAX;

	for (n = 0; n < (unsigned int)nmems && *p; n++, p = q) {
		unsigned long d;

		if ((p = strpbrk(p, "0123456789")) == NULL)
			break;
		d = strtoul(p, &q, 10);
		if (p == q)
			break;
		if (d < DISTMAP_MAX)
			dists[n] = (distmap_entry_t) d;
	}

	if ((mems = bitmask_alloc(nmems)) == NULL)
		goto err;
	bitmask_setbit(mems, node);

	if ((cpus = bitmask_alloc(ncpus)) == NULL)
		goto err;
	cpuset_localcpus(mems, cpus);

	for (c = bitmask_first(cpus); c < (unsigned int)ncpus;
	     c = bitmask_next(cpus, c + 1))
		for (n = 0; n < (unsigned int)nmems; n++)
			distmap[I(c, n)] = dists[n];
	/* fall into ... */
err:
	bitmask_free(mems);
	bitmask_free(cpus);
	free(dists);
}

static void build_distmap_sn(void)
{
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	int c, m;
	static int tried_before = 0;
	FILE *fp = NULL;
	char *buf = NULL;
	int buflen;

	if (tried_before)
		goto err;
	tried_before = 1;

	if ((fp = fopen(sn_topology, "r")) == NULL)
		goto err;

	if ((distmap = calloc(ncpus * nmems, sizeof(*distmap))) == NULL)
		goto err;

	for (c = 0; c < ncpus; c++)
		for (m = 0; m < nmems; m++)
			distmap[I(c, m)] = DISTMAP_MAX;

	buflen = filesize(fp);
	if ((buf = malloc(buflen)) == NULL)
		goto err;

	while (flgets(buf, buflen, fp) != NULL)
		if (strprefix(buf, sn_top_node_prefix))
			parse_distmap_line_sn(buf);

	free(buf);
	fclose(fp);
	return;
err:
	free(buf);
	free(distmap);
	distmap = NULL;
	if (fp)
		fclose(fp);
}

#endif

/* [optional] Hardware distance from CPU to Memory Node */
unsigned int cpuset_cpumemdist(int cpu, int mem)
{
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	distmap_entry_t r = DISTMAP_MAX;

	flockfile(stdout);

	if (check() < 0)
		goto err;

	if (distmap == NULL)
		build_distmap();

#ifdef ALTERNATE_SN_DISTMAP
	if (distmap == NULL)
		build_distmap_sn();
#endif

	if (distmap == NULL)
		goto err;

	if (cpu < 0 || cpu >= ncpus || mem < 0 || mem >= nmems)
		goto err;

	r = distmap[I(cpu, mem)];
	/* fall into ... */
err:
	funlockfile(stdout);
	return r;
}

/* [optional] Return Memory Node closest to cpu */
int cpuset_cpu2node(int cpu)
{
	int ncpus = cpuset_cpus_nbits();
	int nmems = cpuset_mems_nbits();
	struct bitmask *cpus = NULL, *mems = NULL;
	int r = -1;

	if (check() < 0)
		goto err;

	if ((cpus = bitmask_alloc(ncpus)) == NULL)
		goto err;
	bitmask_setbit(cpus, cpu);

	if ((mems = bitmask_alloc(nmems)) == NULL)
		goto err;
	cpuset_localmems(cpus, mems);
	r = bitmask_first(mems);
	/* fall into ... */
err:
	bitmask_free(cpus);
	bitmask_free(mems);
	return r;
}
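
/*
 * Example (illustrative only): by the scaling described above, a
 * CPU's distance to its own node is 10, when a distance map is
 * available:
 *
 *	int node = cpuset_cpu2node(0);			// node nearest cpu 0
 *	unsigned int d = cpuset_cpumemdist(0, node);	// expect 10
 */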
1732
apply_cpuset_settings(const char * path,const struct cpuset * cp)1733 static int apply_cpuset_settings(const char *path, const struct cpuset *cp)
1734 {
1735 if (cp->cpu_exclusive_valid && cp->cpu_exclusive_dirty) {
1736 if (store_flag(path, "cpu_exclusive", cp->cpu_exclusive) < 0)
1737 goto err;
1738 }
1739
1740 if (cp->mem_exclusive_valid && cp->mem_exclusive_dirty) {
1741 if (store_flag(path, "mem_exclusive", cp->mem_exclusive) < 0)
1742 goto err;
1743 }
1744
1745 if (cp->mem_hardwall_valid && cp->mem_hardwall_dirty) {
1746 if (store_flag(path, "mem_hardwall", cp->mem_hardwall) < 0)
1747 goto err;
1748 }
1749
1750 if (cp->notify_on_release_valid && cp->notify_on_release_dirty) {
1751 if (store_flag(path, "notify_on_release", cp->notify_on_release)
1752 < 0)
1753 goto err;
1754 }
1755
1756 if (cp->memory_migrate_valid &&
1757 cp->memory_migrate_dirty && exists_flag(path, "memory_migrate")) {
1758 if (store_flag(path, "memory_migrate", cp->memory_migrate) < 0)
1759 goto err;
1760 }
1761
1762 if (cp->memory_pressure_enabled_valid &&
1763 cp->memory_pressure_enabled_dirty &&
1764 exists_flag(path, "memory_pressure_enabled")) {
1765 if (store_flag
1766 (path, "memory_pressure_enabled",
1767 cp->memory_pressure_enabled) < 0)
1768 goto err;
1769 }
1770
1771 if (cp->memory_spread_page_valid &&
1772 cp->memory_spread_page_dirty &&
1773 exists_flag(path, "memory_spread_page")) {
1774 if (store_flag
1775 (path, "memory_spread_page", cp->memory_spread_page) < 0)
1776 goto err;
1777 }
1778
1779 if (cp->memory_spread_slab_valid &&
1780 cp->memory_spread_slab_dirty &&
1781 exists_flag(path, "memory_spread_slab")) {
1782 if (store_flag
1783 (path, "memory_spread_slab", cp->memory_spread_slab) < 0)
1784 goto err;
1785 }
1786
1787 if (cp->sched_load_balance_valid &&
1788 cp->sched_load_balance_dirty &&
1789 exists_flag(path, "sched_load_balance")) {
1790 if (store_flag
1791 (path, "sched_load_balance", cp->sched_load_balance) < 0)
1792 goto err;
1793 }
1794
1795 if (cp->sched_relax_domain_level_valid &&
1796 cp->sched_relax_domain_level_dirty &&
1797 exists_flag(path, "sched_relax_domain_level")) {
1798 if (store_number
1799 (path, "sched_relax_domain_level",
1800 cp->sched_relax_domain_level) < 0)
1801 goto err;
1802 }
1803
1804 if (cp->cpus_valid && cp->cpus_dirty) {
1805 if (store_mask(path, "cpus", cp->cpus) < 0)
1806 goto err;
1807 }
1808
1809 if (cp->mems_valid && cp->mems_dirty) {
1810 if (store_mask(path, "mems", cp->mems) < 0)
1811 goto err;
1812 }
1813 return 0;
1814 err:
1815 return -1;
1816 }
1817
1818 /*
1819 * get_siblings() - helper routine for cpuset_would_crash_kernel(), below.
1820 *
1821 * Extract max value of any 'siblings' field in /proc/cpuinfo.
1822 * Cache the result - only need to extract once in lifetime of task.
1823 *
1824 * The siblings field is the number of logical CPUs in a physical
1825 * processor package. It is equal to the product of the number of
1826 * cores in that package, times the number of hyper-threads per core.
1827 * The bug that cpuset_would_crash_kernel() is detecting arises
1828 * when a cpu_exclusive cpuset tries to include just some, not all,
1829 * of the sibling logical CPUs available in a processor package.
1830 *
1831 * In the improbable case that a system has mixed values of siblings
1832 * (some processor packages have more than others, perhaps due to
1833 * partially enabling Hyper-Threading), we take the worse case value,
1834 * the largest siblings value. This might be overkill. I don't know
1835 * if this kernel bug considers each processor package's siblings
1836 * separately or not. But it sure is easier this way ...
1837 *
1838 * This routine takes about 0.7 msecs on a 4 CPU 2.8 MHz Xeon, from
1839 * open to close, the first time called.
1840 */
1841
static int get_siblings(void)
{
	static int siblings;
	char buf[32];		/* big enough for one 'siblings' line */
	FILE *fp;

	if (siblings)
		return siblings;

	if ((fp = fopen("/proc/cpuinfo", "r")) == NULL)
		return 4;	/* wing it - /proc not mounted ? */
	while (flgets(buf, sizeof(buf), fp) != NULL) {
		int s;

		if (sscanf(buf, "siblings : %d", &s) < 1)
			continue;
		if (s > siblings)
			siblings = s;
	}
	fclose(fp);
	if (siblings == 0)
		siblings = 1;	/* old kernel, no siblings, default to 1 */
	return siblings;
}

/*
 * Some 2.6.16 and 2.6.17 kernel versions have a bug in the dynamic
 * scheduler domain code invoked for cpu_exclusive cpusets that causes
 * the kernel to freeze, requiring a hardware reset.
 *
 * On kernels built with CONFIG_SCHED_MC enabled, if a 'cpu_exclusive'
 * cpuset is defined where that cpuset's 'cpus' are not on package
 * boundaries then the kernel will freeze, usually as soon as this
 * cpuset is created, requiring a hardware reset.
 *
 * A cpuset's 'cpus' are not on package boundaries if the cpuset
 * includes a proper non-empty subset (some, but not all) of the
 * logical cpus on a processor package. This requires multiple
 * logical CPUs per package, available with either Hyper-Threading or
 * Multi-Core support. Without one of these features, there is only
 * one logical CPU per physical package, and it's not possible to
 * have a proper, non-empty subset of a set of cardinality one.
 *
 * SUSE SLES10 kernels, as first released, only enable CONFIG_SCHED_MC
 * on i386 and x86_64 architectures.
 *
 * The objective of this routine cpuset_would_crash_kernel() is to
 * determine if a proposed cpuset setting would crash the kernel due
 * to this bug, so that the caller can avoid the crash.
 *
 * Ideally we'd check for exactly these conditions here, but computing
 * the package (identified by the 'physical id' field of /proc/cpuinfo)
 * of each cpu in a cpuset is more effort than it's worth here.
 *
 * Also there is no obvious way to identify exactly whether the kernel
 * one is executing on has this bug, short of trying it, and seeing
 * if the kernel just crashed.
 *
 * So for now, we look for a simpler set of conditions, that meets
 * our immediate need - avoid this crash on SUSE SLES10 systems that
 * are susceptible to it. We look for the kernel version 2.6.16.*,
 * which is the base kernel of SUSE SLES10, and for i386 or x86_64
 * processors, which had CONFIG_SCHED_MC enabled.
 *
 * If these simpler conditions are met, we further simplify the check,
 * by presuming that the logical CPUs are numbered on processor
 * package boundaries. If each package has S siblings, we assume
 * that CPUs numbered N through N + S - 1 are on the same package,
 * for any CPU N such that N mod S == 0.
 *
 * Yes, this is a hack, focused on avoiding kernel freezes on
 * susceptible SUSE SLES10 systems.
 */
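
/*
 * Worked example of the presumption above (hypothetical numbers):
 * with S == 2 siblings per package, CPUs {0,1} and {2,3} are assumed
 * to be packages. A cpuset whose 'cpus' are 0-3 covers both packages
 * completely and passes, while 'cpus' 1-2 takes half of each package
 * and is reported as a would-be crash.
 */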

static int cpuset_would_crash_kernel(const struct cpuset *cp)
{
	static int susceptible_system = -1;

	if (!cp->cpu_exclusive)
		goto ok;

	if (susceptible_system == -1) {
		struct utsname u;
		int rel_2_6_16, arch_i386, arch_x86_64;

		if (uname(&u) < 0)
			goto fail;
		rel_2_6_16 = strprefix(u.release, "2.6.16.");
		arch_i386 = streq(u.machine, "i386");
		arch_x86_64 = streq(u.machine, "x86_64");
		susceptible_system = rel_2_6_16 && (arch_i386 || arch_x86_64);
	}

	if (susceptible_system) {
		int ncpus = cpuset_cpus_nbits();
		int siblings = get_siblings();
		unsigned int cpu;

		for (cpu = 0; cpu < (unsigned int)ncpus; cpu += siblings) {
			int s, num_set = 0;

			for (s = 0; s < siblings; s++) {
				if (bitmask_isbitset(cp->cpus, cpu + s))
					num_set++;
			}

			/* If none or all siblings set, we're still ok */
			if (num_set == 0 || num_set == siblings)
				continue;

			/* Found one that would crash kernel. Fail. */
			errno = ENXIO;
			goto fail;
		}
	}
	/* If not susceptible, or if all ok, fall into "ok" ... */
ok:
	return 0;		/* would not crash */
fail:
	return 1;		/* would crash */
}

/* Compare two cpusets and mark the dirty variables */
static void mark_dirty_variable(struct cpuset *cp1, const struct cpuset *cp2)
{
	if (cp1->cpu_exclusive_valid &&
	    cp1->cpu_exclusive != cp2->cpu_exclusive)
		cp1->cpu_exclusive_dirty = 1;

	if (cp1->mem_exclusive_valid &&
	    cp1->mem_exclusive != cp2->mem_exclusive)
		cp1->mem_exclusive_dirty = 1;

	if (cp1->mem_hardwall_valid && cp1->mem_hardwall != cp2->mem_hardwall)
		cp1->mem_hardwall_dirty = 1;

	if (cp1->notify_on_release_valid &&
	    cp1->notify_on_release != cp2->notify_on_release)
		cp1->notify_on_release_dirty = 1;

	if (cp1->memory_migrate_valid &&
	    cp1->memory_migrate != cp2->memory_migrate)
		cp1->memory_migrate_dirty = 1;

	if (cp1->memory_pressure_enabled_valid &&
	    cp1->memory_pressure_enabled != cp2->memory_pressure_enabled)
		cp1->memory_pressure_enabled_dirty = 1;

	if (cp1->memory_spread_page_valid &&
	    cp1->memory_spread_page != cp2->memory_spread_page)
		cp1->memory_spread_page_dirty = 1;

	if (cp1->memory_spread_slab_valid &&
	    cp1->memory_spread_slab != cp2->memory_spread_slab)
		cp1->memory_spread_slab_dirty = 1;

	if (cp1->sched_load_balance_valid &&
	    cp1->sched_load_balance != cp2->sched_load_balance)
		cp1->sched_load_balance_dirty = 1;

	if (cp1->sched_relax_domain_level_valid &&
	    cp1->sched_relax_domain_level != cp2->sched_relax_domain_level)
		cp1->sched_relax_domain_level_dirty = 1;

	if (cp1->cpus_valid && !bitmask_equal(cp1->cpus, cp2->cpus))
		cp1->cpus_dirty = 1;
	if (cp1->mems_valid && !bitmask_equal(cp1->mems, cp2->mems))
		cp1->mems_dirty = 1;
}

/* Create (if new set) or modify cpuset 'cp' at location 'relpath' */
static int cr_or_mod(const char *relpath, const struct cpuset *cp, int new)
{
	char buf[PATH_MAX];
	int do_rmdir_on_err = 0;
	int do_restore_cp_sav_on_err = 0;
	struct cpuset *cp_sav = NULL;
	int sav_errno;

	if (check() < 0)
		goto err;

	if (cpuset_would_crash_kernel(cp))
		goto err;

	fullpath(buf, sizeof(buf), relpath);

	if (new) {
		if (mkdir(buf, 0755) < 0)
			goto err;
		/* we made it, so we should remove it on error */
		do_rmdir_on_err = 1;
	}

	if ((cp_sav = cpuset_alloc()) == NULL)
		goto err;
	if (cpuset_query(cp_sav, relpath) < 0)
		goto err;
	/* we have old settings to restore on error */
	do_restore_cp_sav_on_err = 1;

	/* check which variables need to be restored on error */
	mark_dirty_variable(cp_sav, cp);

	if (apply_cpuset_settings(buf, cp) < 0)
		goto err;

	cpuset_free(cp_sav);
	return 0;
err:
	sav_errno = errno;
	if (do_restore_cp_sav_on_err)
		(void)apply_cpuset_settings(buf, cp_sav);
	if (cp_sav)
		cpuset_free(cp_sav);
	if (do_rmdir_on_err)
		(void)rmdir(buf);
	errno = sav_errno;
	return -1;
}

/* Create cpuset 'cp' at location 'relpath' */
int cpuset_create(const char *relpath, const struct cpuset *cp)
{
	return cr_or_mod(relpath, cp, 1);
}

/* Delete cpuset at location 'relpath' (if empty) */
int cpuset_delete(const char *relpath)
{
	char buf[PATH_MAX];

	if (check() < 0)
		goto err;

	fullpath(buf, sizeof(buf), relpath);
	if (rmdir(buf) < 0)
		goto err;

	return 0;
err:
	return -1;
}

/* Set cpuset cp to the cpuset at location 'relpath' */
int cpuset_query(struct cpuset *cp, const char *relpath)
{
	char buf[PATH_MAX];

	if (check() < 0)
		goto err;

	fullpath(buf, sizeof(buf), relpath);

	if (load_flag(buf, &cp->cpu_exclusive, "cpuset.cpu_exclusive") < 0)
		goto err;
	cp->cpu_exclusive_valid = 1;

	if (load_flag(buf, &cp->mem_exclusive, "cpuset.mem_exclusive") < 0)
		goto err;
	cp->mem_exclusive_valid = 1;

	if (load_flag(buf, &cp->notify_on_release, "notify_on_release") < 0)
		goto err;
	cp->notify_on_release_valid = 1;

	if (exists_flag(buf, "cpuset.memory_migrate")) {
		if (load_flag(buf, &cp->memory_migrate,
			      "cpuset.memory_migrate") < 0)
			goto err;
		cp->memory_migrate_valid = 1;
	}

	if (exists_flag(buf, "cpuset.mem_hardwall")) {
		if (load_flag(buf, &cp->mem_hardwall,
			      "cpuset.mem_hardwall") < 0)
			goto err;
		cp->mem_hardwall_valid = 1;
	}

	if (exists_flag(buf, "cpuset.memory_pressure_enabled")) {
		if (load_flag(buf, &cp->memory_pressure_enabled,
			      "cpuset.memory_pressure_enabled") < 0)
			goto err;
		cp->memory_pressure_enabled_valid = 1;
	}

	if (exists_flag(buf, "cpuset.memory_spread_page")) {
		if (load_flag(buf, &cp->memory_spread_page,
			      "cpuset.memory_spread_page") < 0)
			goto err;
		cp->memory_spread_page_valid = 1;
	}

	if (exists_flag(buf, "cpuset.memory_spread_slab")) {
		if (load_flag(buf, &cp->memory_spread_slab,
			      "cpuset.memory_spread_slab") < 0)
			goto err;
		cp->memory_spread_slab_valid = 1;
	}

	if (exists_flag(buf, "cpuset.sched_load_balance")) {
		if (load_flag(buf, &cp->sched_load_balance,
			      "cpuset.sched_load_balance") < 0)
			goto err;
		cp->sched_load_balance_valid = 1;
	}

	if (exists_flag(buf, "cpuset.sched_relax_domain_level")) {
		if (load_number(buf, &cp->sched_relax_domain_level,
				"cpuset.sched_relax_domain_level") < 0)
			goto err;
		cp->sched_relax_domain_level_valid = 1;
	}

	if (load_mask(buf, &cp->cpus, cpuset_cpus_nbits(), "cpuset.cpus") < 0)
		goto err;
	cp->cpus_valid = 1;

	if (load_mask(buf, &cp->mems, cpuset_mems_nbits(), "cpuset.mems") < 0)
		goto err;
	cp->mems_valid = 1;

	return 0;
err:
	return -1;
}

/* Modify cpuset at location 'relpath' to values of 'cp' */
int cpuset_modify(const char *relpath, const struct cpuset *cp)
{
	return cr_or_mod(relpath, cp, 0);
}
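
/*
 * Illustrative usage sketch (hypothetical path and values; error
 * handling abbreviated):
 *
 *	struct cpuset *cp = cpuset_alloc();
 *	struct bitmask *cpus = bitmask_alloc(cpuset_cpus_nbits());
 *	struct bitmask *mems = bitmask_alloc(cpuset_mems_nbits());
 *
 *	bitmask_parselist("0-1", cpus);
 *	bitmask_parselist("0", mems);
 *	cpuset_setcpus(cp, cpus);
 *	cpuset_setmems(cp, mems);
 *	if (cpuset_create("/testset", cp) < 0)
 *		perror("cpuset_create");
 *	bitmask_free(cpus);
 *	bitmask_free(mems);
 *	cpuset_free(cp);
 */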

/* Get cpuset path of pid into buf */
char *cpuset_getcpusetpath(pid_t pid, char *buf, size_t size)
{
	int fd;			/* dual use: cpuset file for pid and self */
	int rc;			/* dual use: snprintf and read return codes */

	if (check() < 0)
		return NULL;

	/* borrow result buf[] to build cpuset file path */
	if (pid == 0)
		rc = snprintf(buf, size, "/proc/self/cpuset");
	else
		rc = snprintf(buf, size, "/proc/%d/cpuset", pid);
	if (rc >= (int)size) {
		errno = E2BIG;
		return NULL;
	}
	if ((fd = open(buf, O_RDONLY)) < 0) {
		int e = errno;
		if (e == ENOENT)
			e = ESRCH;
		if ((fd = open("/proc/self/cpuset", O_RDONLY)) < 0)
			e = ENOSYS;
		else
			close(fd);
		errno = e;
		return NULL;
	}
	rc = read(fd, buf, size);
	close(fd);
	if (rc < 0)
		return NULL;
	if (rc >= (int)size) {
		errno = E2BIG;
		return NULL;
	}
	buf[rc] = 0;
	chomp(buf);
	return buf;
}
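
/*
 * Illustrative usage sketch (error handling abbreviated):
 *
 *	char path[PATH_MAX];
 *
 *	if (cpuset_getcpusetpath(0, path, sizeof(path)) != NULL)
 *		printf("current cpuset: %s\n", path);
 */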

/* Get cpuset 'cp' of pid */
int cpuset_cpusetofpid(struct cpuset *cp, pid_t pid)
{
	char buf[PATH_MAX];

	if (cpuset_getcpusetpath(pid, buf, sizeof(buf)) == NULL)
		return -1;
	if (cpuset_query(cp, buf) < 0)
		return -1;
	return 0;
}

/* [optional] Return mountpoint of cpuset filesystem */
const char *cpuset_mountpoint(void)
{
	if (check() < 0) {
		switch (errno) {
		case ENODEV:
			return "[cpuset filesystem not mounted]";
		default:
			return "[cpuset filesystem not supported]";
		}
	}
	return cpusetmnt;
}

/* Return true if path is a directory. */
static int isdir(const char *path)
{
	struct stat statbuf;

	if (stat(path, &statbuf) < 0)
		return 0;
	return S_ISDIR(statbuf.st_mode);
}

/*
 * [optional] cpuset_collides_exclusive() - True if would collide exclusive.
 *
 * Return true iff the specified cpuset would overlap with any
 * sibling cpusets in either cpus or mems, where either this
 * cpuset or the sibling is cpu_exclusive or mem_exclusive.
 *
 * cpuset_create() fails with errno == EINVAL if the requested cpuset
 * would overlap with any sibling, where either one is cpu_exclusive or
 * mem_exclusive. This is a common, and not obvious, error. The
 * following routine checks for this particular case, so that code
 * creating cpusets can better identify the situation, perhaps to issue
 * a more informative error message.
 *
 * Can also be used to diagnose cpuset_modify failures. This
 * routine ignores any existing cpuset with the same path as the
 * given 'cpusetpath', and only looks for exclusive collisions with
 * sibling cpusets of that path.
 *
 * In case of any error, returns (0) -- does not collide. Presumably
 * any actual attempt to create or modify a cpuset will encounter the
 * same error, and report it usefully.
 *
 * This routine is not particularly efficient; most likely code creating or
 * modifying a cpuset will want to try the operation first, and then if that
 * fails with errno EINVAL, perhaps call this routine to determine if an
 * exclusive cpuset collision caused the error.
 */

int cpuset_collides_exclusive(const char *cpusetpath, const struct cpuset *cp1)
{
	char parent[PATH_MAX];
	char *p;
	char *pathcopy = NULL;
	char *base;
	DIR *dir = NULL;
	struct dirent *dent;
	struct cpuset *cp2 = NULL;
	struct bitmask *cpus1 = NULL, *cpus2 = NULL;
	struct bitmask *mems1 = NULL, *mems2 = NULL;
	int ret;

	if (check() < 0)
		goto err;

	fullpath(parent, sizeof(parent), cpusetpath);
	if (streq(parent, cpusetmnt))
		goto err;	/* only one cpuset root - can't collide */
	pathcopy = strdup(parent);
	p = strrchr(parent, '/');
	if (!p)
		goto err;	/* huh? - impossible - run and hide */
	*p = 0;			/* now parent is dirname of fullpath */

	p = strrchr(pathcopy, '/');
	base = p + 1;		/* now base is basename of fullpath */
	if (!*base)
		goto err;	/* this is also impossible - run away */

	if ((dir = opendir(parent)) == NULL)
		goto err;
	if ((cp2 = cpuset_alloc()) == NULL)
		goto err;
	if ((cpus1 = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
		goto err;
	if ((cpus2 = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
		goto err;
	if ((mems1 = bitmask_alloc(cpuset_mems_nbits())) == NULL)
		goto err;
	if ((mems2 = bitmask_alloc(cpuset_mems_nbits())) == NULL)
		goto err;

	while ((dent = readdir(dir)) != NULL) {
		char child[PATH_MAX];

		if (streq(dent->d_name, ".") || streq(dent->d_name, ".."))
			continue;
		if (streq(dent->d_name, base))
			continue;
		pathcat2(child, sizeof(child), parent, dent->d_name);
		if (!isdir(child))
			continue;
		if (cpuset_query(cp2, child + strlen(cpusetmnt)) < 0)
			goto err;
		if (cp1->cpu_exclusive || cp2->cpu_exclusive) {
			cpuset_getcpus(cp1, cpus1);
			cpuset_getcpus(cp2, cpus2);
			if (bitmask_intersects(cpus1, cpus2))
				goto collides;
		}
		if (cp1->mem_exclusive || cp2->mem_exclusive) {
			cpuset_getmems(cp1, mems1);
			cpuset_getmems(cp2, mems2);
			if (bitmask_intersects(mems1, mems2))
				goto collides;
		}
	}
err:
	/* error, or did not collide */
	ret = 0;
	goto done;
collides:
	/* collides */
	ret = 1;
	/* fall into ... */
done:
	if (dir)
		closedir(dir);
	cpuset_free(cp2);
	free(pathcopy);
	bitmask_free(cpus1);
	bitmask_free(cpus2);
	bitmask_free(mems1);
	bitmask_free(mems2);
	return ret;
}
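
/*
 * Illustrative usage sketch (hypothetical path "/testset"):
 *
 *	if (cpuset_create("/testset", cp) < 0 && errno == EINVAL &&
 *	    cpuset_collides_exclusive("/testset", cp))
 *		fprintf(stderr, "collides with an exclusive sibling\n");
 */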

/*
 * [optional] cpuset_nuke() - Remove cpuset any way possible
 *
 * Remove a cpuset, including killing tasks in it, and
 * removing any descendant cpusets and killing their tasks.
 *
 * Tasks can take a long time (minutes on some configurations)
 * to exit. Loop up to 'seconds' seconds, trying to kill them.
 *
 * How we do it:
 *	1) First, kill all the pids, looping until there are
 *	   no more pids in this cpuset or below, or until the
 *	   'seconds' timeout limit is exceeded.
 *	2) Then depth first recursively rmdir the cpuset directories.
 *	3) If by this point the original cpuset is gone, we succeeded.
 *
 * If the timeout is exceeded, and tasks still exist, fail with
 * errno == ETIME.
 *
 * We sleep a variable amount of time. After the first attempt to
 * kill all the tasks in the cpuset or its descendants, we sleep 1
 * second, the next time 2 seconds, increasing 1 second each loop
 * up to a max of 10 seconds. If more loops past 10 are required
 * to kill all the tasks, we sleep 10 seconds each subsequent loop.
 * In any case, before the last loop, we sleep however many seconds
 * remain of the original timeout 'seconds' requested. The total
 * time of all sleeps will be no more than the requested 'seconds'.
 *
 * If the cpuset started out empty of any tasks, or if the passed in
 * 'seconds' was zero, then this routine will return quickly, having
 * not slept at all. Otherwise, this routine will at a minimum send
 * a SIGKILL to all the tasks in this cpuset subtree, then sleep one
 * second, before looking to see if any tasks remain. If tasks remain
 * in the cpuset subtree, and a longer 'seconds' timeout was requested
 * (more than one), it will continue to kill remaining tasks and sleep,
 * in a loop, for as long as time and tasks remain.
 *
 * The signal sent for the kill is hardcoded to SIGKILL (9). If some
 * other signal should be sent first, use a separate code loop,
 * perhaps based on cpuset_init_pidlist and cpuset_get_pidlist, to
 * scan the task pids in a cpuset. If SIGKILL should -not- be sent,
 * this cpuset_nuke() routine can still be called to recursively
 * remove a cpuset subtree, by specifying a timeout of zero 'seconds'.
 *
 * On success, returns 0 with errno == 0.
 *
 * On failure, returns -1, with errno possibly one of:
 *	EACCES - search permission denied on intervening directory
 *	ETIME - timed out - tasks remain after 'seconds' timeout
 *	EMFILE - too many open files
 *	ENODEV - /dev/cpuset not mounted
 *	ENOENT - component of cpuset path doesn't exist
 *	ENOMEM - out of memory
 *	ENOSYS - kernel doesn't support cpusets
 *	ENOTDIR - component of cpuset path is not a directory
 *	EPERM - lacked permission to kill a task
 *	EPERM - lacked permission to read cpusets or files therein
 */

void cpuset_fts_reverse(struct cpuset_fts_tree *cs_tree);

int cpuset_nuke(const char *relpath, unsigned int seconds)
{
	unsigned int secs_left = seconds;	/* total sleep seconds left */
	unsigned int secs_loop = 1;		/* how much sleep next loop */
	unsigned int secs_slept;		/* seconds slept in sleep() */
	struct cpuset_pidlist *pl = NULL;	/* pids in cpuset subtree */
	struct cpuset_fts_tree *cs_tree;
	const struct cpuset_fts_entry *cs_entry;
	int ret, sav_errno = 0;

	if (check() < 0)
		return -1;

	if (seconds == 0)
		goto rmdir_cpusets;

	while (1) {
		int plen, j;

		if ((pl = cpuset_init_pidlist(relpath, 1)) == NULL) {
			/* missing cpuset is as good as if already nuked */
			if (errno == ENOENT) {
				ret = 0;
				goto no_more_cpuset;
			}

			/* other problems reading cpuset are bad news */
			sav_errno = errno;
			goto failed;
		}

		if ((plen = cpuset_pidlist_length(pl)) == 0)
			goto rmdir_cpusets;

		for (j = 0; j < plen; j++) {
			pid_t pid;

			if ((pid = cpuset_get_pidlist(pl, j)) > 1) {
				if (kill(pid, SIGKILL) < 0 && errno != ESRCH) {
					sav_errno = errno;
					goto failed;
				}
			}
		}

		if (secs_left == 0)
			goto took_too_long;

		cpuset_freepidlist(pl);
		pl = NULL;

		secs_slept = secs_loop - sleep(secs_loop);

		/* Ensure forward progress */
		if (secs_slept == 0)
			secs_slept = 1;

		/* Ensure sane sleep() return (unnecessary?) */
		if (secs_slept > secs_loop)
			secs_slept = secs_loop;

		secs_left -= secs_slept;

		if (secs_loop < 10)
			secs_loop++;

		secs_loop = MIN(secs_left, secs_loop);
	}

took_too_long:
	sav_errno = ETIME;
	/* fall into ... */
failed:
	cpuset_freepidlist(pl);
	errno = sav_errno;
	return -1;

rmdir_cpusets:
	/* Let's try removing cpuset(s) now. */
	cpuset_freepidlist(pl);

	if ((cs_tree = cpuset_fts_open(relpath)) == NULL) {
		/* missing cpuset is as good as if already nuked */
		if (errno == ENOENT) {
			ret = 0;
			goto no_more_cpuset;
		}
		return -1;
	}
	ret = 0;
	cpuset_fts_reverse(cs_tree);	/* rmdir's must be done bottom up */
	while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL) {
		char buf[PATH_MAX];

		fullpath(buf, sizeof(buf), cpuset_fts_get_path(cs_entry));
		if (rmdir(buf) < 0 && errno != ENOENT) {
			sav_errno = errno;
			ret = -1;
		}
	}
	cpuset_fts_close(cs_tree);
	/* fall into ... */
no_more_cpuset:
	if (ret == 0)
		errno = 0;
	else
		errno = sav_errno;
	return ret;
}
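
/*
 * Illustrative usage sketch (hypothetical path and timeout):
 *
 *	if (cpuset_nuke("/testset", 30) < 0 && errno == ETIME)
 *		fprintf(stderr, "tasks survived a 30 second timeout\n");
 */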

/*
 * When recursively reading all the tasks files from a subtree,
 * chain together the read results, one pidblock per tasks file,
 * containing the raw unprocessed ASCII as read(2) in. After
 * we gather up this raw data, we then go back to count how
 * many pids there are in total, allocate an array of pid_t
 * of that size, and transform the raw ASCII data into this
 * array of pid_t's.
 */

struct pidblock {
	char *buf;
	int buflen;
	struct pidblock *next;
};

/*
 * Chain the raw contents of a file onto the pbhead list.
 *
 * We malloc "+ 1" extra byte for a nul-terminator, so that
 * the strtoul() loop in pid_transform() won't scan past
 * the end of pb->buf[] and accidentally find more pids.
 */
static void add_pidblock(const char *file, struct pidblock **ppbhead)
{
	FILE *fp = NULL;
	struct pidblock *pb = NULL;
	int fsz;

	if ((fp = fopen(file, "r")) == NULL)
		goto err;
	fsz = filesize(fp);
	if (fsz == 0)
		goto err;
	if ((pb = calloc(1, sizeof(*pb))) == NULL)
		goto err;
	pb->buflen = fsz;
	if ((pb->buf = malloc(pb->buflen + 1)) == NULL)
		goto err;
	if (fread(pb->buf, 1, pb->buflen, fp) > 0) {
		pb->buf[pb->buflen] = '\0';
		pb->next = *ppbhead;
		*ppbhead = pb;
	} else {
		/* nothing read - don't leak the unchained block */
		free(pb->buf);
		free(pb);
	}
	fclose(fp);
	return;
err:
	if (fp)
		fclose(fp);
	free(pb);
}

static void read_task_file(const char *relpath, struct pidblock **ppbhead)
{
	char buf[PATH_MAX];

	fullpath2(buf, sizeof(buf), relpath, "tasks");
	add_pidblock(buf, ppbhead);
}

struct cpuset_pidlist {
	pid_t *pids;
	int npids;
};

/* Count how many pids in buf (one per line - just count newlines) */
static int pidcount(const char *buf, int buflen)
{
	int n = 0;
	const char *cp;

	for (cp = buf; cp < buf + buflen; cp++) {
		if (*cp == '\n')
			n++;
	}
	return n;
}

/* Transform one-per-line ASCII pids in pb to pid_t entries in pl */
static int pid_transform(struct pidblock *pb, struct cpuset_pidlist *pl, int n)
{
	char *a, *b;

	for (a = pb->buf; a < pb->buf + pb->buflen; a = b) {
		pid_t p = strtoul(a, &b, 10);
		if (a == b)
			break;
		pl->pids[n++] = p;
	}
	return n;
}

static void free_pidblocks(struct pidblock *pbhead)
{
	struct pidblock *pb, *nextpb;

	for (pb = pbhead; pb; pb = nextpb) {
		nextpb = pb->next;
		free(pb->buf);
		free(pb);
	}
}

/* numeric comparison routine for qsort */
static int numericsort(const void *m1, const void *m2)
{
	pid_t p1 = *(pid_t *) m1;
	pid_t p2 = *(pid_t *) m2;

	return p1 - p2;
}

/* Return list of pids in cpuset 'relpath' */
struct cpuset_pidlist *cpuset_init_pidlist(const char *relpath,
					   int recursiveflag)
{
	struct pidblock *pb = NULL;
	struct cpuset_pidlist *pl = NULL;
	struct pidblock *pbhead = NULL;
	int n;

	if (check() < 0)
		goto err;

	if (recursiveflag) {
		struct cpuset_fts_tree *cs_tree;
		const struct cpuset_fts_entry *cs_entry;

		if ((cs_tree = cpuset_fts_open(relpath)) == NULL)
			goto err;
		while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL) {
			if (cpuset_fts_get_info(cs_entry) != CPUSET_FTS_CPUSET)
				continue;
			read_task_file(cpuset_fts_get_path(cs_entry), &pbhead);
		}
		cpuset_fts_close(cs_tree);
	} else {
		read_task_file(relpath, &pbhead);
	}

	if ((pl = calloc(1, sizeof(*pl))) == NULL)
		goto err;
	pl->npids = 0;
	for (pb = pbhead; pb; pb = pb->next)
		pl->npids += pidcount(pb->buf, pb->buflen);
	if ((pl->pids = calloc(pl->npids, sizeof(pid_t))) == NULL)
		goto err;
	n = 0;
	for (pb = pbhead; pb; pb = pb->next)
		n = pid_transform(pb, pl, n);
	free_pidblocks(pbhead);
	qsort(pl->pids, pl->npids, sizeof(pid_t), numericsort);
	return pl;
err:
	cpuset_freepidlist(pl);
	free_pidblocks(pbhead);
	return NULL;
}

/* Return number of elements in pidlist */
int cpuset_pidlist_length(const struct cpuset_pidlist *pl)
{
	if (pl)
		return pl->npids;
	else
		return 0;
}

/* Return i'th element of pidlist */
pid_t cpuset_get_pidlist(const struct cpuset_pidlist *pl, int i)
{
	if (pl && i >= 0 && i < pl->npids)
		return pl->pids[i];
	else
		return (pid_t)-1;
}

/* Free pidlist */
void cpuset_freepidlist(struct cpuset_pidlist *pl)
{
	if (pl && pl->pids)
		free(pl->pids);
	free(pl);
}
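
/*
 * Illustrative usage sketch - list every pid in the cpuset hierarchy
 * (error handling abbreviated):
 *
 *	struct cpuset_pidlist *pl;
 *	int i;
 *
 *	if ((pl = cpuset_init_pidlist("/", 1)) != NULL) {
 *		for (i = 0; i < cpuset_pidlist_length(pl); i++)
 *			printf("%d\n", (int)cpuset_get_pidlist(pl, i));
 *		cpuset_freepidlist(pl);
 *	}
 */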

static int __cpuset_move(pid_t pid, const char *path)
{
	char buf[SMALL_BUFSZ];

	snprintf(buf, sizeof(buf), "%u", pid);
	return write_string_file(path, buf);
}

/* Move task (pid == 0 for current) to a cpuset */
int cpuset_move(pid_t pid, const char *relpath)
{
	char buf[PATH_MAX];

	if (check() < 0)
		return -1;

	if (pid == 0)
		pid = getpid();

	fullpath2(buf, sizeof(buf), relpath, "tasks");
	return __cpuset_move(pid, buf);
}

/* Move all tasks in pidlist to a cpuset */
int cpuset_move_all(struct cpuset_pidlist *pl, const char *relpath)
{
	int i;
	char buf[PATH_MAX];
	int ret;

	if (check() < 0)
		return -1;

	fullpath2(buf, sizeof(buf), relpath, "tasks");

	ret = 0;
	for (i = 0; i < pl->npids; i++)
		if (__cpuset_move(pl->pids[i], buf) < 0)
			ret = -1;
	return ret;
}

/*
 * [optional] cpuset_move_cpuset_tasks() - Move all tasks in a
 *                                         cpuset to another cpuset
 *
 * Move all tasks in cpuset fromrelpath to cpuset torelpath. This may
 * race with tasks being added to or forking into fromrelpath. Loop
 * repeatedly, reading the tasks file of cpuset fromrelpath and writing
 * any task pids found there to the tasks file of cpuset torelpath,
 * up to ten attempts, or until the tasks file of cpuset fromrelpath
 * is empty, or until fromrelpath is no longer present.
 *
 * Returns 0 with errno == 0 if able to empty the tasks file of cpuset
 * fromrelpath. Of course it is still possible that some independent
 * task could add another task to cpuset fromrelpath at the same time
 * that such a successful result is being returned, so there can be
 * no guarantee that a successful return means that fromrelpath is
 * still empty of tasks.
 *
 * We are careful to allow for the possibility that the cpuset
 * fromrelpath might disappear out from under us, perhaps because it
 * has notify_on_release set and gets automatically removed as soon
 * as we detach its last task from it. Consider a missing fromrelpath
 * to be a successful move.
 *
 * If called with fromrelpath and torelpath pathnames that evaluate to
 * the same cpuset, then treat that as if cpuset_reattach() was called,
 * rebinding each task in this cpuset one time, and return success or
 * failure depending on the return of that cpuset_reattach() call.
 *
 * On failure, returns -1, with errno possibly one of:
 *	EACCES - search permission denied on intervening directory
 *	ENOTEMPTY - tasks remain after multiple attempts to move them
 *	EMFILE - too many open files
 *	ENODEV - /dev/cpuset not mounted
 *	ENOENT - component of cpuset path doesn't exist
 *	ENOMEM - out of memory
 *	ENOSYS - kernel doesn't support cpusets
 *	ENOTDIR - component of cpuset path is not a directory
 *	EPERM - lacked permission to kill a task
 *	EPERM - lacked permission to read cpusets or files therein
 *
 * This is an [optional] function. Use cpuset_function to invoke it.
 */

#define NUMBER_MOVE_TASK_ATTEMPTS 10

int cpuset_move_cpuset_tasks(const char *fromrelpath, const char *torelpath)
{
	char fromfullpath[PATH_MAX];
	char tofullpath[PATH_MAX];
	int i;
	struct cpuset_pidlist *pl = NULL;
	int sav_errno;

	fullpath(fromfullpath, sizeof(fromfullpath), fromrelpath);
	fullpath(tofullpath, sizeof(tofullpath), torelpath);

	if (samefile(fromfullpath, tofullpath))
		return cpuset_reattach(fromrelpath);

	for (i = 0; i < NUMBER_MOVE_TASK_ATTEMPTS; i++) {
		int plen, j;

		if ((pl = cpuset_init_pidlist(fromrelpath, 0)) == NULL) {
			/* missing cpuset is as good as if all moved */
			if (errno == ENOENT)
				goto no_more_cpuset;

			/* other problems reading cpuset are bad news */
			sav_errno = errno;
			goto failed;
		}

		if ((plen = cpuset_pidlist_length(pl)) == 0)
			goto no_more_pids;

		for (j = 0; j < plen; j++) {
			pid_t pid;

			pid = cpuset_get_pidlist(pl, j);
			if (cpuset_move(pid, torelpath) < 0) {
				/* missing task is as good as if moved */
				if (errno == ESRCH)
					continue;

				/* other per-task errors are bad news */
				sav_errno = errno;
				goto failed;
			}
		}

		cpuset_freepidlist(pl);
		pl = NULL;
	}

	sav_errno = ENOTEMPTY;
	/* fall into ... */
failed:
	cpuset_freepidlist(pl);
	errno = sav_errno;
	return -1;

no_more_pids:
no_more_cpuset:
	/* Success - all tasks (or entire cpuset ;) gone. */
	cpuset_freepidlist(pl);
	errno = 0;
	return 0;
}
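
/*
 * Illustrative usage sketch (hypothetical paths) - drain a cpuset
 * into the root cpuset before deleting it:
 *
 *	if (cpuset_move_cpuset_tasks("/testset", "/") == 0)
 *		cpuset_delete("/testset");
 */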

/* Migrate task (pid == 0 for current) to a cpuset (moves task and memory) */
int cpuset_migrate(pid_t pid, const char *relpath)
{
	char buf[PATH_MAX];
	char buf2[PATH_MAX];
	char memory_migrate_flag;
	int r;

	if (check() < 0)
		return -1;

	if (pid == 0)
		pid = getpid();

	fullpath(buf2, sizeof(buf2), relpath);

	if (load_flag(buf2, &memory_migrate_flag, "memory_migrate") < 0)
		return -1;
	if (store_flag(buf2, "memory_migrate", 1) < 0)
		return -1;

	fullpath2(buf, sizeof(buf), relpath, "tasks");

	r = __cpuset_move(pid, buf);

	store_flag(buf2, "memory_migrate", memory_migrate_flag);
	return r;
}

/* Migrate all tasks in pidlist to a cpuset (moves task and memory) */
int cpuset_migrate_all(struct cpuset_pidlist *pl, const char *relpath)
{
	int i;
	char buf[PATH_MAX];
	char buf2[PATH_MAX];
	char memory_migrate_flag;
	int ret;

	if (check() < 0)
		return -1;

	fullpath(buf2, sizeof(buf2), relpath);

	if (load_flag(buf2, &memory_migrate_flag, "memory_migrate") < 0)
		return -1;
	if (store_flag(buf2, "memory_migrate", 1) < 0)
		return -1;

	fullpath2(buf, sizeof(buf), relpath, "tasks");

	ret = 0;
	for (i = 0; i < pl->npids; i++)
		if (__cpuset_move(pl->pids[i], buf) < 0)
			ret = -1;

	if (store_flag(buf2, "memory_migrate", memory_migrate_flag) < 0)
		ret = -1;
	return ret;
}

/* Rebind cpus_allowed of each task in cpuset 'relpath' */
int cpuset_reattach(const char *relpath)
{
	struct cpuset_pidlist *pl;
	int rc;

	if ((pl = cpuset_init_pidlist(relpath, 0)) == NULL)
		return -1;
	rc = cpuset_move_all(pl, relpath);
	cpuset_freepidlist(pl);
	return rc;
}

/* Map cpuset relative cpu number to system wide cpu number */
int cpuset_c_rel_to_sys_cpu(const struct cpuset *cp, int cpu)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
	int pos = -1;

	if (!cp1)
		goto err;
	pos = bitmask_rel_to_abs_pos(cp1->cpus, cpu);
	/* fall into ... */
err:
	cpuset_free(cp_tofree);
	return pos;
}

/* Map system wide cpu number to cpuset relative cpu number */
int cpuset_c_sys_to_rel_cpu(const struct cpuset *cp, int cpu)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
	int pos = -1;

	if (!cp1)
		goto err;
	pos = bitmask_abs_to_rel_pos(cp1->cpus, cpu);
	/* fall into ... */
err:
	cpuset_free(cp_tofree);
	return pos;
}

/* Map cpuset relative mem number to system wide mem number */
int cpuset_c_rel_to_sys_mem(const struct cpuset *cp, int mem)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
	int pos = -1;

	if (!cp1)
		goto err;
	pos = bitmask_rel_to_abs_pos(cp1->mems, mem);
	/* fall into ... */
err:
	cpuset_free(cp_tofree);
	return pos;
}

/* Map system wide mem number to cpuset relative mem number */
int cpuset_c_sys_to_rel_mem(const struct cpuset *cp, int mem)
{
	struct cpuset *cp_tofree = NULL;
	const struct cpuset *cp1 = resolve_cp(cp, &cp_tofree);
	int pos = -1;

	if (!cp1)
		goto err;
	pos = bitmask_abs_to_rel_pos(cp1->mems, mem);
	/* fall into ... */
err:
	cpuset_free(cp_tofree);
	return pos;
}
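
/*
 * Worked example of the relative/system numbering (hypothetical
 * values): in a cpuset whose 'cpus' are 4-7, relative cpu 2 maps to
 * system cpu 6, and system cpu 5 maps back to relative cpu 1.
 */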

/* Map pid's cpuset relative cpu number to system wide cpu number */
int cpuset_p_rel_to_sys_cpu(pid_t pid, int cpu)
{
	struct cpuset *cp;
	int rc = -1;

	if ((cp = cpuset_alloc()) == NULL)
		goto done;
	if (cpuset_cpusetofpid(cp, pid) < 0)
		goto done;
	rc = cpuset_c_rel_to_sys_cpu(cp, cpu);
done:
	cpuset_free(cp);
	return rc;
}

/* Map system wide cpu number to pid's cpuset relative cpu number */
int cpuset_p_sys_to_rel_cpu(pid_t pid, int cpu)
{
	struct cpuset *cp;
	int rc = -1;

	if ((cp = cpuset_alloc()) == NULL)
		goto done;
	if (cpuset_cpusetofpid(cp, pid) < 0)
		goto done;
	rc = cpuset_c_sys_to_rel_cpu(cp, cpu);
done:
	cpuset_free(cp);
	return rc;
}

/* Map pid's cpuset relative mem number to system wide mem number */
int cpuset_p_rel_to_sys_mem(pid_t pid, int mem)
{
	struct cpuset *cp;
	int rc = -1;

	if ((cp = cpuset_alloc()) == NULL)
		goto done;
	if (cpuset_cpusetofpid(cp, pid) < 0)
		goto done;
	rc = cpuset_c_rel_to_sys_mem(cp, mem);
done:
	cpuset_free(cp);
	return rc;
}

/* Map system wide mem number to pid's cpuset relative mem number */
int cpuset_p_sys_to_rel_mem(pid_t pid, int mem)
{
	struct cpuset *cp;
	int rc = -1;

	if ((cp = cpuset_alloc()) == NULL)
		goto done;
	if (cpuset_cpusetofpid(cp, pid) < 0)
		goto done;
	rc = cpuset_c_sys_to_rel_mem(cp, mem);
done:
	cpuset_free(cp);
	return rc;
}

/*
 * Override glibc's calls for get/set affinity - they have
 * something using cpu_set_t that will die when NR_CPUS > 1024.
 * Go directly to the 'real' system calls. Also override calls
 * for get_mempolicy and set_mempolicy. None of these
 * calls are yet (July 2004) guaranteed to be in all glibc versions
 * that we care about.
 */

static int sched_setaffinity(pid_t pid, unsigned len, unsigned long *mask)
{
	return ltp_syscall(__NR_sched_setaffinity, pid, len, mask);
}

static int get_mempolicy(int *policy, unsigned long *nmask,
			 unsigned long maxnode, void *addr, int flags)
{
	return ltp_syscall(__NR_get_mempolicy, policy, nmask, maxnode,
			   addr, flags);
}

static int set_mempolicy(int mode, unsigned long *nmask, unsigned long maxnode)
{
	return ltp_syscall(__NR_set_mempolicy, mode, nmask, maxnode);
}

struct cpuset_placement {
	struct bitmask *cpus;
	struct bitmask *mems;
	char *path;
};

/* Allocate and fill in a placement struct - captures current placement */
struct cpuset_placement *cpuset_get_placement(pid_t pid)
{
	struct cpuset_placement *plc;
	struct cpuset *cp = NULL;
	char buf[PATH_MAX];
	int nbits;

	if ((plc = calloc(1, sizeof(*plc))) == NULL)
		goto err;

	nbits = cpuset_cpus_nbits();
	if ((plc->cpus = bitmask_alloc(nbits)) == NULL)
		goto err;

	nbits = cpuset_mems_nbits();
	if ((plc->mems = bitmask_alloc(nbits)) == NULL)
		goto err;

	if ((cp = cpuset_alloc()) == NULL)
		goto err;
	if (cpuset_getcpusetpath(pid, buf, sizeof(buf)) == NULL)
		goto err;
	if (cpuset_query(cp, buf) < 0)
		goto err;

	bitmask_copy(plc->cpus, cp->cpus);
	bitmask_copy(plc->mems, cp->mems);
	plc->path = strdup(buf);

	cpuset_free(cp);
	return plc;
err:
	cpuset_free(cp);
	cpuset_free_placement(plc);
	return NULL;
}

/* Compare two placement structs - use to detect changes in placement */
int cpuset_equal_placement(const struct cpuset_placement *plc1,
			   const struct cpuset_placement *plc2)
{
	return bitmask_equal(plc1->cpus, plc2->cpus) &&
	    bitmask_equal(plc1->mems, plc2->mems) &&
	    streq(plc1->path, plc2->path);
}

/* Free a placement struct */
void cpuset_free_placement(struct cpuset_placement *plc)
{
	if (!plc)
		return;
	bitmask_free(plc->cpus);
	bitmask_free(plc->mems);
	free(plc->path);
	free(plc);
}

/*
 * A cpuset_fts_open() call constructs a linked list of entries
 * called a "cpuset_fts_tree", with one entry per cpuset below
 * the specified path. The cpuset_fts_read() routine returns the
 * next entry on this list. The various cpuset_fts_get_*() calls
 * return attributes of the specified entry. The cpuset_fts_close()
 * call frees the linked list and all associated data. All cpuset
 * entries and attributes for the cpuset_fts_tree returned from a
 * given cpuset_fts_open() call remain allocated and unchanged until
 * that cpuset_fts_tree is closed by a cpuset_fts_close() call. Any
 * subsequent changes to the cpuset filesystem will go unnoticed
 * (they do not affect open cpuset_fts_trees).
 */
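
/*
 * Illustrative usage sketch - walk every readable cpuset below the
 * root (error handling abbreviated):
 *
 *	struct cpuset_fts_tree *cs_tree;
 *	const struct cpuset_fts_entry *cs_entry;
 *
 *	if ((cs_tree = cpuset_fts_open("/")) != NULL) {
 *		while ((cs_entry = cpuset_fts_read(cs_tree)) != NULL)
 *			if (cpuset_fts_get_info(cs_entry) == CPUSET_FTS_CPUSET)
 *				puts(cpuset_fts_get_path(cs_entry));
 *		cpuset_fts_close(cs_tree);
 *	}
 */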

struct cpuset_fts_entry;
void cpuset_fts_rewind(struct cpuset_fts_tree *cs_tree);

struct cpuset_fts_tree {
	struct cpuset_fts_entry *head;	/* head of linked entry list */
	struct cpuset_fts_entry *next;	/* cpuset_fts_read() offset */
};

struct cpuset_fts_entry {
	struct cpuset_fts_entry *next;	/* linked entry list chain */
	struct cpuset *cpuset;
	struct stat *stat;
	char *path;
	int info;
	int err;
};

/* Open a handle on a cpuset hierarchy. All the real work is done here. */
struct cpuset_fts_tree *cpuset_fts_open(const char *cpusetpath)
{
	FTS *fts = NULL;
	FTSENT *ftsent;
	char *path_argv[2];
	char buf[PATH_MAX];
	struct cpuset_fts_tree *cs_tree = NULL;
	struct cpuset_fts_entry *ep;	/* the latest new list entry */
	struct cpuset_fts_entry **pnlep;	/* ptr to next list entry ptr */
	char *relpath;
	int fts_flags;

	fullpath(buf, sizeof(buf), cpusetpath);
	path_argv[0] = buf;
	path_argv[1] = NULL;

	fts_flags = FTS_PHYSICAL | FTS_NOCHDIR | FTS_NOSTAT | FTS_XDEV;
	fts = fts_open(path_argv, fts_flags, NULL);
	if (fts == NULL)
		goto err;

	cs_tree = malloc(sizeof(*cs_tree));
	if (cs_tree == NULL)
		goto err;
	pnlep = &cs_tree->head;
	*pnlep = NULL;

	while ((ftsent = fts_read(fts)) != NULL) {
		if (ftsent->fts_info != FTS_D && ftsent->fts_info != FTS_DNR)
			continue;

		/* ftsent is a directory (perhaps unreadable) ==> cpuset */
		ep = calloc(1, sizeof(*ep));
		if (ep == NULL)
			goto err;
		*pnlep = ep;
		pnlep = &ep->next;

		/* Set entry's path, and if DNR, error */
		relpath = ftsent->fts_path + strlen(cpusetmnt);
		if (strlen(relpath) == 0)
			relpath = "/";
		ep->path = strdup(relpath);
		if (ep->path == NULL)
			goto err;
		if (ftsent->fts_info == FTS_DNR) {
			ep->info = CPUSET_FTS_ERR_DNR;
			ep->err = ftsent->fts_errno;
			continue;
		}

		/* ftsent is a -readable- cpuset: set entry's stat, etc */
		ep->stat = calloc(1, sizeof(struct stat));
		if (ep->stat == NULL)
			goto err;
		if (stat(ftsent->fts_path, ep->stat) < 0) {
			ep->info = CPUSET_FTS_ERR_STAT;
			ep->err = ftsent->fts_errno;
			continue;
		}

		ep->cpuset = calloc(1, sizeof(struct cpuset));
		if (ep->cpuset == NULL)
			goto err;
		if (cpuset_query(ep->cpuset, relpath) < 0) {
			ep->info = CPUSET_FTS_ERR_CPUSET;
			ep->err = errno;
			continue;
		}
		ep->info = CPUSET_FTS_CPUSET;
	}

	(void)fts_close(fts);
	cpuset_fts_rewind(cs_tree);
	return cs_tree;

err:
	if (cs_tree)
		cpuset_fts_close(cs_tree);
	if (fts)
		(void)fts_close(fts);
	return NULL;
}

/* Return pointer to next cpuset entry in hierarchy */
const struct cpuset_fts_entry *cpuset_fts_read(struct cpuset_fts_tree *cs_tree)
{
	const struct cpuset_fts_entry *cs_entry = cs_tree->next;

	if (cs_tree->next != NULL)	/* seek to next entry */
		cs_tree->next = cs_tree->next->next;
	return cs_entry;
}

/* Reverse list of cpusets, in place. Simulates pre-order/post-order flip. */
void cpuset_fts_reverse(struct cpuset_fts_tree *cs_tree)
{
	struct cpuset_fts_entry *cs1, *cs2, *cs3;

	/*
	 * At each step, cs1 < cs2 < cs3 and the cs2->next pointer
	 * is redirected from cs3 to cs1.
	 */

	cs1 = cs2 = NULL;
	cs3 = cs_tree->head;
	while (cs3) {
		cs1 = cs2;
		cs2 = cs3;
		cs3 = cs3->next;
		cs2->next = cs1;
	}
	cs_tree->head = cs2;
	cpuset_fts_rewind(cs_tree);
}

/* Rewind cpuset list to beginning */
void cpuset_fts_rewind(struct cpuset_fts_tree *cs_tree)
{
	cs_tree->next = cs_tree->head;
}

/* Return pointer to nul-terminated cpuset path of entry in hierarchy */
const char *cpuset_fts_get_path(const struct cpuset_fts_entry *cs_entry)
{
	return cs_entry->path;
}

/* Return pointer to stat(2) structure of a cpuset entry's directory */
const struct stat *cpuset_fts_get_stat(const struct cpuset_fts_entry *cs_entry)
{
	return cs_entry->stat;
}

/* Return pointer to cpuset structure of a cpuset entry */
const struct cpuset *cpuset_fts_get_cpuset(const struct cpuset_fts_entry
					   *cs_entry)
{
	return cs_entry->cpuset;
}

/* Return value of errno (0 if no error) on attempted cpuset operations */
int cpuset_fts_get_errno(const struct cpuset_fts_entry *cs_entry)
{
	return cs_entry->err;
}

/* Return operation identity causing error */
int cpuset_fts_get_info(const struct cpuset_fts_entry *cs_entry)
{
	return cs_entry->info;
}

/* Close a cpuset hierarchy handle (frees all associated memory) */
void cpuset_fts_close(struct cpuset_fts_tree *cs_tree)
{
	struct cpuset_fts_entry *cs_entry = cs_tree->head;

	while (cs_entry) {
		struct cpuset_fts_entry *ep = cs_entry;

		cs_entry = cs_entry->next;
		free(ep->path);
		free(ep->stat);
		cpuset_free(ep->cpuset);
		free(ep);
	}
	free(cs_tree);
}

/* Bind current task to cpu (uses sched_setaffinity(2)) */
int cpuset_cpubind(int cpu)
{
	struct bitmask *bmp;
	int r;

	if ((bmp = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
		return -1;
	bitmask_setbit(bmp, cpu);
	r = sched_setaffinity(0, bitmask_nbytes(bmp), bitmask_mask(bmp));
	bitmask_free(bmp);
	return r;
}

/*
 * int cpuset_latestcpu(pid_t pid)
 *
 * Return most recent CPU on which task pid executed. If pid == 0,
 * examine current task.
 *
 * The last used CPU is visible for a given pid as field #39 (starting
 * with #1) in the file /proc/pid/stat. Currently this file has 41
 * fields, so this is the third-from-last field.
 *
 * Unfortunately field #2 is a command name and might have embedded
 * whitespace. So we can't just count white space separated fields.
 * Fortunately, this command name is surrounded by parentheses, as
 * for example "(sh)", and that closing parenthesis is the last ')'
 * character in the line. No remaining fields can have embedded
 * whitespace or parentheses. So instead of looking for the 39th
 * white space separated field, we can look for the 37th white space
 * separated field past the last ')' character on the line.
 */
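
/*
 * Hypothetical example: in a stat line such as
 *	1234 (a b) R 1 ...
 * scanning resumes after the final ')', so the embedded space in the
 * command name "(a b)" cannot throw off the field count.
 */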

/* Return most recent CPU on which task pid executed */
int cpuset_latestcpu(pid_t pid)
{
	char buf[PATH_MAX];
	char *bp;
	int fd = -1;
	int cpu = -1;
	int n;

	if (pid == 0)
		snprintf(buf, sizeof(buf), "/proc/self/stat");
	else
		snprintf(buf, sizeof(buf), "/proc/%d/stat", pid);

	if ((fd = open(buf, O_RDONLY)) < 0)
		goto err;
	if ((n = read(fd, buf, sizeof(buf) - 1)) < 1)
		goto err;
	close(fd);
	buf[n] = '\0';		/* read(2) does not nul-terminate for us */

	bp = strrchr(buf, ')');
	if (bp)
		sscanf(bp + 1, "%*s %*u %*u %*u %*u %*u %*u %*u "
		       "%*u %*u %*u %*u %*u %*u %*u %*u %*u %*u "
		       "%*u %*u %*u %*u %*u %*u %*u %*u %*u %*u "
		       "%*u %*u %*u %*u %*u %*u %*u %*u %u",	/* 37th field past ')' */
		       &cpu);
	if (cpu < 0)
		errno = EINVAL;
	return cpu;
err:
	if (fd >= 0)
		close(fd);
	return -1;
}

/* Bind current task to memory (uses set_mempolicy(2)) */
int cpuset_membind(int mem)
{
	struct bitmask *bmp;
	int r;

	if ((bmp = bitmask_alloc(cpuset_mems_nbits())) == NULL)
		return -1;
	bitmask_setbit(bmp, mem);
	r = set_mempolicy(MPOL_BIND, bitmask_mask(bmp), bitmask_nbits(bmp) + 1);
	bitmask_free(bmp);
	return r;
}

/* [optional] Return Memory Node holding page at specified addr */
int cpuset_addr2node(void *addr)
{
	int node = -1;

	if (get_mempolicy(&node, NULL, 0, addr, MPOL_F_NODE | MPOL_F_ADDR)) {
		/* I realize this seems redundant, but I _want_ to make sure
		 * that this value is -1. */
		node = -1;
	}
	return node;
}

/*
 * Transform cpuset into Text Format Representation in buffer 'buf',
 * of length 'buflen', nul-terminated if space allows. Return number
 * of characters that would have been written, if enough space had
 * been available, in the same way that snprintf() does.
 */

/* Export cpuset settings to a regular file */
int cpuset_export(const struct cpuset *cp, char *buf, int buflen)
{
	char *tmp = NULL;
	int n = 0;

	if (cp->cpu_exclusive)
		n += snprintf(buf + n, MAX(buflen - n, 0), "cpu_exclusive\n");

	if (cp->mem_exclusive)
		n += snprintf(buf + n, MAX(buflen - n, 0), "mem_exclusive\n");

	if (cp->notify_on_release)
		n += snprintf(buf + n, MAX(buflen - n, 0),
			      "notify_on_release\n");

	if (cp->memory_pressure_enabled)
		n += snprintf(buf + n, MAX(buflen - n, 0),
			      "memory_pressure_enabled\n");

	if (cp->memory_migrate)
		n += snprintf(buf + n, MAX(buflen - n, 0), "memory_migrate\n");

	if (cp->memory_spread_page)
		n += snprintf(buf + n, MAX(buflen - n, 0),
			      "memory_spread_page\n");

	if (cp->memory_spread_slab)
		n += snprintf(buf + n, MAX(buflen - n, 0),
			      "memory_spread_slab\n");

	if ((tmp = sprint_mask_buf(cp->cpus)) == NULL)
		return -1;
	n += snprintf(buf + n, MAX(buflen - n, 0), "cpus %s\n", tmp);
	free(tmp);
	tmp = NULL;

	if ((tmp = sprint_mask_buf(cp->mems)) == NULL)
		return -1;
	n += snprintf(buf + n, MAX(buflen - n, 0), "mems %s\n", tmp);
	free(tmp);
	tmp = NULL;

	return n;
}
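
/*
 * Hypothetical example of the Text Format Representation written by
 * cpuset_export() for a cpu_exclusive cpuset with cpus 0-3 and mem 0:
 *
 *	cpu_exclusive
 *	cpus 0-3
 *	mems 0
 */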

static int import_list(UNUSED const char *tok, const char *arg,
		       struct bitmask *bmp, char *emsg, int elen)
{
	if (bitmask_parselist(arg, bmp) < 0) {
		if (emsg)
			snprintf(emsg, elen, "Invalid list format: %s", arg);
		return -1;
	}
	return 0;
}

static void stolower(char *s)
{
	while (*s) {
		unsigned char c = *s;
		*s = tolower(c);
		s++;
	}
}

/* Import cpuset settings from a regular file */
int cpuset_import(struct cpuset *cp, const char *buf, int *elinenum,
		  char *emsg, int elen)
{
	char *linebuf = NULL;
	int linebuflen;
	int linenum = 0;
	int offset = 0;

	linebuflen = strlen(buf) + 1;
	if ((linebuf = malloc(linebuflen)) == NULL) {
		if (emsg)
			snprintf(emsg, elen, "Insufficient memory");
		goto err;
	}

	while (slgets(linebuf, linebuflen, buf, &offset)) {
		char *tok, *arg;
		char *ptr;	/* for strtok_r */

		linenum++;
		if ((tok = strchr(linebuf, '#')) != NULL)
			*tok = 0;
		if ((tok = strtok_r(linebuf, " \t", &ptr)) == NULL)
			continue;
		stolower(tok);

		arg = strtok_r(0, " \t", &ptr);

		if (streq(tok, "cpu_exclusive")) {
			cp->cpu_exclusive = 1;
			goto eol;
		}
		if (streq(tok, "mem_exclusive")) {
			cp->mem_exclusive = 1;
			goto eol;
		}
		if (streq(tok, "notify_on_release")) {
			cp->notify_on_release = 1;
			goto eol;
		}
		if (streq(tok, "memory_pressure_enabled")) {
			cp->memory_pressure_enabled = 1;
			goto eol;
		}
		if (streq(tok, "memory_migrate")) {
			cp->memory_migrate = 1;
			goto eol;
		}
		if (streq(tok, "memory_spread_page")) {
			cp->memory_spread_page = 1;
			goto eol;
		}
		if (streq(tok, "memory_spread_slab")) {
			cp->memory_spread_slab = 1;
			goto eol;
		}
		if (streq(tok, "cpu") || streq(tok, "cpus")) {
			if (import_list(tok, arg, cp->cpus, emsg, elen) < 0)
				goto err;
			goto eol;
		}
		if (streq(tok, "mem") || streq(tok, "mems")) {
			if (import_list(tok, arg, cp->mems, emsg, elen) < 0)
				goto err;
			goto eol;
		}
		if (emsg)
			snprintf(emsg, elen, "Unrecognized token: '%s'", tok);
		goto err;
eol:
		if ((tok = strtok_r(0, " \t", &ptr)) != NULL) {
			if (emsg)
				snprintf(emsg, elen, "Surplus token: '%s'",
					 tok);
			goto err;
		}
		continue;
	}

	free(linebuf);

	if (bitmask_isallclear(cp->cpus) && !bitmask_isallclear(cp->mems))
		cpuset_localcpus(cp->mems, cp->cpus);
	else if (!bitmask_isallclear(cp->cpus) && bitmask_isallclear(cp->mems))
		cpuset_localmems(cp->cpus, cp->mems);

	/*
	 * All cpuset attributes are determined in an import.
	 * Those that aren't explicitly specified are presumed
	 * to be unchanged (zero, if it's a freshly allocated
	 * struct cpuset.)
	 */

	cp->cpus_valid = 1;
	cp->mems_valid = 1;
	cp->cpu_exclusive_valid = 1;
	cp->mem_exclusive_valid = 1;
	cp->notify_on_release_valid = 1;
	cp->memory_migrate_valid = 1;
	cp->memory_pressure_enabled_valid = 1;
	cp->memory_spread_page_valid = 1;
	cp->memory_spread_slab_valid = 1;

	return 0;
err:
	if (elinenum)
		*elinenum = linenum;
	free(linebuf);
	return -1;
}
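
/*
 * Illustrative usage sketch (hypothetical buffer contents):
 *
 *	char emsg[80];
 *	int eline;
 *
 *	if (cpuset_import(cp, "cpus 0-1\nmems 0\n", &eline, emsg,
 *			  sizeof(emsg)) < 0)
 *		fprintf(stderr, "line %d: %s\n", eline, emsg);
 */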

/* Pin current task CPU (and memory) */
int cpuset_pin(int relcpu)
{
	struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
	int cpu, r;

	if (check() < 0)
		return -1;

	do {
		cpuset_free_placement(plc1);
		plc1 = cpuset_get_placement(0);

		r = 0;
		if (cpuset_unpin() < 0)
			r = -1;
		cpu = cpuset_p_rel_to_sys_cpu(0, relcpu);
		if (cpuset_cpubind(cpu) < 0)
			r = -1;

		cpuset_free_placement(plc2);
		plc2 = cpuset_get_placement(0);
	} while (!cpuset_equal_placement(plc1, plc2));

	cpuset_free_placement(plc1);
	cpuset_free_placement(plc2);
	return r;
}

/* Return number of CPUs in current task's cpuset */
int cpuset_size(void)
{
	struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
	int r;

	if (check() < 0)
		return -1;

	do {
		cpuset_free_placement(plc1);
		plc1 = cpuset_get_placement(0);

		r = cpuset_cpus_weight(0);

		cpuset_free_placement(plc2);
		plc2 = cpuset_get_placement(0);
	} while (!cpuset_equal_placement(plc1, plc2));

	cpuset_free_placement(plc1);
	cpuset_free_placement(plc2);
	return r;
}

/* Return relative CPU number, within current cpuset, last executed on */
int cpuset_where(void)
{
	struct cpuset_placement *plc1 = NULL, *plc2 = NULL;
	int r;

	if (check() < 0)
		return -1;

	do {
		cpuset_free_placement(plc1);
		plc1 = cpuset_get_placement(0);

		r = cpuset_p_sys_to_rel_cpu(0, cpuset_latestcpu(0));

		cpuset_free_placement(plc2);
		plc2 = cpuset_get_placement(0);
	} while (!cpuset_equal_placement(plc1, plc2));

	cpuset_free_placement(plc1);
	cpuset_free_placement(plc2);
	return r;
}

/* Undo cpuset_pin - let current task have the run of all CPUs in its cpuset */
int cpuset_unpin(void)
{
	struct bitmask *cpus = NULL, *mems = NULL;
	int r = -1;

	if (check() < 0)
		goto err;

	/*
	 * Don't need cpuset_*_placement() guard against concurrent
	 * cpuset migration, because none of the following depends
	 * on the task's cpuset placement.
	 */

	if ((cpus = bitmask_alloc(cpuset_cpus_nbits())) == NULL)
		goto err;
	bitmask_setall(cpus);
	if (sched_setaffinity(0, bitmask_nbytes(cpus), bitmask_mask(cpus)) < 0)
		goto err;

	if ((mems = bitmask_alloc(cpuset_mems_nbits())) == NULL)
		goto err;
	if (set_mempolicy(MPOL_DEFAULT, bitmask_mask(mems),
			  bitmask_nbits(mems) + 1) < 0)
		goto err;
	r = 0;
	/* fall into ... */
err:
	bitmask_free(cpus);
	bitmask_free(mems);
	return r;
}
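
/*
 * Illustrative usage sketch (error handling abbreviated):
 *
 *	if (cpuset_pin(0) == 0)
 *		printf("pinned; now on relative cpu %d of %d\n",
 *		       cpuset_where(), cpuset_size());
 *	cpuset_unpin();
 */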

struct cpuset_function_list {
	const char *fname;
	void *func;
} flist[] = {
	{ "cpuset_version", cpuset_version },
	{ "cpuset_alloc", cpuset_alloc },
	{ "cpuset_free", cpuset_free },
	{ "cpuset_cpus_nbits", cpuset_cpus_nbits },
	{ "cpuset_mems_nbits", cpuset_mems_nbits },
	{ "cpuset_setcpus", cpuset_setcpus },
	{ "cpuset_setmems", cpuset_setmems },
	{ "cpuset_set_iopt", cpuset_set_iopt },
	{ "cpuset_set_sopt", cpuset_set_sopt },
	{ "cpuset_getcpus", cpuset_getcpus },
	{ "cpuset_getmems", cpuset_getmems },
	{ "cpuset_cpus_weight", cpuset_cpus_weight },
	{ "cpuset_mems_weight", cpuset_mems_weight },
	{ "cpuset_get_iopt", cpuset_get_iopt },
	{ "cpuset_get_sopt", cpuset_get_sopt },
	{ "cpuset_localcpus", cpuset_localcpus },
	{ "cpuset_localmems", cpuset_localmems },
	{ "cpuset_cpumemdist", cpuset_cpumemdist },
	{ "cpuset_cpu2node", cpuset_cpu2node },
	{ "cpuset_addr2node", cpuset_addr2node },
	{ "cpuset_create", cpuset_create },
	{ "cpuset_delete", cpuset_delete },
	{ "cpuset_query", cpuset_query },
	{ "cpuset_modify", cpuset_modify },
	{ "cpuset_getcpusetpath", cpuset_getcpusetpath },
	{ "cpuset_cpusetofpid", cpuset_cpusetofpid },
	{ "cpuset_mountpoint", cpuset_mountpoint },
	{ "cpuset_collides_exclusive", cpuset_collides_exclusive },
	{ "cpuset_nuke", cpuset_nuke },
	{ "cpuset_init_pidlist", cpuset_init_pidlist },
	{ "cpuset_pidlist_length", cpuset_pidlist_length },
	{ "cpuset_get_pidlist", cpuset_get_pidlist },
	{ "cpuset_freepidlist", cpuset_freepidlist },
	{ "cpuset_move", cpuset_move },
	{ "cpuset_move_all", cpuset_move_all },
	{ "cpuset_move_cpuset_tasks", cpuset_move_cpuset_tasks },
	{ "cpuset_migrate", cpuset_migrate },
	{ "cpuset_migrate_all", cpuset_migrate_all },
	{ "cpuset_reattach", cpuset_reattach },
	{ "cpuset_open_memory_pressure", cpuset_open_memory_pressure },
	{ "cpuset_read_memory_pressure", cpuset_read_memory_pressure },
	{ "cpuset_close_memory_pressure", cpuset_close_memory_pressure },
	{ "cpuset_c_rel_to_sys_cpu", cpuset_c_rel_to_sys_cpu },
	{ "cpuset_c_sys_to_rel_cpu", cpuset_c_sys_to_rel_cpu },
	{ "cpuset_c_rel_to_sys_mem", cpuset_c_rel_to_sys_mem },
	{ "cpuset_c_sys_to_rel_mem", cpuset_c_sys_to_rel_mem },
	{ "cpuset_p_rel_to_sys_cpu", cpuset_p_rel_to_sys_cpu },
	{ "cpuset_p_sys_to_rel_cpu", cpuset_p_sys_to_rel_cpu },
	{ "cpuset_p_rel_to_sys_mem", cpuset_p_rel_to_sys_mem },
	{ "cpuset_p_sys_to_rel_mem", cpuset_p_sys_to_rel_mem },
	{ "cpuset_get_placement", cpuset_get_placement },
	{ "cpuset_equal_placement", cpuset_equal_placement },
	{ "cpuset_free_placement", cpuset_free_placement },
	{ "cpuset_fts_open", cpuset_fts_open },
	{ "cpuset_fts_read", cpuset_fts_read },
	{ "cpuset_fts_reverse", cpuset_fts_reverse },
	{ "cpuset_fts_rewind", cpuset_fts_rewind },
	{ "cpuset_fts_get_path", cpuset_fts_get_path },
	{ "cpuset_fts_get_stat", cpuset_fts_get_stat },
	{ "cpuset_fts_get_cpuset", cpuset_fts_get_cpuset },
	{ "cpuset_fts_get_errno", cpuset_fts_get_errno },
	{ "cpuset_fts_get_info", cpuset_fts_get_info },
	{ "cpuset_fts_close", cpuset_fts_close },
	{ "cpuset_cpubind", cpuset_cpubind },
	{ "cpuset_latestcpu", cpuset_latestcpu },
	{ "cpuset_membind", cpuset_membind },
	{ "cpuset_export", cpuset_export },
	{ "cpuset_import", cpuset_import },
	{ "cpuset_function", cpuset_function },
	{ "cpuset_pin", cpuset_pin },
	{ "cpuset_size", cpuset_size },
	{ "cpuset_where", cpuset_where },
	{ "cpuset_unpin", cpuset_unpin },
};

/* Return pointer to a libcpuset.so function, or NULL */
void *cpuset_function(const char *function_name)
{
	unsigned int i;

	for (i = 0; i < sizeof(flist) / sizeof(flist[0]); i++)
		if (streq(function_name, flist[i].fname))
			return flist[i].func;
	return NULL;
}
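
/*
 * Illustrative sketch: cpuset_function() lets callers probe for an
 * entry point by name at run time, e.g. when the available API version
 * is not known at compile time.  Converting the returned void * back
 * to a function pointer is not strictly portable C, but works on the
 * platforms this library targets.
 *
 *	int (*sizefn)(void) = (int (*)(void))cpuset_function("cpuset_size");
 *	if (sizefn)
 *		printf("cpuset has %d CPUs\n", sizefn());
 */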

/* Fortran interface to basic cpuset routines */
int cpuset_pin_(int *ptr_relcpu)
{
	return cpuset_pin(*ptr_relcpu);
}

int cpuset_size_(void)
{
	return cpuset_size();
}

int cpuset_where_(void)
{
	return cpuset_where();
}

int cpuset_unpin_(void)
{
	return cpuset_unpin();
}
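
/*
 * The trailing-underscore wrappers above follow the common f77-style
 * external naming convention (this is compiler-dependent) and take
 * their arguments by reference, as Fortran passes them, so Fortran
 * code can call them directly, for example:
 *
 *	integer relcpu, r
 *	relcpu = 2
 *	r = cpuset_pin(relcpu)
 *	r = cpuset_unpin()
 */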

#endif /* HAVE_LINUX_MEMPOLICY_H */