1 /*
2 * kmp_affinity.cpp -- affinity management
3 */
4
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_i18n.h"
16 #include "kmp_io.h"
17 #include "kmp_str.h"
18 #include "kmp_wrapper_getpid.h"
19 #if KMP_USE_HIER_SCHED
20 #include "kmp_dispatch_hier.h"
21 #endif
22
23 // Store the real or imagined machine hierarchy here
24 static hierarchy_info machine_hierarchy;
25
void __kmp_cleanup_hierarchy() { machine_hierarchy.fini(); }
27
void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
29 kmp_uint32 depth;
30 // The test below is true if affinity is available, but set to "none". Need to
31 // init on first use of hierarchical barrier.
32 if (TCR_1(machine_hierarchy.uninitialized))
33 machine_hierarchy.init(NULL, nproc);
34
35 // Adjust the hierarchy in case num threads exceeds original
36 if (nproc > machine_hierarchy.base_num_threads)
37 machine_hierarchy.resize(nproc);
38
39 depth = machine_hierarchy.depth;
40 KMP_DEBUG_ASSERT(depth > 0);
41
42 thr_bar->depth = depth;
43 thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0] - 1;
44 thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
45 }
46
47 #if KMP_AFFINITY_SUPPORTED
48
49 bool KMPAffinity::picked_api = false;
50
void *KMPAffinity::Mask::operator new(size_t n) { return __kmp_allocate(n); }
void *KMPAffinity::Mask::operator new[](size_t n) { return __kmp_allocate(n); }
void KMPAffinity::Mask::operator delete(void *p) { __kmp_free(p); }
void KMPAffinity::Mask::operator delete[](void *p) { __kmp_free(p); }
void *KMPAffinity::operator new(size_t n) { return __kmp_allocate(n); }
void KMPAffinity::operator delete(void *p) { __kmp_free(p); }
57
void KMPAffinity::pick_api() {
59 KMPAffinity *affinity_dispatch;
60 if (picked_api)
61 return;
62 #if KMP_USE_HWLOC
63 // Only use Hwloc if affinity isn't explicitly disabled and
64 // user requests Hwloc topology method
65 if (__kmp_affinity_top_method == affinity_top_method_hwloc &&
66 __kmp_affinity_type != affinity_disabled) {
67 affinity_dispatch = new KMPHwlocAffinity();
68 } else
69 #endif
70 {
71 affinity_dispatch = new KMPNativeAffinity();
72 }
73 __kmp_affinity_dispatch = affinity_dispatch;
74 picked_api = true;
75 }
76
void KMPAffinity::destroy_api() {
78 if (__kmp_affinity_dispatch != NULL) {
79 delete __kmp_affinity_dispatch;
80 __kmp_affinity_dispatch = NULL;
81 picked_api = false;
82 }
83 }
84
85 #define KMP_ADVANCE_SCAN(scan) \
86 while (*scan != '\0') { \
87 scan++; \
88 }
89
90 // Print the affinity mask to the character array in a pretty format.
91 // The format is a comma separated list of non-negative integers or integer
92 // ranges: e.g., 1,2,3-5,7,9-15
93 // The format can also be the string "{<empty>}" if no bits are set in mask
char *__kmp_affinity_print_mask(char *buf, int buf_len,
                                kmp_affin_mask_t *mask) {
96 int start = 0, finish = 0, previous = 0;
97 bool first_range;
98 KMP_ASSERT(buf);
99 KMP_ASSERT(buf_len >= 40);
100 KMP_ASSERT(mask);
101 char *scan = buf;
102 char *end = buf + buf_len - 1;
103
104 // Check for empty set.
105 if (mask->begin() == mask->end()) {
106 KMP_SNPRINTF(scan, end - scan + 1, "{<empty>}");
107 KMP_ADVANCE_SCAN(scan);
108 KMP_ASSERT(scan <= end);
109 return buf;
110 }
111
112 first_range = true;
113 start = mask->begin();
114 while (1) {
115 // Find next range
116 // [start, previous] is inclusive range of contiguous bits in mask
117 for (finish = mask->next(start), previous = start;
118 finish == previous + 1 && finish != mask->end();
119 finish = mask->next(finish)) {
120 previous = finish;
121 }
122
123 // The first range does not need a comma printed before it, but the rest
124 // of the ranges do need a comma beforehand
125 if (!first_range) {
126 KMP_SNPRINTF(scan, end - scan + 1, "%s", ",");
127 KMP_ADVANCE_SCAN(scan);
128 } else {
129 first_range = false;
130 }
131 // Range with three or more contiguous bits in the affinity mask
132 if (previous - start > 1) {
133 KMP_SNPRINTF(scan, end - scan + 1, "%d-%d", static_cast<int>(start),
134 static_cast<int>(previous));
135 } else {
136 // Range with one or two contiguous bits in the affinity mask
137 KMP_SNPRINTF(scan, end - scan + 1, "%d", static_cast<int>(start));
138 KMP_ADVANCE_SCAN(scan);
139 if (previous - start > 0) {
140 KMP_SNPRINTF(scan, end - scan + 1, ",%d", static_cast<int>(previous));
141 }
142 }
143 KMP_ADVANCE_SCAN(scan);
144 // Start over with new start point
145 start = finish;
146 if (start == mask->end())
147 break;
148 // Check for overflow
149 if (end - scan < 2)
150 break;
151 }
152
153 // Check for overflow
154 KMP_ASSERT(scan <= end);
155 return buf;
156 }
157 #undef KMP_ADVANCE_SCAN
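// Illustrative example (hypothetical caller, not part of this file): printing
// a mask with bits {0,1,2,5,8,9} set yields "0-2,5,8,9" -- three-or-more
// contiguous bits collapse to a range, one or two bits are listed singly.
//
//   char buf[KMP_AFFIN_MASK_PRINT_LEN];
//   kmp_affin_mask_t *m;
//   KMP_CPU_ALLOC(m);
//   KMP_CPU_ZERO(m);
//   int procs[] = {0, 1, 2, 5, 8, 9};
//   for (int p : procs)
//     KMP_CPU_SET(p, m);
//   __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, m); // "0-2,5,8,9"
//   KMP_CPU_FREE(m);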
158
159 // Print the affinity mask to the string buffer object in a pretty format
160 // The format is a comma separated list of non-negative integers or integer
161 // ranges: e.g., 1,2,3-5,7,9-15
162 // The format can also be the string "{<empty>}" if no bits are set in mask
kmp_str_buf_t *__kmp_affinity_str_buf_mask(kmp_str_buf_t *buf,
                                           kmp_affin_mask_t *mask) {
165 int start = 0, finish = 0, previous = 0;
166 bool first_range;
167 KMP_ASSERT(buf);
168 KMP_ASSERT(mask);
169
170 __kmp_str_buf_clear(buf);
171
172 // Check for empty set.
173 if (mask->begin() == mask->end()) {
174 __kmp_str_buf_print(buf, "%s", "{<empty>}");
175 return buf;
176 }
177
178 first_range = true;
179 start = mask->begin();
180 while (1) {
181 // Find next range
182 // [start, previous] is inclusive range of contiguous bits in mask
183 for (finish = mask->next(start), previous = start;
184 finish == previous + 1 && finish != mask->end();
185 finish = mask->next(finish)) {
186 previous = finish;
187 }
188
189 // The first range does not need a comma printed before it, but the rest
190 // of the ranges do need a comma beforehand
191 if (!first_range) {
192 __kmp_str_buf_print(buf, "%s", ",");
193 } else {
194 first_range = false;
195 }
196 // Range with three or more contiguous bits in the affinity mask
197 if (previous - start > 1) {
198 __kmp_str_buf_print(buf, "%d-%d", static_cast<int>(start),
199 static_cast<int>(previous));
200 } else {
201 // Range with one or two contiguous bits in the affinity mask
202 __kmp_str_buf_print(buf, "%d", static_cast<int>(start));
203 if (previous - start > 0) {
204 __kmp_str_buf_print(buf, ",%d", static_cast<int>(previous));
205 }
206 }
207 // Start over with new start point
208 start = finish;
209 if (start == mask->end())
210 break;
211 }
212 return buf;
213 }
214
void __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) {
216 KMP_CPU_ZERO(mask);
217
218 #if KMP_GROUP_AFFINITY
219
220 if (__kmp_num_proc_groups > 1) {
221 int group;
222 KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
223 for (group = 0; group < __kmp_num_proc_groups; group++) {
224 int i;
225 int num = __kmp_GetActiveProcessorCount(group);
226 for (i = 0; i < num; i++) {
227 KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
228 }
229 }
230 } else
231
232 #endif /* KMP_GROUP_AFFINITY */
233
234 {
235 int proc;
236 for (proc = 0; proc < __kmp_xproc; proc++) {
237 KMP_CPU_SET(proc, mask);
238 }
239 }
240 }
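// Illustrative example (hypothetical machine): on a Windows system with 2
// processor groups of 4 active procs each and 64-bit DWORD_PTR masks, the
// loop above sets bits 0-3 for group 0 and bits 64-67 for group 1. On all
// other systems it simply sets bits 0..__kmp_xproc-1.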
241
242 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be
243 // called to renumber the labels from [0..n] and place them into the child_num
244 // vector of the address object. This is done in case the labels used for
245 // the children at one node of the hierarchy differ from those used for
246 // another node at the same level. Example: suppose the machine has 2 nodes
247 // with 2 packages each. The first node contains packages 601 and 602, and
// the second node contains packages 603 and 604. If we try to sort the table
249 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604
250 // because we are paying attention to the labels themselves, not the ordinal
251 // child numbers. By using the child numbers in the sort, the result is
252 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
static void __kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
                                             int numAddrs) {
255 KMP_DEBUG_ASSERT(numAddrs > 0);
256 int depth = address2os->first.depth;
257 unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
258 unsigned *lastLabel = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
259 int labCt;
260 for (labCt = 0; labCt < depth; labCt++) {
261 address2os[0].first.childNums[labCt] = counts[labCt] = 0;
262 lastLabel[labCt] = address2os[0].first.labels[labCt];
263 }
264 int i;
265 for (i = 1; i < numAddrs; i++) {
266 for (labCt = 0; labCt < depth; labCt++) {
267 if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
268 int labCt2;
269 for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
270 counts[labCt2] = 0;
271 lastLabel[labCt2] = address2os[i].first.labels[labCt2];
272 }
273 counts[labCt]++;
274 lastLabel[labCt] = address2os[i].first.labels[labCt];
275 break;
276 }
277 }
278 for (labCt = 0; labCt < depth; labCt++) {
279 address2os[i].first.childNums[labCt] = counts[labCt];
280 }
281 for (; labCt < (int)Address::maxDepth; labCt++) {
282 address2os[i].first.childNums[labCt] = 0;
283 }
284 }
285 __kmp_free(lastLabel);
286 __kmp_free(counts);
287 }
288
289 // All of the __kmp_affinity_create_*_map() routines should set
290 // __kmp_affinity_masks to a vector of affinity mask objects of length
291 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and return
292 // the number of levels in the machine topology tree (zero if
293 // __kmp_affinity_type == affinity_none).
294 //
295 // All of the __kmp_affinity_create_*_map() routines should set
296 // *__kmp_affin_fullMask to the affinity mask for the initialization thread.
297 // They need to save and restore the mask, and it could be needed later, so
298 // saving it is just an optimization to avoid calling kmp_get_system_affinity()
299 // again.
300 kmp_affin_mask_t *__kmp_affin_fullMask = NULL;
301
302 static int nCoresPerPkg, nPackages;
303 static int __kmp_nThreadsPerCore;
304 #ifndef KMP_DFLT_NTH_CORES
305 static int __kmp_ncores;
306 #endif
307 static int *__kmp_pu_os_idx = NULL;
308
309 // __kmp_affinity_uniform_topology() doesn't work when called from
310 // places which support arbitrarily many levels in the machine topology
// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map() and
// __kmp_affinity_create_x2apicid_map().
inline static bool __kmp_affinity_uniform_topology() {
314 return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
315 }
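// Illustrative example (hypothetical machine): with 2 packages x 8 cores x 2
// hardware threads, __kmp_avail_proc == 32 == 2 * 8 * 2 and the topology is
// reported as uniform; if 4 of those hardware threads were excluded from the
// process affinity mask, __kmp_avail_proc would be 28 and the check would
// report a non-uniform topology.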
316
317 // Print out the detailed machine topology map, i.e. the physical locations
318 // of each OS proc.
static void __kmp_affinity_print_topology(AddrUnsPair *address2os, int len,
                                          int depth, int pkgLevel,
                                          int coreLevel, int threadLevel) {
322 int proc;
323
324 KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
325 for (proc = 0; proc < len; proc++) {
326 int level;
327 kmp_str_buf_t buf;
328 __kmp_str_buf_init(&buf);
329 for (level = 0; level < depth; level++) {
330 if (level == threadLevel) {
331 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
332 } else if (level == coreLevel) {
333 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
334 } else if (level == pkgLevel) {
335 __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
336 } else if (level > pkgLevel) {
337 __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
338 level - pkgLevel - 1);
339 } else {
340 __kmp_str_buf_print(&buf, "L%d ", level);
341 }
342 __kmp_str_buf_print(&buf, "%d ", address2os[proc].first.labels[level]);
343 }
344 KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
345 buf.str);
346 __kmp_str_buf_free(&buf);
347 }
348 }
349
350 #if KMP_USE_HWLOC
351
static void __kmp_affinity_print_hwloc_tp(AddrUnsPair *addrP, int len,
                                          int depth, int *levels) {
354 int proc;
355 kmp_str_buf_t buf;
356 __kmp_str_buf_init(&buf);
357 KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
358 for (proc = 0; proc < len; proc++) {
359 __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Package),
360 addrP[proc].first.labels[0]);
361 if (depth > 1) {
362 int level = 1; // iterate over levels
363 int label = 1; // iterate over labels
364 if (__kmp_numa_detected)
365 // node level follows package
366 if (levels[level++] > 0)
367 __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Node),
368 addrP[proc].first.labels[label++]);
369 if (__kmp_tile_depth > 0)
370 // tile level follows node if any, or package
371 if (levels[level++] > 0)
372 __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Tile),
373 addrP[proc].first.labels[label++]);
374 if (levels[level++] > 0)
375 // core level follows
376 __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Core),
377 addrP[proc].first.labels[label++]);
378 if (levels[level++] > 0)
        // thread level comes last
380 __kmp_str_buf_print(&buf, "%s %d ", KMP_I18N_STR(Thread),
381 addrP[proc].first.labels[label++]);
382 KMP_DEBUG_ASSERT(label == depth);
383 }
384 KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", addrP[proc].second, buf.str);
385 __kmp_str_buf_clear(&buf);
386 }
387 __kmp_str_buf_free(&buf);
388 }
389
390 static int nNodePerPkg, nTilePerPkg, nTilePerNode, nCorePerNode, nCorePerTile;
391
// This function removes the topology levels that are radix 1 and don't offer
// further information about the topology. The most common example is when
// there is one thread context per core; the extra thread-context level offers
// no unique labels, so it is removed.
396 // return value: the new depth of address2os
static int __kmp_affinity_remove_radix_one_levels(AddrUnsPair *addrP, int nTh,
                                                  int depth, int *levels) {
399 int level;
400 int i;
401 int radix1_detected;
402 int new_depth = depth;
403 for (level = depth - 1; level > 0; --level) {
404 // Detect if this level is radix 1
405 radix1_detected = 1;
406 for (i = 1; i < nTh; ++i) {
407 if (addrP[0].first.labels[level] != addrP[i].first.labels[level]) {
408 // There are differing label values for this level so it stays
409 radix1_detected = 0;
410 break;
411 }
412 }
413 if (!radix1_detected)
414 continue;
415 // Radix 1 was detected
416 --new_depth;
417 levels[level] = -1; // mark level as not present in address2os array
418 if (level == new_depth) {
419 // "turn off" deepest level, just decrement the depth that removes
420 // the level from address2os array
421 for (i = 0; i < nTh; ++i) {
422 addrP[i].first.depth--;
423 }
424 } else {
425 // For other levels, we move labels over and also reduce the depth
426 int j;
427 for (j = level; j < new_depth; ++j) {
428 for (i = 0; i < nTh; ++i) {
429 addrP[i].first.labels[j] = addrP[i].first.labels[j + 1];
430 addrP[i].first.depth--;
431 }
432 levels[j + 1] -= 1;
433 }
434 }
435 }
436 return new_depth;
437 }
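// Illustrative example: if every core reports exactly one thread context, the
// thread level is radix 1. For an address {pkg=1, core=3, thread=0} the
// deepest level is simply dropped, leaving {pkg=1, core=3}, and levels[]
// records -1 for the removed level so later code can map granularity names
// onto the remaining levels.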
438
439 // Returns the number of objects of type 'type' below 'obj' within the topology
440 // tree structure. e.g., if obj is a HWLOC_OBJ_PACKAGE object, and type is
// HWLOC_OBJ_PU, then this will return the number of PU's under the PACKAGE
442 // object.
static int __kmp_hwloc_get_nobjs_under_obj(hwloc_obj_t obj,
                                           hwloc_obj_type_t type) {
445 int retval = 0;
446 hwloc_obj_t first;
447 for (first = hwloc_get_obj_below_by_type(__kmp_hwloc_topology, obj->type,
448 obj->logical_index, type, 0);
449 first != NULL &&
450 hwloc_get_ancestor_obj_by_type(__kmp_hwloc_topology, obj->type, first) ==
451 obj;
452 first = hwloc_get_next_obj_by_type(__kmp_hwloc_topology, first->type,
453 first)) {
454 ++retval;
455 }
456 return retval;
457 }
458
static int __kmp_hwloc_count_children_by_depth(hwloc_topology_t t,
                                               hwloc_obj_t o,
                                               kmp_hwloc_depth_t depth,
                                               hwloc_obj_t *f) {
463 if (o->depth == depth) {
464 if (*f == NULL)
465 *f = o; // output first descendant found
466 return 1;
467 }
468 int sum = 0;
469 for (unsigned i = 0; i < o->arity; i++)
470 sum += __kmp_hwloc_count_children_by_depth(t, o->children[i], depth, f);
471 return sum; // will be 0 if no one found (as PU arity is 0)
472 }
473
static int __kmp_hwloc_count_children_by_type(hwloc_topology_t t, hwloc_obj_t o,
                                              hwloc_obj_type_t type,
                                              hwloc_obj_t *f) {
477 if (!hwloc_compare_types(o->type, type)) {
478 if (*f == NULL)
479 *f = o; // output first descendant found
480 return 1;
481 }
482 int sum = 0;
483 for (unsigned i = 0; i < o->arity; i++)
484 sum += __kmp_hwloc_count_children_by_type(t, o->children[i], type, f);
485 return sum; // will be 0 if no one found (as PU arity is 0)
486 }
487
static int __kmp_hwloc_process_obj_core_pu(AddrUnsPair *addrPair,
                                           int &nActiveThreads,
                                           int &num_active_cores,
                                           hwloc_obj_t obj, int depth,
                                           int *labels) {
493 hwloc_obj_t core = NULL;
494 hwloc_topology_t &tp = __kmp_hwloc_topology;
495 int NC = __kmp_hwloc_count_children_by_type(tp, obj, HWLOC_OBJ_CORE, &core);
496 for (int core_id = 0; core_id < NC; ++core_id, core = core->next_cousin) {
497 hwloc_obj_t pu = NULL;
498 KMP_DEBUG_ASSERT(core != NULL);
499 int num_active_threads = 0;
500 int NT = __kmp_hwloc_count_children_by_type(tp, core, HWLOC_OBJ_PU, &pu);
501 // int NT = core->arity; pu = core->first_child; // faster?
502 for (int pu_id = 0; pu_id < NT; ++pu_id, pu = pu->next_cousin) {
503 KMP_DEBUG_ASSERT(pu != NULL);
504 if (!KMP_CPU_ISSET(pu->os_index, __kmp_affin_fullMask))
505 continue; // skip inactive (inaccessible) unit
506 Address addr(depth + 2);
507 KA_TRACE(20, ("Hwloc inserting %d (%d) %d (%d) %d (%d) into address2os\n",
508 obj->os_index, obj->logical_index, core->os_index,
509 core->logical_index, pu->os_index, pu->logical_index));
510 for (int i = 0; i < depth; ++i)
511 addr.labels[i] = labels[i]; // package, etc.
512 addr.labels[depth] = core_id; // core
513 addr.labels[depth + 1] = pu_id; // pu
514 addrPair[nActiveThreads] = AddrUnsPair(addr, pu->os_index);
515 __kmp_pu_os_idx[nActiveThreads] = pu->os_index;
516 nActiveThreads++;
517 ++num_active_threads; // count active threads per core
518 }
519 if (num_active_threads) { // were there any active threads on the core?
520 ++__kmp_ncores; // count total active cores
521 ++num_active_cores; // count active cores per socket
522 if (num_active_threads > __kmp_nThreadsPerCore)
523 __kmp_nThreadsPerCore = num_active_threads; // calc maximum
524 }
525 }
526 return 0;
527 }
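// Illustrative example: when called for a tile with labels = {socket, node,
// tile} and depth == 3, each accessible PU gets a 5-level address
// {socket, node, tile, core_id, pu_id}, and its OS index is recorded in both
// addrPair[] and __kmp_pu_os_idx[].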
528
// Check whether a NUMA node is detected below the package, and whether a tile
// object is detected; if so, record the tile object's depth.
static int __kmp_hwloc_check_numa() {
532 hwloc_topology_t &tp = __kmp_hwloc_topology;
533 hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to)
534 int depth, l2cache_depth, package_depth;
535
536 // Get some PU
537 hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, 0);
538 if (hT == NULL) // something has gone wrong
539 return 1;
540
541 // check NUMA node below PACKAGE
542 hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT);
543 hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT);
544 KMP_DEBUG_ASSERT(hS != NULL);
545 if (hN != NULL && hN->depth > hS->depth) {
546 __kmp_numa_detected = TRUE; // socket includes node(s)
547 if (__kmp_affinity_gran == affinity_gran_node) {
548 __kmp_affinity_gran = affinity_gran_numa;
549 }
550 }
551
552 package_depth = hwloc_get_type_depth(tp, HWLOC_OBJ_PACKAGE);
553 l2cache_depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED);
554 // check tile, get object by depth because of multiple caches possible
555 depth = (l2cache_depth < package_depth) ? package_depth : l2cache_depth;
556 hL = hwloc_get_ancestor_obj_by_depth(tp, depth, hT);
557 hC = NULL; // not used, but reset it here just in case
558 if (hL != NULL &&
559 __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1)
560 __kmp_tile_depth = depth; // tile consists of multiple cores
561 return 0;
562 }
563
static int __kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
                                           kmp_i18n_id_t *const msg_id) {
566 hwloc_topology_t &tp = __kmp_hwloc_topology; // shortcut of a long name
567 *address2os = NULL;
568 *msg_id = kmp_i18n_null;
569
570 // Save the affinity mask for the current thread.
571 kmp_affin_mask_t *oldMask;
572 KMP_CPU_ALLOC(oldMask);
573 __kmp_get_system_affinity(oldMask, TRUE);
574 __kmp_hwloc_check_numa();
575
576 if (!KMP_AFFINITY_CAPABLE()) {
577 // Hack to try and infer the machine topology using only the data
578 // available from cpuid on the current thread, and __kmp_xproc.
579 KMP_ASSERT(__kmp_affinity_type == affinity_none);
    // hwloc only guarantees existence of PU object, so check PACKAGE and CORE
581 hwloc_obj_t o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0);
582 if (o != NULL)
583 nCoresPerPkg = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_CORE);
584 else
585 nCoresPerPkg = 1; // no PACKAGE found
586 o = hwloc_get_obj_by_type(tp, HWLOC_OBJ_CORE, 0);
587 if (o != NULL)
588 __kmp_nThreadsPerCore = __kmp_hwloc_get_nobjs_under_obj(o, HWLOC_OBJ_PU);
589 else
590 __kmp_nThreadsPerCore = 1; // no CORE found
591 __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
592 if (nCoresPerPkg == 0)
593 nCoresPerPkg = 1; // to prevent possible division by 0
594 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
595 if (__kmp_affinity_verbose) {
596 KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
597 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
598 if (__kmp_affinity_uniform_topology()) {
599 KMP_INFORM(Uniform, "KMP_AFFINITY");
600 } else {
601 KMP_INFORM(NonUniform, "KMP_AFFINITY");
602 }
603 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
604 __kmp_nThreadsPerCore, __kmp_ncores);
605 }
606 KMP_CPU_FREE(oldMask);
607 return 0;
608 }
609
610 int depth = 3;
611 int levels[5] = {0, 1, 2, 3, 4}; // package, [node,] [tile,] core, thread
612 int labels[3] = {0}; // package [,node] [,tile] - head of labels array
613 if (__kmp_numa_detected)
614 ++depth;
615 if (__kmp_tile_depth)
616 ++depth;
617
618 // Allocate the data structure to be returned.
619 AddrUnsPair *retval =
620 (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
621 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
622 __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
623
624 // When affinity is off, this routine will still be called to set
625 // __kmp_ncores, as well as __kmp_nThreadsPerCore,
626 // nCoresPerPkg, & nPackages. Make sure all these vars are set
627 // correctly, and return if affinity is not enabled.
628
629 hwloc_obj_t socket, node, tile;
630 int nActiveThreads = 0;
631 int socket_id = 0;
632 // re-calculate globals to count only accessible resources
633 __kmp_ncores = nPackages = nCoresPerPkg = __kmp_nThreadsPerCore = 0;
634 nNodePerPkg = nTilePerPkg = nTilePerNode = nCorePerNode = nCorePerTile = 0;
635 for (socket = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PACKAGE, 0); socket != NULL;
636 socket = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, socket),
637 socket_id++) {
638 labels[0] = socket_id;
639 if (__kmp_numa_detected) {
640 int NN;
641 int n_active_nodes = 0;
642 node = NULL;
643 NN = __kmp_hwloc_count_children_by_type(tp, socket, HWLOC_OBJ_NUMANODE,
644 &node);
645 for (int node_id = 0; node_id < NN; ++node_id, node = node->next_cousin) {
646 labels[1] = node_id;
647 if (__kmp_tile_depth) {
648 // NUMA + tiles
649 int NT;
650 int n_active_tiles = 0;
651 tile = NULL;
652 NT = __kmp_hwloc_count_children_by_depth(tp, node, __kmp_tile_depth,
653 &tile);
654 for (int tl_id = 0; tl_id < NT; ++tl_id, tile = tile->next_cousin) {
655 labels[2] = tl_id;
656 int n_active_cores = 0;
657 __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
658 n_active_cores, tile, 3, labels);
659 if (n_active_cores) { // were there any active cores on the socket?
660 ++n_active_tiles; // count active tiles per node
661 if (n_active_cores > nCorePerTile)
662 nCorePerTile = n_active_cores; // calc maximum
663 }
664 }
665 if (n_active_tiles) { // were there any active tiles on the socket?
666 ++n_active_nodes; // count active nodes per package
667 if (n_active_tiles > nTilePerNode)
668 nTilePerNode = n_active_tiles; // calc maximum
669 }
670 } else {
671 // NUMA, no tiles
672 int n_active_cores = 0;
673 __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
674 n_active_cores, node, 2, labels);
675 if (n_active_cores) { // were there any active cores on the socket?
676 ++n_active_nodes; // count active nodes per package
677 if (n_active_cores > nCorePerNode)
678 nCorePerNode = n_active_cores; // calc maximum
679 }
680 }
681 }
682 if (n_active_nodes) { // were there any active nodes on the socket?
683 ++nPackages; // count total active packages
684 if (n_active_nodes > nNodePerPkg)
685 nNodePerPkg = n_active_nodes; // calc maximum
686 }
687 } else {
688 if (__kmp_tile_depth) {
689 // no NUMA, tiles
690 int NT;
691 int n_active_tiles = 0;
692 tile = NULL;
693 NT = __kmp_hwloc_count_children_by_depth(tp, socket, __kmp_tile_depth,
694 &tile);
695 for (int tl_id = 0; tl_id < NT; ++tl_id, tile = tile->next_cousin) {
696 labels[1] = tl_id;
697 int n_active_cores = 0;
698 __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads,
699 n_active_cores, tile, 2, labels);
700 if (n_active_cores) { // were there any active cores on the socket?
701 ++n_active_tiles; // count active tiles per package
702 if (n_active_cores > nCorePerTile)
703 nCorePerTile = n_active_cores; // calc maximum
704 }
705 }
706 if (n_active_tiles) { // were there any active tiles on the socket?
707 ++nPackages; // count total active packages
708 if (n_active_tiles > nTilePerPkg)
709 nTilePerPkg = n_active_tiles; // calc maximum
710 }
711 } else {
712 // no NUMA, no tiles
713 int n_active_cores = 0;
714 __kmp_hwloc_process_obj_core_pu(retval, nActiveThreads, n_active_cores,
715 socket, 1, labels);
716 if (n_active_cores) { // were there any active cores on the socket?
717 ++nPackages; // count total active packages
718 if (n_active_cores > nCoresPerPkg)
719 nCoresPerPkg = n_active_cores; // calc maximum
720 }
721 }
722 }
723 }
724
725 // If there's only one thread context to bind to, return now.
726 KMP_DEBUG_ASSERT(nActiveThreads == __kmp_avail_proc);
727 KMP_ASSERT(nActiveThreads > 0);
728 if (nActiveThreads == 1) {
729 __kmp_ncores = nPackages = 1;
730 __kmp_nThreadsPerCore = nCoresPerPkg = 1;
731 if (__kmp_affinity_verbose) {
732 char buf[KMP_AFFIN_MASK_PRINT_LEN];
733 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
734
735 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
736 if (__kmp_affinity_respect_mask) {
737 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
738 } else {
739 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
740 }
741 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
742 KMP_INFORM(Uniform, "KMP_AFFINITY");
743 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
744 __kmp_nThreadsPerCore, __kmp_ncores);
745 }
746
747 if (__kmp_affinity_type == affinity_none) {
748 __kmp_free(retval);
749 KMP_CPU_FREE(oldMask);
750 return 0;
751 }
752
753 // Form an Address object which only includes the package level.
754 Address addr(1);
755 addr.labels[0] = retval[0].first.labels[0];
756 retval[0].first = addr;
757
758 if (__kmp_affinity_gran_levels < 0) {
759 __kmp_affinity_gran_levels = 0;
760 }
761
762 if (__kmp_affinity_verbose) {
763 __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
764 }
765
766 *address2os = retval;
767 KMP_CPU_FREE(oldMask);
768 return 1;
769 }
770
771 // Sort the table by physical Id.
772 qsort(retval, nActiveThreads, sizeof(*retval),
773 __kmp_affinity_cmp_Address_labels);
774
775 // Check to see if the machine topology is uniform
776 int nPUs = nPackages * __kmp_nThreadsPerCore;
777 if (__kmp_numa_detected) {
778 if (__kmp_tile_depth) { // NUMA + tiles
779 nPUs *= (nNodePerPkg * nTilePerNode * nCorePerTile);
780 } else { // NUMA, no tiles
781 nPUs *= (nNodePerPkg * nCorePerNode);
782 }
783 } else {
784 if (__kmp_tile_depth) { // no NUMA, tiles
785 nPUs *= (nTilePerPkg * nCorePerTile);
786 } else { // no NUMA, no tiles
787 nPUs *= nCoresPerPkg;
788 }
789 }
790 unsigned uniform = (nPUs == nActiveThreads);
791
792 // Print the machine topology summary.
793 if (__kmp_affinity_verbose) {
794 char mask[KMP_AFFIN_MASK_PRINT_LEN];
795 __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
796 if (__kmp_affinity_respect_mask) {
797 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
798 } else {
799 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
800 }
801 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
802 if (uniform) {
803 KMP_INFORM(Uniform, "KMP_AFFINITY");
804 } else {
805 KMP_INFORM(NonUniform, "KMP_AFFINITY");
806 }
807 if (__kmp_numa_detected) {
808 if (__kmp_tile_depth) { // NUMA + tiles
809 KMP_INFORM(TopologyExtraNoTi, "KMP_AFFINITY", nPackages, nNodePerPkg,
810 nTilePerNode, nCorePerTile, __kmp_nThreadsPerCore,
811 __kmp_ncores);
812 } else { // NUMA, no tiles
813 KMP_INFORM(TopologyExtraNode, "KMP_AFFINITY", nPackages, nNodePerPkg,
814 nCorePerNode, __kmp_nThreadsPerCore, __kmp_ncores);
815 nPUs *= (nNodePerPkg * nCorePerNode);
816 }
817 } else {
818 if (__kmp_tile_depth) { // no NUMA, tiles
819 KMP_INFORM(TopologyExtraTile, "KMP_AFFINITY", nPackages, nTilePerPkg,
820 nCorePerTile, __kmp_nThreadsPerCore, __kmp_ncores);
821 } else { // no NUMA, no tiles
822 kmp_str_buf_t buf;
823 __kmp_str_buf_init(&buf);
824 __kmp_str_buf_print(&buf, "%d", nPackages);
825 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
826 __kmp_nThreadsPerCore, __kmp_ncores);
827 __kmp_str_buf_free(&buf);
828 }
829 }
830 }
831
832 if (__kmp_affinity_type == affinity_none) {
833 __kmp_free(retval);
834 KMP_CPU_FREE(oldMask);
835 return 0;
836 }
837
838 int depth_full = depth; // number of levels before compressing
839 // Find any levels with radix 1, and remove them from the map
840 // (except for the package level).
841 depth = __kmp_affinity_remove_radix_one_levels(retval, nActiveThreads, depth,
842 levels);
843 KMP_DEBUG_ASSERT(__kmp_affinity_gran != affinity_gran_default);
844 if (__kmp_affinity_gran_levels < 0) {
845 // Set the granularity level based on what levels are modeled
846 // in the machine topology map.
847 __kmp_affinity_gran_levels = 0; // lowest level (e.g. fine)
848 if (__kmp_affinity_gran > affinity_gran_thread) {
849 for (int i = 1; i <= depth_full; ++i) {
850 if (__kmp_affinity_gran <= i) // only count deeper levels
851 break;
852 if (levels[depth_full - i] > 0)
853 __kmp_affinity_gran_levels++;
854 }
855 }
856 if (__kmp_affinity_gran > affinity_gran_package)
857 __kmp_affinity_gran_levels++; // e.g. granularity = group
858 }
859
860 if (__kmp_affinity_verbose)
861 __kmp_affinity_print_hwloc_tp(retval, nActiveThreads, depth, levels);
862
863 KMP_CPU_FREE(oldMask);
864 *address2os = retval;
865 return depth;
866 }
867 #endif // KMP_USE_HWLOC
868
869 // If we don't know how to retrieve the machine's processor topology, or
870 // encounter an error in doing so, this routine is called to form a "flat"
871 // mapping of os thread id's <-> processor id's.
static int __kmp_affinity_create_flat_map(AddrUnsPair **address2os,
                                          kmp_i18n_id_t *const msg_id) {
874 *address2os = NULL;
875 *msg_id = kmp_i18n_null;
876
  // Even if __kmp_affinity_type == affinity_none, this routine might still be
  // called to set __kmp_ncores, as well as
879 // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
880 if (!KMP_AFFINITY_CAPABLE()) {
881 KMP_ASSERT(__kmp_affinity_type == affinity_none);
882 __kmp_ncores = nPackages = __kmp_xproc;
883 __kmp_nThreadsPerCore = nCoresPerPkg = 1;
884 if (__kmp_affinity_verbose) {
885 KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
886 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
887 KMP_INFORM(Uniform, "KMP_AFFINITY");
888 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
889 __kmp_nThreadsPerCore, __kmp_ncores);
890 }
891 return 0;
892 }
893
894 // When affinity is off, this routine will still be called to set
895 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
896 // Make sure all these vars are set correctly, and return now if affinity is
897 // not enabled.
898 __kmp_ncores = nPackages = __kmp_avail_proc;
899 __kmp_nThreadsPerCore = nCoresPerPkg = 1;
900 if (__kmp_affinity_verbose) {
901 char buf[KMP_AFFIN_MASK_PRINT_LEN];
902 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
903 __kmp_affin_fullMask);
904
905 KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
906 if (__kmp_affinity_respect_mask) {
907 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
908 } else {
909 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
910 }
911 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
912 KMP_INFORM(Uniform, "KMP_AFFINITY");
913 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
914 __kmp_nThreadsPerCore, __kmp_ncores);
915 }
916 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
917 __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
918 if (__kmp_affinity_type == affinity_none) {
919 int avail_ct = 0;
920 int i;
921 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
922 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask))
923 continue;
924 __kmp_pu_os_idx[avail_ct++] = i; // suppose indices are flat
925 }
926 return 0;
927 }
928
929 // Construct the data structure to be returned.
930 *address2os =
931 (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
932 int avail_ct = 0;
933 int i;
934 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
935 // Skip this proc if it is not included in the machine model.
936 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
937 continue;
938 }
939 __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
940 Address addr(1);
941 addr.labels[0] = i;
942 (*address2os)[avail_ct++] = AddrUnsPair(addr, i);
943 }
944 if (__kmp_affinity_verbose) {
945 KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
946 }
947
948 if (__kmp_affinity_gran_levels < 0) {
949 // Only the package level is modeled in the machine topology map,
950 // so the #levels of granularity is either 0 or 1.
951 if (__kmp_affinity_gran > affinity_gran_package) {
952 __kmp_affinity_gran_levels = 1;
953 } else {
954 __kmp_affinity_gran_levels = 0;
955 }
956 }
957 return 1;
958 }
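// Illustrative example: on a 4-processor machine with all procs available,
// the flat map is simply (*address2os)[i] = AddrUnsPair(addr, i) with
// addr.labels[0] = i for i = 0..3, i.e. each OS proc is treated as its own
// package.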
959
960 #if KMP_GROUP_AFFINITY
961
962 // If multiple Windows* OS processor groups exist, we can create a 2-level
963 // topology map with the groups at level 0 and the individual procs at level 1.
964 // This facilitates letting the threads float among all procs in a group,
965 // if granularity=group (the default when there are multiple groups).
static int __kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
                                                kmp_i18n_id_t *const msg_id) {
968 *address2os = NULL;
969 *msg_id = kmp_i18n_null;
970
971 // If we aren't affinity capable, then return now.
972 // The flat mapping will be used.
973 if (!KMP_AFFINITY_CAPABLE()) {
974 // FIXME set *msg_id
975 return -1;
976 }
977
978 // Construct the data structure to be returned.
979 *address2os =
980 (AddrUnsPair *)__kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
981 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
982 __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
983 int avail_ct = 0;
984 int i;
985 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
986 // Skip this proc if it is not included in the machine model.
987 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
988 continue;
989 }
990 __kmp_pu_os_idx[avail_ct] = i; // suppose indices are flat
991 Address addr(2);
992 addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
993 addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
994 (*address2os)[avail_ct++] = AddrUnsPair(addr, i);
995
996 if (__kmp_affinity_verbose) {
997 KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
998 addr.labels[1]);
999 }
1000 }
1001
1002 if (__kmp_affinity_gran_levels < 0) {
1003 if (__kmp_affinity_gran == affinity_gran_group) {
1004 __kmp_affinity_gran_levels = 1;
1005 } else if ((__kmp_affinity_gran == affinity_gran_fine) ||
1006 (__kmp_affinity_gran == affinity_gran_thread)) {
1007 __kmp_affinity_gran_levels = 0;
1008 } else {
1009 const char *gran_str = NULL;
1010 if (__kmp_affinity_gran == affinity_gran_core) {
1011 gran_str = "core";
1012 } else if (__kmp_affinity_gran == affinity_gran_package) {
1013 gran_str = "package";
1014 } else if (__kmp_affinity_gran == affinity_gran_node) {
1015 gran_str = "node";
1016 } else {
1017 KMP_ASSERT(0);
1018 }
1019
1020 // Warning: can't use affinity granularity \"gran\" with group topology
1021 // method, using "thread"
1022 __kmp_affinity_gran_levels = 0;
1023 }
1024 }
1025 return 2;
1026 }
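// Illustrative example: with 64-bit DWORD_PTR group masks, OS proc 70 maps to
// labels {group = 70 / 64 = 1, proc-in-group = 70 % 64 = 6}, so the returned
// depth-2 address is {1, 6}.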
1027
1028 #endif /* KMP_GROUP_AFFINITY */
1029
1030 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1031
static int __kmp_cpuid_mask_width(int count) {
1033 int r = 0;
1034
1035 while ((1 << r) < count)
1036 ++r;
1037 return r;
1038 }
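// Illustrative example: __kmp_cpuid_mask_width(6) == 3, since 2^2 = 4 < 6 but
// 2^3 = 8 >= 6; i.e. 3 bits are needed to encode 6 distinct ids.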
1039
1040 class apicThreadInfo {
1041 public:
1042 unsigned osId; // param to __kmp_affinity_bind_thread
1043 unsigned apicId; // from cpuid after binding
1044 unsigned maxCoresPerPkg; // ""
1045 unsigned maxThreadsPerPkg; // ""
1046 unsigned pkgId; // inferred from above values
1047 unsigned coreId; // ""
1048 unsigned threadId; // ""
1049 };
1050
static int __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a,
                                                     const void *b) {
1053 const apicThreadInfo *aa = (const apicThreadInfo *)a;
1054 const apicThreadInfo *bb = (const apicThreadInfo *)b;
1055 if (aa->pkgId < bb->pkgId)
1056 return -1;
1057 if (aa->pkgId > bb->pkgId)
1058 return 1;
1059 if (aa->coreId < bb->coreId)
1060 return -1;
1061 if (aa->coreId > bb->coreId)
1062 return 1;
1063 if (aa->threadId < bb->threadId)
1064 return -1;
1065 if (aa->threadId > bb->threadId)
1066 return 1;
1067 return 0;
1068 }
1069
1070 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
1071 // an algorithm which cycles through the available os threads, setting
1072 // the current thread's affinity mask to that thread, and then retrieves
1073 // the Apic Id for each thread context using the cpuid instruction.
static int __kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
                                            kmp_i18n_id_t *const msg_id) {
1076 kmp_cpuid buf;
1077 *address2os = NULL;
1078 *msg_id = kmp_i18n_null;
1079
1080 // Check if cpuid leaf 4 is supported.
1081 __kmp_x86_cpuid(0, 0, &buf);
1082 if (buf.eax < 4) {
1083 *msg_id = kmp_i18n_str_NoLeaf4Support;
1084 return -1;
1085 }
1086
1087 // The algorithm used starts by setting the affinity to each available thread
1088 // and retrieving info from the cpuid instruction, so if we are not capable of
  // calling __kmp_get_system_affinity() and __kmp_set_system_affinity(), then we
1090 // need to do something else - use the defaults that we calculated from
1091 // issuing cpuid without binding to each proc.
1092 if (!KMP_AFFINITY_CAPABLE()) {
1093 // Hack to try and infer the machine topology using only the data
1094 // available from cpuid on the current thread, and __kmp_xproc.
1095 KMP_ASSERT(__kmp_affinity_type == affinity_none);
1096
1097 // Get an upper bound on the number of threads per package using cpuid(1).
    // On some OS/chip combinations where HT is supported by the chip but is
1099 // disabled, this value will be 2 on a single core chip. Usually, it will be
1100 // 2 if HT is enabled and 1 if HT is disabled.
1101 __kmp_x86_cpuid(1, 0, &buf);
1102 int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
1103 if (maxThreadsPerPkg == 0) {
1104 maxThreadsPerPkg = 1;
1105 }
1106
1107 // The num cores per pkg comes from cpuid(4). 1 must be added to the encoded
1108 // value.
1109 //
    // The author of cpu_count.cpp treated this as only an upper bound on the
1111 // number of cores, but I haven't seen any cases where it was greater than
1112 // the actual number of cores, so we will treat it as exact in this block of
1113 // code.
1114 //
1115 // First, we need to check if cpuid(4) is supported on this chip. To see if
1116 // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n or
1117 // greater.
1118 __kmp_x86_cpuid(0, 0, &buf);
1119 if (buf.eax >= 4) {
1120 __kmp_x86_cpuid(4, 0, &buf);
1121 nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
1122 } else {
1123 nCoresPerPkg = 1;
1124 }
1125
1126 // There is no way to reliably tell if HT is enabled without issuing the
    // cpuid instruction from every thread and correlating the cpuid info, so
1128 // if the machine is not affinity capable, we assume that HT is off. We have
1129 // seen quite a few machines where maxThreadsPerPkg is 2, yet the machine
1130 // does not support HT.
1131 //
1132 // - Older OSes are usually found on machines with older chips, which do not
1133 // support HT.
1134 // - The performance penalty for mistakenly identifying a machine as HT when
1135 // it isn't (which results in blocktime being incorrectly set to 0) is
    // greater than the penalty for mistakenly identifying a machine as
1137 // being 1 thread/core when it is really HT enabled (which results in
1138 // blocktime being incorrectly set to a positive value).
1139 __kmp_ncores = __kmp_xproc;
1140 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1141 __kmp_nThreadsPerCore = 1;
1142 if (__kmp_affinity_verbose) {
1143 KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
1144 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1145 if (__kmp_affinity_uniform_topology()) {
1146 KMP_INFORM(Uniform, "KMP_AFFINITY");
1147 } else {
1148 KMP_INFORM(NonUniform, "KMP_AFFINITY");
1149 }
1150 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1151 __kmp_nThreadsPerCore, __kmp_ncores);
1152 }
1153 return 0;
1154 }
1155
1156 // From here on, we can assume that it is safe to call
1157 // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
1158 // __kmp_affinity_type = affinity_none.
1159
1160 // Save the affinity mask for the current thread.
1161 kmp_affin_mask_t *oldMask;
1162 KMP_CPU_ALLOC(oldMask);
1163 KMP_ASSERT(oldMask != NULL);
1164 __kmp_get_system_affinity(oldMask, TRUE);
1165
1166 // Run through each of the available contexts, binding the current thread
1167 // to it, and obtaining the pertinent information using the cpuid instr.
1168 //
1169 // The relevant information is:
1170 // - Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
  //   has a unique Apic Id, which is of the form pkg# : core# : thread#.
1172 // - Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1). The value
1173 // of this field determines the width of the core# + thread# fields in the
1174 // Apic Id. It is also an upper bound on the number of threads per
  //   package, but it has been verified that situations happen where it is not
1176 // exact. In particular, on certain OS/chip combinations where Intel(R)
1177 // Hyper-Threading Technology is supported by the chip but has been
1178 // disabled, the value of this field will be 2 (for a single core chip).
1179 // On other OS/chip combinations supporting Intel(R) Hyper-Threading
1180 // Technology, the value of this field will be 1 when Intel(R)
1181 // Hyper-Threading Technology is disabled and 2 when it is enabled.
1182 // - Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4). The value
1183 // of this field (+1) determines the width of the core# field in the Apic
1184 // Id. The comments in "cpucount.cpp" say that this value is an upper
1185 // bound, but the IA-32 architecture manual says that it is exactly the
1186 // number of cores per package, and I haven't seen any case where it
1187 // wasn't.
1188 //
1189 // From this information, deduce the package Id, core Id, and thread Id,
1190 // and set the corresponding fields in the apicThreadInfo struct.
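  //
  // Worked example (hypothetical values): if maxThreadsPerPkg = 4 and
  // maxCoresPerPkg = 2, then widthCT = 2, widthC = 1 and widthT = 1. An Apic
  // Id of 0b1011 decomposes as pkgId = 0b10 = 2, coreId = 1, threadId = 1,
  // matching the shift/mask arithmetic below.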
1191 unsigned i;
1192 apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
1193 __kmp_avail_proc * sizeof(apicThreadInfo));
1194 unsigned nApics = 0;
1195 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
1196 // Skip this proc if it is not included in the machine model.
1197 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
1198 continue;
1199 }
1200 KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);
1201
1202 __kmp_affinity_dispatch->bind_thread(i);
1203 threadInfo[nApics].osId = i;
1204
1205 // The apic id and max threads per pkg come from cpuid(1).
1206 __kmp_x86_cpuid(1, 0, &buf);
1207 if (((buf.edx >> 9) & 1) == 0) {
1208 __kmp_set_system_affinity(oldMask, TRUE);
1209 __kmp_free(threadInfo);
1210 KMP_CPU_FREE(oldMask);
1211 *msg_id = kmp_i18n_str_ApicNotPresent;
1212 return -1;
1213 }
1214 threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
1215 threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
1216 if (threadInfo[nApics].maxThreadsPerPkg == 0) {
1217 threadInfo[nApics].maxThreadsPerPkg = 1;
1218 }
1219
1220 // Max cores per pkg comes from cpuid(4). 1 must be added to the encoded
1221 // value.
1222 //
1223 // First, we need to check if cpuid(4) is supported on this chip. To see if
1224 // cpuid(n) is supported, issue cpuid(0) and check if eax has the value n
1225 // or greater.
1226 __kmp_x86_cpuid(0, 0, &buf);
1227 if (buf.eax >= 4) {
1228 __kmp_x86_cpuid(4, 0, &buf);
1229 threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
1230 } else {
1231 threadInfo[nApics].maxCoresPerPkg = 1;
1232 }
1233
1234 // Infer the pkgId / coreId / threadId using only the info obtained locally.
1235 int widthCT = __kmp_cpuid_mask_width(threadInfo[nApics].maxThreadsPerPkg);
1236 threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;
1237
1238 int widthC = __kmp_cpuid_mask_width(threadInfo[nApics].maxCoresPerPkg);
1239 int widthT = widthCT - widthC;
1240 if (widthT < 0) {
1241 // I've never seen this one happen, but I suppose it could, if the cpuid
1242 // instruction on a chip was really screwed up. Make sure to restore the
1243 // affinity mask before the tail call.
1244 __kmp_set_system_affinity(oldMask, TRUE);
1245 __kmp_free(threadInfo);
1246 KMP_CPU_FREE(oldMask);
1247 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1248 return -1;
1249 }
1250
1251 int maskC = (1 << widthC) - 1;
1252 threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) & maskC;
1253
1254 int maskT = (1 << widthT) - 1;
1255 threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;
1256
1257 nApics++;
1258 }
1259
1260 // We've collected all the info we need.
1261 // Restore the old affinity mask for this thread.
1262 __kmp_set_system_affinity(oldMask, TRUE);
1263
1264 // If there's only one thread context to bind to, form an Address object
1265 // with depth 1 and return immediately (or, if affinity is off, set
1266 // address2os to NULL and return).
1267 //
1268 // If it is configured to omit the package level when there is only a single
1269 // package, the logic at the end of this routine won't work if there is only
1270 // a single thread - it would try to form an Address object with depth 0.
1271 KMP_ASSERT(nApics > 0);
1272 if (nApics == 1) {
1273 __kmp_ncores = nPackages = 1;
1274 __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1275 if (__kmp_affinity_verbose) {
1276 char buf[KMP_AFFIN_MASK_PRINT_LEN];
1277 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1278
1279 KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1280 if (__kmp_affinity_respect_mask) {
1281 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1282 } else {
1283 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1284 }
1285 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1286 KMP_INFORM(Uniform, "KMP_AFFINITY");
1287 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1288 __kmp_nThreadsPerCore, __kmp_ncores);
1289 }
1290
1291 if (__kmp_affinity_type == affinity_none) {
1292 __kmp_free(threadInfo);
1293 KMP_CPU_FREE(oldMask);
1294 return 0;
1295 }
1296
1297 *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair));
1298 Address addr(1);
1299 addr.labels[0] = threadInfo[0].pkgId;
1300 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);
1301
1302 if (__kmp_affinity_gran_levels < 0) {
1303 __kmp_affinity_gran_levels = 0;
1304 }
1305
1306 if (__kmp_affinity_verbose) {
1307 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
1308 }
1309
1310 __kmp_free(threadInfo);
1311 KMP_CPU_FREE(oldMask);
1312 return 1;
1313 }
1314
1315 // Sort the threadInfo table by physical Id.
1316 qsort(threadInfo, nApics, sizeof(*threadInfo),
1317 __kmp_affinity_cmp_apicThreadInfo_phys_id);
1318
1319 // The table is now sorted by pkgId / coreId / threadId, but we really don't
1320 // know the radix of any of the fields. pkgId's may be sparsely assigned among
1321 // the chips on a system. Although coreId's are usually assigned
1322 // [0 .. coresPerPkg-1] and threadId's are usually assigned
1323 // [0..threadsPerCore-1], we don't want to make any such assumptions.
1324 //
1325 // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
1326 // total # packages) are at this point - we want to determine that now. We
1327 // only have an upper bound on the first two figures.
1328 //
1329 // We also perform a consistency check at this point: the values returned by
1330 // the cpuid instruction for any thread bound to a given package had better
1331 // return the same info for maxThreadsPerPkg and maxCoresPerPkg.
1332 nPackages = 1;
1333 nCoresPerPkg = 1;
1334 __kmp_nThreadsPerCore = 1;
1335 unsigned nCores = 1;
1336
1337 unsigned pkgCt = 1; // to determine radii
1338 unsigned lastPkgId = threadInfo[0].pkgId;
1339 unsigned coreCt = 1;
1340 unsigned lastCoreId = threadInfo[0].coreId;
1341 unsigned threadCt = 1;
1342 unsigned lastThreadId = threadInfo[0].threadId;
1343
  // intra-pkg consistency checks
1345 unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
1346 unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;
1347
1348 for (i = 1; i < nApics; i++) {
1349 if (threadInfo[i].pkgId != lastPkgId) {
1350 nCores++;
1351 pkgCt++;
1352 lastPkgId = threadInfo[i].pkgId;
1353 if ((int)coreCt > nCoresPerPkg)
1354 nCoresPerPkg = coreCt;
1355 coreCt = 1;
1356 lastCoreId = threadInfo[i].coreId;
1357 if ((int)threadCt > __kmp_nThreadsPerCore)
1358 __kmp_nThreadsPerCore = threadCt;
1359 threadCt = 1;
1360 lastThreadId = threadInfo[i].threadId;
1361
1362 // This is a different package, so go on to the next iteration without
1363 // doing any consistency checks. Reset the consistency check vars, though.
1364 prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
1365 prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
1366 continue;
1367 }
1368
1369 if (threadInfo[i].coreId != lastCoreId) {
1370 nCores++;
1371 coreCt++;
1372 lastCoreId = threadInfo[i].coreId;
1373 if ((int)threadCt > __kmp_nThreadsPerCore)
1374 __kmp_nThreadsPerCore = threadCt;
1375 threadCt = 1;
1376 lastThreadId = threadInfo[i].threadId;
1377 } else if (threadInfo[i].threadId != lastThreadId) {
1378 threadCt++;
1379 lastThreadId = threadInfo[i].threadId;
1380 } else {
1381 __kmp_free(threadInfo);
1382 KMP_CPU_FREE(oldMask);
1383 *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
1384 return -1;
1385 }
1386
1387 // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
  // fields agree for all the threads bound to a given package.
1389 if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) ||
1390 (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
1391 __kmp_free(threadInfo);
1392 KMP_CPU_FREE(oldMask);
1393 *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1394 return -1;
1395 }
1396 }
1397 nPackages = pkgCt;
1398 if ((int)coreCt > nCoresPerPkg)
1399 nCoresPerPkg = coreCt;
1400 if ((int)threadCt > __kmp_nThreadsPerCore)
1401 __kmp_nThreadsPerCore = threadCt;
1402
1403 // When affinity is off, this routine will still be called to set
1404 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
1405 // Make sure all these vars are set correctly, and return now if affinity is
1406 // not enabled.
1407 __kmp_ncores = nCores;
1408 if (__kmp_affinity_verbose) {
1409 char buf[KMP_AFFIN_MASK_PRINT_LEN];
1410 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1411
1412 KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
1413 if (__kmp_affinity_respect_mask) {
1414 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1415 } else {
1416 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1417 }
1418 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1419 if (__kmp_affinity_uniform_topology()) {
1420 KMP_INFORM(Uniform, "KMP_AFFINITY");
1421 } else {
1422 KMP_INFORM(NonUniform, "KMP_AFFINITY");
1423 }
1424 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1425 __kmp_nThreadsPerCore, __kmp_ncores);
1426 }
1427 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
1428 KMP_DEBUG_ASSERT(nApics == (unsigned)__kmp_avail_proc);
1429 __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
1430 for (i = 0; i < nApics; ++i) {
1431 __kmp_pu_os_idx[i] = threadInfo[i].osId;
1432 }
1433 if (__kmp_affinity_type == affinity_none) {
1434 __kmp_free(threadInfo);
1435 KMP_CPU_FREE(oldMask);
1436 return 0;
1437 }
1438
1439 // Now that we've determined the number of packages, the number of cores per
1440 // package, and the number of threads per core, we can construct the data
1441 // structure that is to be returned.
1442 int pkgLevel = 0;
1443 int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
1444 int threadLevel =
1445 (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
1446 unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
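  // Illustrative example: a multi-core machine with Hyper-Threading gets
  // pkgLevel = 0, coreLevel = 1, threadLevel = 2 and depth = 3, while a
  // single-core machine without Hyper-Threading collapses to depth = 1 with
  // only the package level modeled.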
1447
1448 KMP_ASSERT(depth > 0);
1449 *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
1450
1451 for (i = 0; i < nApics; ++i) {
1452 Address addr(depth);
1453 unsigned os = threadInfo[i].osId;
1454 int d = 0;
1455
1456 if (pkgLevel >= 0) {
1457 addr.labels[d++] = threadInfo[i].pkgId;
1458 }
1459 if (coreLevel >= 0) {
1460 addr.labels[d++] = threadInfo[i].coreId;
1461 }
1462 if (threadLevel >= 0) {
1463 addr.labels[d++] = threadInfo[i].threadId;
1464 }
1465 (*address2os)[i] = AddrUnsPair(addr, os);
1466 }
1467
1468 if (__kmp_affinity_gran_levels < 0) {
1469 // Set the granularity level based on what levels are modeled in the machine
1470 // topology map.
1471 __kmp_affinity_gran_levels = 0;
1472 if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1473 __kmp_affinity_gran_levels++;
1474 }
1475 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1476 __kmp_affinity_gran_levels++;
1477 }
1478 if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
1479 __kmp_affinity_gran_levels++;
1480 }
1481 }
1482
1483 if (__kmp_affinity_verbose) {
1484 __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
1485 coreLevel, threadLevel);
1486 }
1487
1488 __kmp_free(threadInfo);
1489 KMP_CPU_FREE(oldMask);
1490 return depth;
1491 }
1492
1493 // Intel(R) microarchitectures code named Nehalem, Dunnington and later
1494 // support a newer interface for reporting the x2APIC Ids,
1495 // based on cpuid leaf 11.
1496 static int __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
1497 kmp_i18n_id_t *const msg_id) {
1498 kmp_cpuid buf;
1499 *address2os = NULL;
1500 *msg_id = kmp_i18n_null;
1501
1502 // Check to see if cpuid leaf 11 is supported.
1503 __kmp_x86_cpuid(0, 0, &buf);
1504 if (buf.eax < 11) {
1505 *msg_id = kmp_i18n_str_NoLeaf11Support;
1506 return -1;
1507 }
1508 __kmp_x86_cpuid(11, 0, &buf);
1509 if (buf.ebx == 0) {
1510 *msg_id = kmp_i18n_str_NoLeaf11Support;
1511 return -1;
1512 }
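// Reminder of the cpuid leaf 11 sub-leaf layout as consumed below:
// EAX[4:0] = right-shift count to strip this level from the x2APIC ID,
// EBX[15:0] = number of logical processors at this level,
// ECX[15:8] = level type (1 = SMT, 2 = core), EDX = this proc's x2APIC ID.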
1513
1514 // Find the number of levels in the machine topology. While we're at it, get
1515 // the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will try to
1516 // get more accurate values later by explicitly counting them, but get
1517 // reasonable defaults now, in case we return early.
1518 int level;
1519 int threadLevel = -1;
1520 int coreLevel = -1;
1521 int pkgLevel = -1;
1522 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
1523
1524 for (level = 0;; level++) {
1525 if (level > 31) {
1526 // FIXME: Hack for DPD200163180
1527 //
1528 // If level is big then something went wrong -> exiting
1529 //
1530 // There could actually be 32 valid levels in the machine topology, but so
1531 // far, the only machine we have seen which does not exit this loop before
1532 // iteration 32 has fubar x2APIC settings.
1533 //
1534 // For now, just reject this case based upon loop trip count.
1535 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1536 return -1;
1537 }
1538 __kmp_x86_cpuid(11, level, &buf);
1539 if (buf.ebx == 0) {
1540 if (pkgLevel < 0) {
1541 // Will infer nPackages from __kmp_xproc
1542 pkgLevel = level;
1543 level++;
1544 }
1545 break;
1546 }
1547 int kind = (buf.ecx >> 8) & 0xff;
1548 if (kind == 1) {
1549 // SMT level
1550 threadLevel = level;
1551 coreLevel = -1;
1552 pkgLevel = -1;
1553 __kmp_nThreadsPerCore = buf.ebx & 0xffff;
1554 if (__kmp_nThreadsPerCore == 0) {
1555 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1556 return -1;
1557 }
1558 } else if (kind == 2) {
1559 // core level
1560 coreLevel = level;
1561 pkgLevel = -1;
1562 nCoresPerPkg = buf.ebx & 0xffff;
1563 if (nCoresPerPkg == 0) {
1564 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1565 return -1;
1566 }
1567 } else {
1568 if (level <= 0) {
1569 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1570 return -1;
1571 }
1572 if (pkgLevel >= 0) {
1573 continue;
1574 }
1575 pkgLevel = level;
1576 nPackages = buf.ebx & 0xffff;
1577 if (nPackages == 0) {
1578 *msg_id = kmp_i18n_str_InvalidCpuidInfo;
1579 return -1;
1580 }
1581 }
1582 }
1583 int depth = level;
1584
1585 // In the above loop, "level" was counted from the finest level (usually
1586 // thread) to the coarsest. The caller expects that we will place the labels
1587 // in (*address2os)[].first.labels[] in the inverse order, so we need to
1588 // invert the vars saying which level means what.
1589 if (threadLevel >= 0) {
1590 threadLevel = depth - threadLevel - 1;
1591 }
1592 if (coreLevel >= 0) {
1593 coreLevel = depth - coreLevel - 1;
1594 }
1595 KMP_DEBUG_ASSERT(pkgLevel >= 0);
1596 pkgLevel = depth - pkgLevel - 1;
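// Worked example (hypothetical): if leaf 11 reported SMT at sub-leaf 0 and
// core at sub-leaf 1, and sub-leaf 2 returned ebx == 0, then depth == 3 and
// after inversion threadLevel == 2, coreLevel == 1, pkgLevel == 0, matching
// the coarsest-to-finest label order used below.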
1597
1598 // The algorithm used starts by binding the current thread to each available
1599 // OS proc and retrieving info from the cpuid instruction, so if we are not
1600 // capable of calling __kmp_get_system_affinity() and
1601 // __kmp_set_system_affinity(), then we need to do something else - use the
1602 // defaults that we calculated from issuing cpuid without binding to each proc.
1603 if (!KMP_AFFINITY_CAPABLE()) {
1604 // Hack to try and infer the machine topology using only the data
1605 // available from cpuid on the current thread, and __kmp_xproc.
1606 KMP_ASSERT(__kmp_affinity_type == affinity_none);
1607
1608 __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
1609 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
1610 if (__kmp_affinity_verbose) {
1611 KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
1612 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1613 if (__kmp_affinity_uniform_topology()) {
1614 KMP_INFORM(Uniform, "KMP_AFFINITY");
1615 } else {
1616 KMP_INFORM(NonUniform, "KMP_AFFINITY");
1617 }
1618 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1619 __kmp_nThreadsPerCore, __kmp_ncores);
1620 }
1621 return 0;
1622 }
1623
1624 // From here on, we can assume that it is safe to call
1625 // __kmp_get_system_affinity() and __kmp_set_system_affinity(), even if
1626 // __kmp_affinity_type = affinity_none.
1627
1628 // Save the affinity mask for the current thread.
1629 kmp_affin_mask_t *oldMask;
1630 KMP_CPU_ALLOC(oldMask);
1631 __kmp_get_system_affinity(oldMask, TRUE);
1632
1633 // Allocate the data structure to be returned.
1634 AddrUnsPair *retval =
1635 (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);
1636
1637 // Run through each of the available contexts, binding the current thread
1638 // to it, and obtaining the pertinent information using the cpuid instr.
1639 unsigned int proc;
1640 int nApics = 0;
1641 KMP_CPU_SET_ITERATE(proc, __kmp_affin_fullMask) {
1642 // Skip this proc if it is not included in the machine model.
1643 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
1644 continue;
1645 }
1646 KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);
1647
1648 __kmp_affinity_dispatch->bind_thread(proc);
1649
1650 // Extract labels for each level in the machine topology map from Apic ID.
1651 Address addr(depth);
1652 int prev_shift = 0;
1653
1654 for (level = 0; level < depth; level++) {
1655 __kmp_x86_cpuid(11, level, &buf);
1656 unsigned apicId = buf.edx;
1657 if (buf.ebx == 0) {
1658 if (level != depth - 1) {
1659 KMP_CPU_FREE(oldMask);
1660 *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1661 return -1;
1662 }
1663 addr.labels[depth - level - 1] = apicId >> prev_shift;
1664 level++;
1665 break;
1666 }
1667 int shift = buf.eax & 0x1f;
1668 int mask = (1 << shift) - 1;
1669 addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
1670 prev_shift = shift;
1671 }
1672 if (level != depth) {
1673 KMP_CPU_FREE(oldMask);
1674 *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
1675 return -1;
1676 }
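// Decoding sketch (hypothetical shift widths): with an SMT shift of 1 and a
// cumulative core+SMT shift of 5, an x2APIC ID of 45 (0b101101) yields
// thread = 45 & 0x1 = 1, core = (45 & 0x1f) >> 1 = 6, and package =
// 45 >> 5 = 1, stored coarsest-first in addr.labels[].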
1677
1678 retval[nApics] = AddrUnsPair(addr, proc);
1679 nApics++;
1680 }
1681
1682 // We've collected all the info we need.
1683 // Restore the old affinity mask for this thread.
1684 __kmp_set_system_affinity(oldMask, TRUE);
1685
1686 // If there's only one thread context to bind to, return now.
1687 KMP_ASSERT(nApics > 0);
1688 if (nApics == 1) {
1689 __kmp_ncores = nPackages = 1;
1690 __kmp_nThreadsPerCore = nCoresPerPkg = 1;
1691 if (__kmp_affinity_verbose) {
1692 char buf[KMP_AFFIN_MASK_PRINT_LEN];
1693 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1694
1695 KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1696 if (__kmp_affinity_respect_mask) {
1697 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
1698 } else {
1699 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
1700 }
1701 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1702 KMP_INFORM(Uniform, "KMP_AFFINITY");
1703 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
1704 __kmp_nThreadsPerCore, __kmp_ncores);
1705 }
1706
1707 if (__kmp_affinity_type == affinity_none) {
1708 __kmp_free(retval);
1709 KMP_CPU_FREE(oldMask);
1710 return 0;
1711 }
1712
1713 // Form an Address object which only includes the package level.
1714 Address addr(1);
1715 addr.labels[0] = retval[0].first.labels[pkgLevel];
1716 retval[0].first = addr;
1717
1718 if (__kmp_affinity_gran_levels < 0) {
1719 __kmp_affinity_gran_levels = 0;
1720 }
1721
1722 if (__kmp_affinity_verbose) {
1723 __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
1724 }
1725
1726 *address2os = retval;
1727 KMP_CPU_FREE(oldMask);
1728 return 1;
1729 }
1730
1731 // Sort the table by physical Id.
1732 qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);
1733
1734 // Find the radix at each of the levels.
1735 unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1736 unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1737 unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1738 unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
1739 for (level = 0; level < depth; level++) {
1740 totals[level] = 1;
1741 maxCt[level] = 1;
1742 counts[level] = 1;
1743 last[level] = retval[0].first.labels[level];
1744 }
1745
1746 // From here on, the iteration variable "level" runs from the coarsest level
1747 // to the finest, i.e. we iterate forward through
1748 // (*address2os)[].first.labels[] - in the previous loops, we iterated
1749 // backwards.
1750 for (proc = 1; (int)proc < nApics; proc++) {
1751 int level;
1752 for (level = 0; level < depth; level++) {
1753 if (retval[proc].first.labels[level] != last[level]) {
1754 int j;
1755 for (j = level + 1; j < depth; j++) {
1756 totals[j]++;
1757 counts[j] = 1;
1758 // The line below causes incorrect topology information to be printed
1759 // when the max value for some level (maxCt[level]) is encountered
1760 // earlier than a smaller value while walking the array. For
1761 // example, suppose pkg0 has 4 cores and pkg1 has 2 cores. Then
1762 // maxCt[1] == 2
1763 // whereas it should be 4.
1764 // TODO!!! Check if it can be commented safely
1765 // maxCt[j] = 1;
1766 last[j] = retval[proc].first.labels[j];
1767 }
1768 totals[level]++;
1769 counts[level]++;
1770 if (counts[level] > maxCt[level]) {
1771 maxCt[level] = counts[level];
1772 }
1773 last[level] = retval[proc].first.labels[level];
1774 break;
1775 } else if (level == depth - 1) {
1776 __kmp_free(last);
1777 __kmp_free(maxCt);
1778 __kmp_free(counts);
1779 __kmp_free(totals);
1780 __kmp_free(retval);
1781 KMP_CPU_FREE(oldMask);
1782 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
1783 return -1;
1784 }
1785 }
1786 }
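// Counting example (uniform hypothetical box, 2 pkgs x 2 cores x 2 threads):
// after this loop totals == {2, 4, 8} and maxCt == {2, 2, 2}, so below we get
// nPackages = 2, __kmp_ncores = 4, nCoresPerPkg = 2 and
// __kmp_nThreadsPerCore = 2.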
1787
1788 // When affinity is off, this routine will still be called to set
1789 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
1790 // Make sure all these vars are set correctly, and return if affinity is not
1791 // enabled.
1792 if (threadLevel >= 0) {
1793 __kmp_nThreadsPerCore = maxCt[threadLevel];
1794 } else {
1795 __kmp_nThreadsPerCore = 1;
1796 }
1797 nPackages = totals[pkgLevel];
1798
1799 if (coreLevel >= 0) {
1800 __kmp_ncores = totals[coreLevel];
1801 nCoresPerPkg = maxCt[coreLevel];
1802 } else {
1803 __kmp_ncores = nPackages;
1804 nCoresPerPkg = 1;
1805 }
1806
1807 // Check to see if the machine topology is uniform
1808 unsigned prod = maxCt[0];
1809 for (level = 1; level < depth; level++) {
1810 prod *= maxCt[level];
1811 }
1812 bool uniform = (prod == totals[level - 1]);
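// E.g. (hypothetical): 2 packages with 4 and 2 cores respectively -> the
// maxCt product over-counts (2 x 4 x threads/core) relative to the number of
// entries actually seen (6 x threads/core), so uniform == false.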
1813
1814 // Print the machine topology summary.
1815 if (__kmp_affinity_verbose) {
1816 char mask[KMP_AFFIN_MASK_PRINT_LEN];
1817 __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);
1818
1819 KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
1820 if (__kmp_affinity_respect_mask) {
1821 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
1822 } else {
1823 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
1824 }
1825 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
1826 if (uniform) {
1827 KMP_INFORM(Uniform, "KMP_AFFINITY");
1828 } else {
1829 KMP_INFORM(NonUniform, "KMP_AFFINITY");
1830 }
1831
1832 kmp_str_buf_t buf;
1833 __kmp_str_buf_init(&buf);
1834
1835 __kmp_str_buf_print(&buf, "%d", totals[0]);
1836 for (level = 1; level <= pkgLevel; level++) {
1837 __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
1838 }
1839 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
1840 __kmp_nThreadsPerCore, __kmp_ncores);
1841
1842 __kmp_str_buf_free(&buf);
1843 }
1844 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
1845 KMP_DEBUG_ASSERT(nApics == __kmp_avail_proc);
1846 __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
1847 for (proc = 0; (int)proc < nApics; ++proc) {
1848 __kmp_pu_os_idx[proc] = retval[proc].second;
1849 }
1850 if (__kmp_affinity_type == affinity_none) {
1851 __kmp_free(last);
1852 __kmp_free(maxCt);
1853 __kmp_free(counts);
1854 __kmp_free(totals);
1855 __kmp_free(retval);
1856 KMP_CPU_FREE(oldMask);
1857 return 0;
1858 }
1859
1860 // Find any levels with radix 1, and remove them from the map
1861 // (except for the package level).
1862 int new_depth = 0;
1863 for (level = 0; level < depth; level++) {
1864 if ((maxCt[level] == 1) && (level != pkgLevel)) {
1865 continue;
1866 }
1867 new_depth++;
1868 }
1869
1870 // If we are removing any levels, allocate a new vector to return,
1871 // and copy the relevant information to it.
1872 if (new_depth != depth) {
1873 AddrUnsPair *new_retval =
1874 (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * nApics);
1875 for (proc = 0; (int)proc < nApics; proc++) {
1876 Address addr(new_depth);
1877 new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
1878 }
1879 int new_level = 0;
1880 int newPkgLevel = -1;
1881 int newCoreLevel = -1;
1882 int newThreadLevel = -1;
1883 for (level = 0; level < depth; level++) {
1884 if ((maxCt[level] == 1) && (level != pkgLevel)) {
1885 // Remove this level. Never remove the package level
1886 continue;
1887 }
1888 if (level == pkgLevel) {
1889 newPkgLevel = new_level;
1890 }
1891 if (level == coreLevel) {
1892 newCoreLevel = new_level;
1893 }
1894 if (level == threadLevel) {
1895 newThreadLevel = new_level;
1896 }
1897 for (proc = 0; (int)proc < nApics; proc++) {
1898 new_retval[proc].first.labels[new_level] =
1899 retval[proc].first.labels[level];
1900 }
1901 new_level++;
1902 }
1903
1904 __kmp_free(retval);
1905 retval = new_retval;
1906 depth = new_depth;
1907 pkgLevel = newPkgLevel;
1908 coreLevel = newCoreLevel;
1909 threadLevel = newThreadLevel;
1910 }
1911
1912 if (__kmp_affinity_gran_levels < 0) {
1913 // Set the granularity level based on what levels are modeled
1914 // in the machine topology map.
1915 __kmp_affinity_gran_levels = 0;
1916 if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
1917 __kmp_affinity_gran_levels++;
1918 }
1919 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
1920 __kmp_affinity_gran_levels++;
1921 }
1922 if (__kmp_affinity_gran > affinity_gran_package) {
1923 __kmp_affinity_gran_levels++;
1924 }
1925 }
1926
1927 if (__kmp_affinity_verbose) {
1928 __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel, coreLevel,
1929 threadLevel);
1930 }
1931
1932 __kmp_free(last);
1933 __kmp_free(maxCt);
1934 __kmp_free(counts);
1935 __kmp_free(totals);
1936 KMP_CPU_FREE(oldMask);
1937 *address2os = retval;
1938 return depth;
1939 }
1940
1941 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1942
1943 #define osIdIndex 0
1944 #define threadIdIndex 1
1945 #define coreIdIndex 2
1946 #define pkgIdIndex 3
1947 #define nodeIdIndex 4
1948
1949 typedef unsigned *ProcCpuInfo;
1950 static unsigned maxIndex = pkgIdIndex;
1951
1952 static int __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a,
1953 const void *b) {
1954 unsigned i;
1955 const unsigned *aa = *(unsigned *const *)a;
1956 const unsigned *bb = *(unsigned *const *)b;
1957 for (i = maxIndex;; i--) {
1958 if (aa[i] < bb[i])
1959 return -1;
1960 if (aa[i] > bb[i])
1961 return 1;
1962 if (i == osIdIndex)
1963 break;
1964 }
1965 return 0;
1966 }
1967
1968 #if KMP_USE_HIER_SCHED
1969 // Set the array sizes for the hierarchy layers
1970 static void __kmp_dispatch_set_hierarchy_values() {
1971 // Set the maximum number of L1's to the number of cores
1972 // Set the maximum number of L2's to either the number of cores / 2 for
1973 // Intel(R) Xeon Phi(TM) coprocessors formerly codenamed Knights Landing,
1974 // or the number of cores for Intel(R) Xeon(R) processors
1975 // Set the maximum number of NUMA nodes and L3's to the number of packages
1976 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1] =
1977 nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
1978 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L1 + 1] = __kmp_ncores;
1979 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \
1980 KMP_MIC_SUPPORTED
1981 if (__kmp_mic_type >= mic3)
1982 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores / 2;
1983 else
1984 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && KMP_MIC_SUPPORTED
1985 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L2 + 1] = __kmp_ncores;
1986 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_L3 + 1] = nPackages;
1987 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_NUMA + 1] = nPackages;
1988 __kmp_hier_max_units[kmp_hier_layer_e::LAYER_LOOP + 1] = 1;
1989 // Set the number of threads per unit
1990 // Number of hardware threads per L1/L2/L3/NUMA/LOOP
1991 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_THREAD + 1] = 1;
1992 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L1 + 1] =
1993 __kmp_nThreadsPerCore;
1994 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && \
1995 KMP_MIC_SUPPORTED
1996 if (__kmp_mic_type >= mic3)
1997 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
1998 2 * __kmp_nThreadsPerCore;
1999 else
2000 #endif // KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS) && KMP_MIC_SUPPORTED
2001 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L2 + 1] =
2002 __kmp_nThreadsPerCore;
2003 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_L3 + 1] =
2004 nCoresPerPkg * __kmp_nThreadsPerCore;
2005 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_NUMA + 1] =
2006 nCoresPerPkg * __kmp_nThreadsPerCore;
2007 __kmp_hier_threads_per[kmp_hier_layer_e::LAYER_LOOP + 1] =
2008 nPackages * nCoresPerPkg * __kmp_nThreadsPerCore;
2009 }
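// Illustrative values (hypothetical KNL-like machine, __kmp_mic_type >= mic3,
// 1 package x 64 cores x 4 threads/core): max units = 256 HW threads, 64 L1s,
// 32 L2s, 1 L3/NUMA; threads per THREAD/L1/L2/L3/NUMA unit = 1, 4, 8, 256,
// 256 respectively.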
2010
2011 // Return the index into the hierarchy for this tid and layer type (L1, L2, etc)
2012 // i.e., this thread's L1 or this thread's L2, etc.
2013 int __kmp_dispatch_get_index(int tid, kmp_hier_layer_e type) {
2014 int index = type + 1;
2015 int num_hw_threads = __kmp_hier_max_units[kmp_hier_layer_e::LAYER_THREAD + 1];
2016 KMP_DEBUG_ASSERT(type != kmp_hier_layer_e::LAYER_LAST);
2017 if (type == kmp_hier_layer_e::LAYER_THREAD)
2018 return tid;
2019 else if (type == kmp_hier_layer_e::LAYER_LOOP)
2020 return 0;
2021 KMP_DEBUG_ASSERT(__kmp_hier_max_units[index] != 0);
2022 if (tid >= num_hw_threads)
2023 tid = tid % num_hw_threads;
2024 return (tid / __kmp_hier_threads_per[index]) % __kmp_hier_max_units[index];
2025 }
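// Usage sketch (with the hypothetical KNL-like values above): for tid = 37,
// __kmp_dispatch_get_index(37, LAYER_L1) == (37 / 4) % 64 == 9 and
// __kmp_dispatch_get_index(37, LAYER_L2) == (37 / 8) % 32 == 4.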
2026
2027 // Return the number of t1's per t2
2028 int __kmp_dispatch_get_t1_per_t2(kmp_hier_layer_e t1, kmp_hier_layer_e t2) {
2029 int i1 = t1 + 1;
2030 int i2 = t2 + 1;
2031 KMP_DEBUG_ASSERT(i1 <= i2);
2032 KMP_DEBUG_ASSERT(t1 != kmp_hier_layer_e::LAYER_LAST);
2033 KMP_DEBUG_ASSERT(t2 != kmp_hier_layer_e::LAYER_LAST);
2034 KMP_DEBUG_ASSERT(__kmp_hier_threads_per[i1] != 0);
2035 // (nthreads/t2) / (nthreads/t1) = t1 / t2
2036 return __kmp_hier_threads_per[i2] / __kmp_hier_threads_per[i1];
2037 }
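// E.g. (same hypothetical values): __kmp_dispatch_get_t1_per_t2(LAYER_L1,
// LAYER_L2) == 8 / 4 == 2 cores per L2 tile.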
2038 #endif // KMP_USE_HIER_SCHED
2039
2040 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
2041 // affinity map.
2042 static int __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os,
2043 int *line,
2044 kmp_i18n_id_t *const msg_id,
2045 FILE *f) {
2046 *address2os = NULL;
2047 *msg_id = kmp_i18n_null;
2048
2049 // Scan the file once, counting the number of "processor" (osId) fields,
2050 // and finding the highest value of <n> for any node_<n> field.
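// A typical x86 Linux record looks roughly like this (values hypothetical):
//   processor   : 12
//   physical id : 1
//   core id     : 6
// The "thread id" and "node_<n> id" fields handled below are normally only
// present in an alternate cpuinfo-format file, not in stock /proc/cpuinfo.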
2051 char buf[256];
2052 unsigned num_records = 0;
2053 while (!feof(f)) {
2054 buf[sizeof(buf) - 1] = 1;
2055 if (!fgets(buf, sizeof(buf), f)) {
2056 // Read errors presumably because of EOF
2057 break;
2058 }
2059
2060 char s1[] = "processor";
2061 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2062 num_records++;
2063 continue;
2064 }
2065
2066 // FIXME - this will match "node_<n> <garbage>"
2067 unsigned level;
2068 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
2069 if (nodeIdIndex + level >= maxIndex) {
2070 maxIndex = nodeIdIndex + level;
2071 }
2072 continue;
2073 }
2074 }
2075
2076 // Check for empty file / no valid processor records, or too many. The number
2077 // of records can't exceed the number of valid bits in the affinity mask.
2078 if (num_records == 0) {
2079 *line = 0;
2080 *msg_id = kmp_i18n_str_NoProcRecords;
2081 return -1;
2082 }
2083 if (num_records > (unsigned)__kmp_xproc) {
2084 *line = 0;
2085 *msg_id = kmp_i18n_str_TooManyProcRecords;
2086 return -1;
2087 }
2088
2089 // Set the file pointer back to the beginning, so that we can scan the file
2090 // again, this time performing a full parse of the data. Allocate a vector of
2091 // ProcCpuInfo objects, where we will place the data. Adding an extra element
2092 // at the end allows us to remove a lot of extra checks for termination
2093 // conditions.
2094 if (fseek(f, 0, SEEK_SET) != 0) {
2095 *line = 0;
2096 *msg_id = kmp_i18n_str_CantRewindCpuinfo;
2097 return -1;
2098 }
2099
2100 // Allocate the array of records to store the proc info in. The dummy
2101 // element at the end makes the logic in filling them out easier to code.
2102 unsigned **threadInfo =
2103 (unsigned **)__kmp_allocate((num_records + 1) * sizeof(unsigned *));
2104 unsigned i;
2105 for (i = 0; i <= num_records; i++) {
2106 threadInfo[i] =
2107 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2108 }
2109
2110 #define CLEANUP_THREAD_INFO \
2111 for (i = 0; i <= num_records; i++) { \
2112 __kmp_free(threadInfo[i]); \
2113 } \
2114 __kmp_free(threadInfo);
2115
2116 // A value of UINT_MAX means that we didn't find the field
2117 unsigned __index;
2118
2119 #define INIT_PROC_INFO(p) \
2120 for (__index = 0; __index <= maxIndex; __index++) { \
2121 (p)[__index] = UINT_MAX; \
2122 }
2123
2124 for (i = 0; i <= num_records; i++) {
2125 INIT_PROC_INFO(threadInfo[i]);
2126 }
2127
2128 unsigned num_avail = 0;
2129 *line = 0;
2130 while (!feof(f)) {
2131 // Create an inner scoping level, so that all the goto targets at the end of
2132 // the loop appear in an outer scoping level. This avoids warnings about
2133 // jumping past an initialization to a target in the same block.
2134 {
2135 buf[sizeof(buf) - 1] = 1;
2136 bool long_line = false;
2137 if (!fgets(buf, sizeof(buf), f)) {
2138 // Read errors presumably because of EOF
2139 // If there is valid data in threadInfo[num_avail], then fake
2140 // a blank line to ensure that the last address gets parsed.
2141 bool valid = false;
2142 for (i = 0; i <= maxIndex; i++) {
2143 if (threadInfo[num_avail][i] != UINT_MAX) {
2144 valid = true;
2145 }
2146 }
2147 if (!valid) {
2148 break;
2149 }
2150 buf[0] = 0;
2151 } else if (!buf[sizeof(buf) - 1]) {
2152 // The line is longer than the buffer. Set a flag and don't
2153 // emit an error if we were going to ignore the line, anyway.
2154 long_line = true;
2155
2156 #define CHECK_LINE \
2157 if (long_line) { \
2158 CLEANUP_THREAD_INFO; \
2159 *msg_id = kmp_i18n_str_LongLineCpuinfo; \
2160 return -1; \
2161 }
2162 }
2163 (*line)++;
2164
2165 char s1[] = "processor";
2166 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
2167 CHECK_LINE;
2168 char *p = strchr(buf + sizeof(s1) - 1, ':');
2169 unsigned val;
2170 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2171 goto no_val;
2172 if (threadInfo[num_avail][osIdIndex] != UINT_MAX)
2173 #if KMP_ARCH_AARCH64
2174 // Handle the old AArch64 /proc/cpuinfo layout differently:
2175 // it lists all of the 'processor' entries in a
2176 // single 'Processor' section, so the normal check
2177 // for duplicate fields would always trigger here.
2178 num_avail++;
2179 #else
2180 goto dup_field;
2181 #endif
2182 threadInfo[num_avail][osIdIndex] = val;
2183 #if KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
2184 char path[256];
2185 KMP_SNPRINTF(
2186 path, sizeof(path),
2187 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
2188 threadInfo[num_avail][osIdIndex]);
2189 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]);
2190
2191 KMP_SNPRINTF(path, sizeof(path),
2192 "/sys/devices/system/cpu/cpu%u/topology/core_id",
2193 threadInfo[num_avail][osIdIndex]);
2194 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]);
2195 continue;
2196 #else
2197 }
2198 char s2[] = "physical id";
2199 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) {
2200 CHECK_LINE;
2201 char *p = strchr(buf + sizeof(s2) - 1, ':');
2202 unsigned val;
2203 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2204 goto no_val;
2205 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX)
2206 goto dup_field;
2207 threadInfo[num_avail][pkgIdIndex] = val;
2208 continue;
2209 }
2210 char s3[] = "core id";
2211 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) {
2212 CHECK_LINE;
2213 char *p = strchr(buf + sizeof(s3) - 1, ':');
2214 unsigned val;
2215 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2216 goto no_val;
2217 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX)
2218 goto dup_field;
2219 threadInfo[num_avail][coreIdIndex] = val;
2220 continue;
2221 #endif // KMP_OS_LINUX && !(KMP_ARCH_X86 || KMP_ARCH_X86_64)
2222 }
2223 char s4[] = "thread id";
2224 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) {
2225 CHECK_LINE;
2226 char *p = strchr(buf + sizeof(s4) - 1, ':');
2227 unsigned val;
2228 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2229 goto no_val;
2230 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX)
2231 goto dup_field;
2232 threadInfo[num_avail][threadIdIndex] = val;
2233 continue;
2234 }
2235 unsigned level;
2236 if (KMP_SSCANF(buf, "node_%u id", &level) == 1) {
2237 CHECK_LINE;
2238 char *p = strchr(buf + sizeof(s4) - 1, ':');
2239 unsigned val;
2240 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1))
2241 goto no_val;
2242 KMP_ASSERT(nodeIdIndex + level <= maxIndex);
2243 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX)
2244 goto dup_field;
2245 threadInfo[num_avail][nodeIdIndex + level] = val;
2246 continue;
2247 }
2248
2249 // We didn't recognize the leading token on the line. There are lots of
2250 // leading tokens that we don't recognize - if the line isn't empty, go on
2251 // to the next line.
2252 if ((*buf != 0) && (*buf != '\n')) {
2253 // If the line is longer than the buffer, read characters
2254 // until we find a newline.
2255 if (long_line) {
2256 int ch;
2257 while (((ch = fgetc(f)) != EOF) && (ch != '\n'))
2258 ;
2259 }
2260 continue;
2261 }
2262
2263 // A newline has signalled the end of the processor record.
2264 // Check that there aren't too many procs specified.
2265 if ((int)num_avail == __kmp_xproc) {
2266 CLEANUP_THREAD_INFO;
2267 *msg_id = kmp_i18n_str_TooManyEntries;
2268 return -1;
2269 }
2270
2271 // Check for missing fields. The osId field must be there, and we
2272 // currently require that the physical id field is specified, also.
2273 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) {
2274 CLEANUP_THREAD_INFO;
2275 *msg_id = kmp_i18n_str_MissingProcField;
2276 return -1;
2277 }
2278 if (threadInfo[0][pkgIdIndex] == UINT_MAX) {
2279 CLEANUP_THREAD_INFO;
2280 *msg_id = kmp_i18n_str_MissingPhysicalIDField;
2281 return -1;
2282 }
2283
2284 // Skip this proc if it is not included in the machine model.
2285 if (!KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex],
2286 __kmp_affin_fullMask)) {
2287 INIT_PROC_INFO(threadInfo[num_avail]);
2288 continue;
2289 }
2290
2291 // We have a successful parse of this proc's info.
2292 // Increment the counter, and prepare for the next proc.
2293 num_avail++;
2294 KMP_ASSERT(num_avail <= num_records);
2295 INIT_PROC_INFO(threadInfo[num_avail]);
2296 }
2297 continue;
2298
2299 no_val:
2300 CLEANUP_THREAD_INFO;
2301 *msg_id = kmp_i18n_str_MissingValCpuinfo;
2302 return -1;
2303
2304 dup_field:
2305 CLEANUP_THREAD_INFO;
2306 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo;
2307 return -1;
2308 }
2309 *line = 0;
2310
2311 #if KMP_MIC && REDUCE_TEAM_SIZE
2312 unsigned teamSize = 0;
2313 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2314
2315 // check for num_records == __kmp_xproc ???
2316
2317 // If there's only one thread context to bind to, form an Address object with
2318 // depth 1 and return immediately (or, if affinity is off, set address2os to
2319 // NULL and return).
2320 //
2321 // If it is configured to omit the package level when there is only a single
2322 // package, the logic at the end of this routine won't work if there is only a
2323 // single thread - it would try to form an Address object with depth 0.
2324 KMP_ASSERT(num_avail > 0);
2325 KMP_ASSERT(num_avail <= num_records);
2326 if (num_avail == 1) {
2327 __kmp_ncores = 1;
2328 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;
2329 if (__kmp_affinity_verbose) {
2330 if (!KMP_AFFINITY_CAPABLE()) {
2331 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2332 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2333 KMP_INFORM(Uniform, "KMP_AFFINITY");
2334 } else {
2335 char buf[KMP_AFFIN_MASK_PRINT_LEN];
2336 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
2337 __kmp_affin_fullMask);
2338 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2339 if (__kmp_affinity_respect_mask) {
2340 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2341 } else {
2342 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2343 }
2344 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2345 KMP_INFORM(Uniform, "KMP_AFFINITY");
2346 }
2347 int index;
2348 kmp_str_buf_t buf;
2349 __kmp_str_buf_init(&buf);
2350 __kmp_str_buf_print(&buf, "1");
2351 for (index = maxIndex - 1; index > pkgIdIndex; index--) {
2352 __kmp_str_buf_print(&buf, " x 1");
2353 }
2354 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1);
2355 __kmp_str_buf_free(&buf);
2356 }
2357
2358 if (__kmp_affinity_type == affinity_none) {
2359 CLEANUP_THREAD_INFO;
2360 return 0;
2361 }
2362
2363 *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair));
2364 Address addr(1);
2365 addr.labels[0] = threadInfo[0][pkgIdIndex];
2366 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]);
2367
2368 if (__kmp_affinity_gran_levels < 0) {
2369 __kmp_affinity_gran_levels = 0;
2370 }
2371
2372 if (__kmp_affinity_verbose) {
2373 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
2374 }
2375
2376 CLEANUP_THREAD_INFO;
2377 return 1;
2378 }
2379
2380 // Sort the threadInfo table by physical Id.
2381 qsort(threadInfo, num_avail, sizeof(*threadInfo),
2382 __kmp_affinity_cmp_ProcCpuInfo_phys_id);
2383
2384 // The table is now sorted by pkgId / coreId / threadId, but we really don't
2385 // know the radix of any of the fields. pkgId's may be sparsely assigned among
2386 // the chips on a system. Although coreId's are usually assigned
2387 // [0 .. coresPerPkg-1] and threadId's are usually assigned
2388 // [0..threadsPerCore-1], we don't want to make any such assumptions.
2389 //
2390 // For that matter, we don't know what coresPerPkg and threadsPerCore (or the
2391 // total # packages) are at this point - we want to determine that now. We
2392 // only have an upper bound on the first two figures.
2393 unsigned *counts =
2394 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2395 unsigned *maxCt =
2396 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2397 unsigned *totals =
2398 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2399 unsigned *lastId =
2400 (unsigned *)__kmp_allocate((maxIndex + 1) * sizeof(unsigned));
2401
2402 bool assign_thread_ids = false;
2403 unsigned threadIdCt;
2404 unsigned index;
2405
2406 restart_radix_check:
2407 threadIdCt = 0;
2408
2409 // Initialize the counter arrays with data from threadInfo[0].
2410 if (assign_thread_ids) {
2411 if (threadInfo[0][threadIdIndex] == UINT_MAX) {
2412 threadInfo[0][threadIdIndex] = threadIdCt++;
2413 } else if (threadIdCt <= threadInfo[0][threadIdIndex]) {
2414 threadIdCt = threadInfo[0][threadIdIndex] + 1;
2415 }
2416 }
2417 for (index = 0; index <= maxIndex; index++) {
2418 counts[index] = 1;
2419 maxCt[index] = 1;
2420 totals[index] = 1;
2421 lastId[index] = threadInfo[0][index];
2423 }
2424
2425 // Run through the rest of the OS procs.
2426 for (i = 1; i < num_avail; i++) {
2427 // Find the most significant index whose id differs from the id for the
2428 // previous OS proc.
2429 for (index = maxIndex; index >= threadIdIndex; index--) {
2430 if (assign_thread_ids && (index == threadIdIndex)) {
2431 // Auto-assign the thread id field if it wasn't specified.
2432 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2433 threadInfo[i][threadIdIndex] = threadIdCt++;
2434 }
2435 // Apparently the thread id field was specified for some entries and not
2436 // others. Start the thread id counter off at the next higher thread id.
2437 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2438 threadIdCt = threadInfo[i][threadIdIndex] + 1;
2439 }
2440 }
2441 if (threadInfo[i][index] != lastId[index]) {
2442 // Run through all indices which are less significant, and reset the
2443 // counts to 1. At all levels up to and including index, we need to
2444 // increment the totals and record the last id.
2445 unsigned index2;
2446 for (index2 = threadIdIndex; index2 < index; index2++) {
2447 totals[index2]++;
2448 if (counts[index2] > maxCt[index2]) {
2449 maxCt[index2] = counts[index2];
2450 }
2451 counts[index2] = 1;
2452 lastId[index2] = threadInfo[i][index2];
2453 }
2454 counts[index]++;
2455 totals[index]++;
2456 lastId[index] = threadInfo[i][index];
2457
2458 if (assign_thread_ids && (index > threadIdIndex)) {
2459
2460 #if KMP_MIC && REDUCE_TEAM_SIZE
2461 // The default team size is the total #threads in the machine
2462 // minus 1 thread for every core that has 3 or more threads.
2463 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
2464 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2465
2466 // Restart the thread counter, as we are on a new core.
2467 threadIdCt = 0;
2468
2469 // Auto-assign the thread id field if it wasn't specified.
2470 if (threadInfo[i][threadIdIndex] == UINT_MAX) {
2471 threadInfo[i][threadIdIndex] = threadIdCt++;
2472 }
2473
2474 // Apparently the thread id field was specified for some entries and
2475 // not others. Start the thread id counter off at the next higher
2476 // thread id.
2477 else if (threadIdCt <= threadInfo[i][threadIdIndex]) {
2478 threadIdCt = threadInfo[i][threadIdIndex] + 1;
2479 }
2480 }
2481 break;
2482 }
2483 }
2484 if (index < threadIdIndex) {
2485 // If thread ids were specified, it is an error if they are not unique.
2486 // Also, check that we haven't already restarted the loop (to be safe -
2487 // shouldn't need to).
2488 if ((threadInfo[i][threadIdIndex] != UINT_MAX) || assign_thread_ids) {
2489 __kmp_free(lastId);
2490 __kmp_free(totals);
2491 __kmp_free(maxCt);
2492 __kmp_free(counts);
2493 CLEANUP_THREAD_INFO;
2494 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique;
2495 return -1;
2496 }
2497
2498 // If the thread ids were not specified and we see entries that
2499 // are duplicates, start the loop over and assign the thread ids manually.
2500 assign_thread_ids = true;
2501 goto restart_radix_check;
2502 }
2503 }
2504
2505 #if KMP_MIC && REDUCE_TEAM_SIZE
2506 // The default team size is the total #threads in the machine
2507 // minus 1 thread for every core that has 3 or more threads.
2508 teamSize += (threadIdCt <= 2) ? (threadIdCt) : (threadIdCt - 1);
2509 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2510
2511 for (index = threadIdIndex; index <= maxIndex; index++) {
2512 if (counts[index] > maxCt[index]) {
2513 maxCt[index] = counts[index];
2514 }
2515 }
2516
2517 __kmp_nThreadsPerCore = maxCt[threadIdIndex];
2518 nCoresPerPkg = maxCt[coreIdIndex];
2519 nPackages = totals[pkgIdIndex];
2520
2521 // Check to see if the machine topology is uniform
2522 unsigned prod = totals[maxIndex];
2523 for (index = threadIdIndex; index < maxIndex; index++) {
2524 prod *= maxCt[index];
2525 }
2526 bool uniform = (prod == totals[threadIdIndex]);
2527
2528 // When affinity is off, this routine will still be called to set
2529 // __kmp_ncores, as well as __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
2530 // Make sure all these vars are set correctly, and return now if affinity is
2531 // not enabled.
2532 __kmp_ncores = totals[coreIdIndex];
2533
2534 if (__kmp_affinity_verbose) {
2535 if (!KMP_AFFINITY_CAPABLE()) {
2536 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY");
2537 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2538 if (uniform) {
2539 KMP_INFORM(Uniform, "KMP_AFFINITY");
2540 } else {
2541 KMP_INFORM(NonUniform, "KMP_AFFINITY");
2542 }
2543 } else {
2544 char buf[KMP_AFFIN_MASK_PRINT_LEN];
2545 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
2546 __kmp_affin_fullMask);
2547 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY");
2548 if (__kmp_affinity_respect_mask) {
2549 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
2550 } else {
2551 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
2552 }
2553 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
2554 if (uniform) {
2555 KMP_INFORM(Uniform, "KMP_AFFINITY");
2556 } else {
2557 KMP_INFORM(NonUniform, "KMP_AFFINITY");
2558 }
2559 }
2560 kmp_str_buf_t buf;
2561 __kmp_str_buf_init(&buf);
2562
2563 __kmp_str_buf_print(&buf, "%d", totals[maxIndex]);
2564 for (index = maxIndex - 1; index >= pkgIdIndex; index--) {
2565 __kmp_str_buf_print(&buf, " x %d", maxCt[index]);
2566 }
2567 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex],
2568 maxCt[threadIdIndex], __kmp_ncores);
2569
2570 __kmp_str_buf_free(&buf);
2571 }
2572
2573 #if KMP_MIC && REDUCE_TEAM_SIZE
2574 // Set the default team size.
2575 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) {
2576 __kmp_dflt_team_nth = teamSize;
2577 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting "
2578 "__kmp_dflt_team_nth = %d\n",
2579 __kmp_dflt_team_nth));
2580 }
2581 #endif // KMP_MIC && REDUCE_TEAM_SIZE
2582
2583 KMP_DEBUG_ASSERT(__kmp_pu_os_idx == NULL);
2584 KMP_DEBUG_ASSERT(num_avail == (unsigned)__kmp_avail_proc);
2585 __kmp_pu_os_idx = (int *)__kmp_allocate(sizeof(int) * __kmp_avail_proc);
2586 for (i = 0; i < num_avail; ++i) { // fill the os indices
2587 __kmp_pu_os_idx[i] = threadInfo[i][osIdIndex];
2588 }
2589
2590 if (__kmp_affinity_type == affinity_none) {
2591 __kmp_free(lastId);
2592 __kmp_free(totals);
2593 __kmp_free(maxCt);
2594 __kmp_free(counts);
2595 CLEANUP_THREAD_INFO;
2596 return 0;
2597 }
2598
2599 // Count the number of levels which have more nodes at that level than at the
2600 // parent's level (with an implicit root node above the top level).
2601 // This is equivalent to saying that there is at least one node at this level
2602 // which has a sibling. These levels are in the map, and the package level is
2603 // always in the map.
2604 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool));
2605 for (index = threadIdIndex; index < maxIndex; index++) {
2606 KMP_ASSERT(totals[index] >= totals[index + 1]);
2607 inMap[index] = (totals[index] > totals[index + 1]);
2608 }
2609 inMap[maxIndex] = (totals[maxIndex] > 1);
2610 inMap[pkgIdIndex] = true;
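// Example: if every core has exactly one thread, totals[threadIdIndex] ==
// totals[coreIdIndex], so inMap[threadIdIndex] is false and the thread level
// is dropped from the labels; the package level is always kept.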
2611
2612 int depth = 0;
2613 for (index = threadIdIndex; index <= maxIndex; index++) {
2614 if (inMap[index]) {
2615 depth++;
2616 }
2617 }
2618 KMP_ASSERT(depth > 0);
2619
2620 // Construct the data structure that is to be returned.
2621 *address2os = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * num_avail);
2622 int pkgLevel = -1;
2623 int coreLevel = -1;
2624 int threadLevel = -1;
2625
2626 for (i = 0; i < num_avail; ++i) {
2627 Address addr(depth);
2628 unsigned os = threadInfo[i][osIdIndex];
2629 int src_index;
2630 int dst_index = 0;
2631
2632 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) {
2633 if (!inMap[src_index]) {
2634 continue;
2635 }
2636 addr.labels[dst_index] = threadInfo[i][src_index];
2637 if (src_index == pkgIdIndex) {
2638 pkgLevel = dst_index;
2639 } else if (src_index == coreIdIndex) {
2640 coreLevel = dst_index;
2641 } else if (src_index == threadIdIndex) {
2642 threadLevel = dst_index;
2643 }
2644 dst_index++;
2645 }
2646 (*address2os)[i] = AddrUnsPair(addr, os);
2647 }
2648
2649 if (__kmp_affinity_gran_levels < 0) {
2650 // Set the granularity level based on what levels are modeled
2651 // in the machine topology map.
2652 unsigned src_index;
2653 __kmp_affinity_gran_levels = 0;
2654 for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) {
2655 if (!inMap[src_index]) {
2656 continue;
2657 }
2658 switch (src_index) {
2659 case threadIdIndex:
2660 if (__kmp_affinity_gran > affinity_gran_thread) {
2661 __kmp_affinity_gran_levels++;
2662 }
2663
2664 break;
2665 case coreIdIndex:
2666 if (__kmp_affinity_gran > affinity_gran_core) {
2667 __kmp_affinity_gran_levels++;
2668 }
2669 break;
2670
2671 case pkgIdIndex:
2672 if (__kmp_affinity_gran > affinity_gran_package) {
2673 __kmp_affinity_gran_levels++;
2674 }
2675 break;
2676 }
2677 }
2678 }
2679
2680 if (__kmp_affinity_verbose) {
2681 __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel,
2682 coreLevel, threadLevel);
2683 }
2684
2685 __kmp_free(inMap);
2686 __kmp_free(lastId);
2687 __kmp_free(totals);
2688 __kmp_free(maxCt);
2689 __kmp_free(counts);
2690 CLEANUP_THREAD_INFO;
2691 return depth;
2692 }
2693
2694 // Create and return a table of affinity masks, indexed by OS thread ID.
2695 // This routine handles OR'ing together all the affinity masks of threads
2696 // that are sufficiently close, if granularity > fine.
2697 static kmp_affin_mask_t *__kmp_create_masks(unsigned *maxIndex,
2698 unsigned *numUnique,
2699 AddrUnsPair *address2os,
2700 unsigned numAddrs) {
2701 // First form a table of affinity masks in order of OS thread id.
2702 unsigned depth;
2703 unsigned maxOsId;
2704 unsigned i;
2705
2706 KMP_ASSERT(numAddrs > 0);
2707 depth = address2os[0].first.depth;
2708
2709 maxOsId = 0;
2710 for (i = numAddrs - 1;; --i) {
2711 unsigned osId = address2os[i].second;
2712 if (osId > maxOsId) {
2713 maxOsId = osId;
2714 }
2715 if (i == 0)
2716 break;
2717 }
2718 kmp_affin_mask_t *osId2Mask;
2719 KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId + 1));
2720
2721 // Sort the address2os table according to physical order. Doing so will put
2722 // all threads on the same core/package/node in consecutive locations.
2723 qsort(address2os, numAddrs, sizeof(*address2os),
2724 __kmp_affinity_cmp_Address_labels);
2725
2726 KMP_ASSERT(__kmp_affinity_gran_levels >= 0);
2727 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) {
2728 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels);
2729 }
2730 if (__kmp_affinity_gran_levels >= (int)depth) {
2731 if (__kmp_affinity_verbose ||
2732 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
2733 KMP_WARNING(AffThreadsMayMigrate);
2734 }
2735 }
2736
2737 // Run through the table, forming the masks for all threads on each core.
2738 // Threads on the same core will have identical "Address" objects, not
2739 // considering the last level, which must be the thread id. All threads on a
2740 // core will appear consecutively.
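// Example (hypothetical OS numbering): with granularity=core
// (__kmp_affinity_gran_levels == 1), two SMT threads with OS ids 0 and 4 on
// the same core compare as "close", so both of their osId2Mask entries end
// up holding the union mask {0,4}.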
2741 unsigned unique = 0;
2742 unsigned j = 0; // index of 1st thread on core
2743 unsigned leader = 0;
2744 Address *leaderAddr = &(address2os[0].first);
2745 kmp_affin_mask_t *sum;
2746 KMP_CPU_ALLOC_ON_STACK(sum);
2747 KMP_CPU_ZERO(sum);
2748 KMP_CPU_SET(address2os[0].second, sum);
2749 for (i = 1; i < numAddrs; i++) {
2750 // If this thread is sufficiently close to the leader (within the
2751 // granularity setting), then set the bit for this os thread in the
2752 // affinity mask for this group, and go on to the next thread.
2753 if (leaderAddr->isClose(address2os[i].first, __kmp_affinity_gran_levels)) {
2754 KMP_CPU_SET(address2os[i].second, sum);
2755 continue;
2756 }
2757
2758 // For every thread in this group, copy the mask to the thread's entry in
2759 // the osId2Mask table. Mark the first address as a leader.
2760 for (; j < i; j++) {
2761 unsigned osId = address2os[j].second;
2762 KMP_DEBUG_ASSERT(osId <= maxOsId);
2763 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2764 KMP_CPU_COPY(mask, sum);
2765 address2os[j].first.leader = (j == leader);
2766 }
2767 unique++;
2768
2769 // Start a new mask.
2770 leader = i;
2771 leaderAddr = &(address2os[i].first);
2772 KMP_CPU_ZERO(sum);
2773 KMP_CPU_SET(address2os[i].second, sum);
2774 }
2775
2776 // For every thread in last group, copy the mask to the thread's
2777 // entry in the osId2Mask table.
2778 for (; j < i; j++) {
2779 unsigned osId = address2os[j].second;
2780 KMP_DEBUG_ASSERT(osId <= maxOsId);
2781 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId);
2782 KMP_CPU_COPY(mask, sum);
2783 address2os[j].first.leader = (j == leader);
2784 }
2785 unique++;
2786 KMP_CPU_FREE_FROM_STACK(sum);
2787
2788 *maxIndex = maxOsId;
2789 *numUnique = unique;
2790 return osId2Mask;
2791 }
2792
2793 // File-static state for the affinity proclist parsers. It's easier to declare
2794 // these vars as file-static than to pass them through the calling sequence of
2795 // the recursive-descent OMP_PLACES parser.
2796 static kmp_affin_mask_t *newMasks;
2797 static int numNewMasks;
2798 static int nextNewMask;
2799
2800 #define ADD_MASK(_mask) \
2801 { \
2802 if (nextNewMask >= numNewMasks) { \
2803 int i; \
2804 numNewMasks *= 2; \
2805 kmp_affin_mask_t *temp; \
2806 KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \
2807 for (i = 0; i < numNewMasks / 2; i++) { \
2808 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i); \
2809 kmp_affin_mask_t *dest = KMP_CPU_INDEX(temp, i); \
2810 KMP_CPU_COPY(dest, src); \
2811 } \
2812 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks / 2); \
2813 newMasks = temp; \
2814 } \
2815 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \
2816 nextNewMask++; \
2817 }
2818
2819 #define ADD_MASK_OSID(_osId, _osId2Mask, _maxOsId) \
2820 { \
2821 if (((_osId) > _maxOsId) || \
2822 (!KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \
2823 if (__kmp_affinity_verbose || \
2824 (__kmp_affinity_warnings && \
2825 (__kmp_affinity_type != affinity_none))) { \
2826 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \
2827 } \
2828 } else { \
2829 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \
2830 } \
2831 }
2832
2833 // Re-parse the proclist (for the explicit affinity type), and form the list
2834 // of affinity newMasks indexed by gtid.
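// For example (hypothetical proc ids), a proclist such as "3,0-2,{6,7}"
// (as given via KMP_AFFINITY="proclist=[3,0-2,{6,7}],explicit") produces the
// masks {3}, {0}, {1}, {2} and {6,7}: single ids and ranges yield one mask
// per OS proc, while a {...} set yields a single union mask.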
2835 static void __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks,
2836 unsigned int *out_numMasks,
2837 const char *proclist,
2838 kmp_affin_mask_t *osId2Mask,
2839 int maxOsId) {
2840 int i;
2841 const char *scan = proclist;
2842 const char *next = proclist;
2843
2844 // Allocate a temporary vector of masks; the ADD_MASK macro doubles its size
2845 // whenever it fills up.
2846 numNewMasks = 2;
2847 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
2848 nextNewMask = 0;
2849 kmp_affin_mask_t *sumMask;
2850 KMP_CPU_ALLOC(sumMask);
2851 int setSize = 0;
2852
2853 for (;;) {
2854 int start, end, stride;
2855
2856 SKIP_WS(scan);
2857 next = scan;
2858 if (*next == '\0') {
2859 break;
2860 }
2861
2862 if (*next == '{') {
2863 int num;
2864 setSize = 0;
2865 next++; // skip '{'
2866 SKIP_WS(next);
2867 scan = next;
2868
2869 // Read the first integer in the set.
2870 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad proclist");
2871 SKIP_DIGITS(next);
2872 num = __kmp_str_to_int(scan, *next);
2873 KMP_ASSERT2(num >= 0, "bad explicit proc list");
2874
2875 // Copy the mask for that osId to the sum (union) mask.
2876 if ((num > maxOsId) ||
2877 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2878 if (__kmp_affinity_verbose ||
2879 (__kmp_affinity_warnings &&
2880 (__kmp_affinity_type != affinity_none))) {
2881 KMP_WARNING(AffIgnoreInvalidProcID, num);
2882 }
2883 KMP_CPU_ZERO(sumMask);
2884 } else {
2885 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2886 setSize = 1;
2887 }
2888
2889 for (;;) {
2890 // Check for end of set.
2891 SKIP_WS(next);
2892 if (*next == '}') {
2893 next++; // skip '}'
2894 break;
2895 }
2896
2897 // Skip optional comma.
2898 if (*next == ',') {
2899 next++;
2900 }
2901 SKIP_WS(next);
2902
2903 // Read the next integer in the set.
2904 scan = next;
2905 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2906
2907 SKIP_DIGITS(next);
2908 num = __kmp_str_to_int(scan, *next);
2909 KMP_ASSERT2(num >= 0, "bad explicit proc list");
2910
2911 // Add the mask for that osId to the sum mask.
2912 if ((num > maxOsId) ||
2913 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
2914 if (__kmp_affinity_verbose ||
2915 (__kmp_affinity_warnings &&
2916 (__kmp_affinity_type != affinity_none))) {
2917 KMP_WARNING(AffIgnoreInvalidProcID, num);
2918 }
2919 } else {
2920 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num));
2921 setSize++;
2922 }
2923 }
2924 if (setSize > 0) {
2925 ADD_MASK(sumMask);
2926 }
2927
2928 SKIP_WS(next);
2929 if (*next == ',') {
2930 next++;
2931 }
2932 scan = next;
2933 continue;
2934 }
2935
2936 // Read the first integer.
2937 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2938 SKIP_DIGITS(next);
2939 start = __kmp_str_to_int(scan, *next);
2940 KMP_ASSERT2(start >= 0, "bad explicit proc list");
2941 SKIP_WS(next);
2942
2943 // If this isn't a range, then add a mask to the list and go on.
2944 if (*next != '-') {
2945 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2946
2947 // Skip optional comma.
2948 if (*next == ',') {
2949 next++;
2950 }
2951 scan = next;
2952 continue;
2953 }
2954
2955 // This is a range. Skip over the '-' and read in the 2nd int.
2956 next++; // skip '-'
2957 SKIP_WS(next);
2958 scan = next;
2959 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2960 SKIP_DIGITS(next);
2961 end = __kmp_str_to_int(scan, *next);
2962 KMP_ASSERT2(end >= 0, "bad explicit proc list");
2963
2964 // Check for a stride parameter
2965 stride = 1;
2966 SKIP_WS(next);
2967 if (*next == ':') {
2968 // A stride is specified. Skip over the ':' and read the 3rd int.
2969 int sign = +1;
2970 next++; // skip ':'
2971 SKIP_WS(next);
2972 scan = next;
2973 if (*next == '-') {
2974 sign = -1;
2975 next++;
2976 SKIP_WS(next);
2977 scan = next;
2978 }
2979 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list");
2980 SKIP_DIGITS(next);
2981 stride = __kmp_str_to_int(scan, *next);
2982 KMP_ASSERT2(stride >= 0, "bad explicit proc list");
2983 stride *= sign;
2984 }
2985
2986 // Do some range checks.
2987 KMP_ASSERT2(stride != 0, "bad explicit proc list");
2988 if (stride > 0) {
2989 KMP_ASSERT2(start <= end, "bad explicit proc list");
2990 } else {
2991 KMP_ASSERT2(start >= end, "bad explicit proc list");
2992 }
2993 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list");
2994
2995 // Add the mask for each OS proc # to the list.
2996 if (stride > 0) {
2997 do {
2998 ADD_MASK_OSID(start, osId2Mask, maxOsId);
2999 start += stride;
3000 } while (start <= end);
3001 } else {
3002 do {
3003 ADD_MASK_OSID(start, osId2Mask, maxOsId);
3004 start += stride;
3005 } while (start >= end);
3006 }
3007
3008 // Skip optional comma.
3009 SKIP_WS(next);
3010 if (*next == ',') {
3011 next++;
3012 }
3013 scan = next;
3014 }
3015
3016 *out_numMasks = nextNewMask;
3017 if (nextNewMask == 0) {
3018 *out_masks = NULL;
3019 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3020 return;
3021 }
3022 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
3023 for (i = 0; i < nextNewMask; i++) {
3024 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
3025 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
3026 KMP_CPU_COPY(dest, src);
3027 }
3028 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3029 KMP_CPU_FREE(sumMask);
3030 }
3031
3032 /*-----------------------------------------------------------------------------
3033 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different
3034 places. Again, here is the grammar:
3035
3036 place_list := place
3037 place_list := place , place_list
3038 place := num
3039 place := place : num
3040 place := place : num : signed
3041 place := { subplace_list }
3042 place := ! place // (lowest priority)
3043 subplace_list := subplace
3044 subplace_list := subplace , subplace_list
3045 subplace := num
3046 subplace := num : num
3047 subplace := num : num : signed
3048 signed := num
3049 signed := + signed
3050 signed := - signed
3051 -----------------------------------------------------------------------------*/
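/* A few concrete (hypothetical) examples against this grammar:
   OMP_PLACES="{0:4},{4:4}"  -> two places, procs {0,1,2,3} and {4,5,6,7}
   OMP_PLACES="{0:4:2}"      -> one place, procs {0,2,4,6} (count 4, stride 2)
   OMP_PLACES="{0:4}:2:4"    -> the place {0,1,2,3} replicated twice with an
                                offset of 4, i.e. {0-3} and {4-7}. */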
3052 static void __kmp_process_subplace_list(const char **scan,
3053 kmp_affin_mask_t *osId2Mask,
3054 int maxOsId, kmp_affin_mask_t *tempMask,
3055 int *setSize) {
3056 const char *next;
3057
3058 for (;;) {
3059 int start, count, stride, i;
3060
3061 // Read in the starting proc id
3062 SKIP_WS(*scan);
3063 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
3064 next = *scan;
3065 SKIP_DIGITS(next);
3066 start = __kmp_str_to_int(*scan, *next);
3067 KMP_ASSERT(start >= 0);
3068 *scan = next;
3069
3070 // valid follow sets are ',' ':' and '}'
3071 SKIP_WS(*scan);
3072 if (**scan == '}' || **scan == ',') {
3073 if ((start > maxOsId) ||
3074 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3075 if (__kmp_affinity_verbose ||
3076 (__kmp_affinity_warnings &&
3077 (__kmp_affinity_type != affinity_none))) {
3078 KMP_WARNING(AffIgnoreInvalidProcID, start);
3079 }
3080 } else {
3081 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3082 (*setSize)++;
3083 }
3084 if (**scan == '}') {
3085 break;
3086 }
3087 (*scan)++; // skip ','
3088 continue;
3089 }
3090 KMP_ASSERT2(**scan == ':', "bad explicit places list");
3091 (*scan)++; // skip ':'
3092
3093 // Read count parameter
3094 SKIP_WS(*scan);
3095 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
3096 next = *scan;
3097 SKIP_DIGITS(next);
3098 count = __kmp_str_to_int(*scan, *next);
3099 KMP_ASSERT(count >= 0);
3100 *scan = next;
3101
3102 // valid follow sets are ',' ':' and '}'
3103 SKIP_WS(*scan);
3104 if (**scan == '}' || **scan == ',') {
3105 for (i = 0; i < count; i++) {
3106 if ((start > maxOsId) ||
3107 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3108 if (__kmp_affinity_verbose ||
3109 (__kmp_affinity_warnings &&
3110 (__kmp_affinity_type != affinity_none))) {
3111 KMP_WARNING(AffIgnoreInvalidProcID, start);
3112 }
3113 break; // don't proliferate warnings for large count
3114 } else {
3115 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3116 start++;
3117 (*setSize)++;
3118 }
3119 }
3120 if (**scan == '}') {
3121 break;
3122 }
3123 (*scan)++; // skip ','
3124 continue;
3125 }
3126 KMP_ASSERT2(**scan == ':', "bad explicit places list");
3127 (*scan)++; // skip ':'
3128
3129 // Read stride parameter
3130 int sign = +1;
3131 for (;;) {
3132 SKIP_WS(*scan);
3133 if (**scan == '+') {
3134 (*scan)++; // skip '+'
3135 continue;
3136 }
3137 if (**scan == '-') {
3138 sign *= -1;
3139 (*scan)++; // skip '-'
3140 continue;
3141 }
3142 break;
3143 }
3144 SKIP_WS(*scan);
3145 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), "bad explicit places list");
3146 next = *scan;
3147 SKIP_DIGITS(next);
3148 stride = __kmp_str_to_int(*scan, *next);
3149 KMP_ASSERT(stride >= 0);
3150 *scan = next;
3151 stride *= sign;
3152
3153 // valid follow sets are ',' and '}'
3154 SKIP_WS(*scan);
3155 if (**scan == '}' || **scan == ',') {
3156 for (i = 0; i < count; i++) {
3157 if ((start > maxOsId) ||
3158 (!KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) {
3159 if (__kmp_affinity_verbose ||
3160 (__kmp_affinity_warnings &&
3161 (__kmp_affinity_type != affinity_none))) {
3162 KMP_WARNING(AffIgnoreInvalidProcID, start);
3163 }
3164 break; // don't proliferate warnings for large count
3165 } else {
3166 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start));
3167 start += stride;
3168 (*setSize)++;
3169 }
3170 }
3171 if (**scan == '}') {
3172 break;
3173 }
3174 (*scan)++; // skip ','
3175 continue;
3176 }
3177
3178 KMP_ASSERT2(0, "bad explicit places list");
3179 }
3180 }
3181
3182 static void __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask,
3183 int maxOsId, kmp_affin_mask_t *tempMask,
3184 int *setSize) {
3185 const char *next;
3186
3187 // valid follow sets are '{' '!' and num
3188 SKIP_WS(*scan);
3189 if (**scan == '{') {
3190 (*scan)++; // skip '{'
3191 __kmp_process_subplace_list(scan, osId2Mask, maxOsId, tempMask, setSize);
3192 KMP_ASSERT2(**scan == '}', "bad explicit places list");
3193 (*scan)++; // skip '}'
3194 } else if (**scan == '!') {
3195 (*scan)++; // skip '!'
3196 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize);
3197 KMP_CPU_COMPLEMENT(maxOsId, tempMask);
3198 } else if ((**scan >= '0') && (**scan <= '9')) {
3199 next = *scan;
3200 SKIP_DIGITS(next);
3201 int num = __kmp_str_to_int(*scan, *next);
3202 KMP_ASSERT(num >= 0);
3203 if ((num > maxOsId) ||
3204 (!KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) {
3205 if (__kmp_affinity_verbose ||
3206 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
3207 KMP_WARNING(AffIgnoreInvalidProcID, num);
3208 }
3209 } else {
3210 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num));
3211 (*setSize)++;
3212 }
3213 *scan = next; // skip num
3214 } else {
3215 KMP_ASSERT2(0, "bad explicit places list");
3216 }
3217 }
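// Illustrative note for __kmp_process_place() (hypothetical proc ids): a
// place is either a braced subplace list, a '!'-prefixed place, or a single
// OS proc id. For example, "!{0,1}" selects every valid OS proc except 0 and
// 1, because the accumulated tempMask is complemented over 0..maxOsId after
// the nested place has been processed.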
3218
3220 void __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks,
3221 unsigned int *out_numMasks,
3222 const char *placelist,
3223 kmp_affin_mask_t *osId2Mask,
3224 int maxOsId) {
3225 int i, j, count, stride, sign;
3226 const char *scan = placelist;
3227 const char *next = placelist;
3228
3229 numNewMasks = 2;
3230 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks);
3231 nextNewMask = 0;
3232
3233 // tempMask is modified based on the previous or initial
3234 // place to form the current place
3235 // previousMask contains the previous place
3236 kmp_affin_mask_t *tempMask;
3237 kmp_affin_mask_t *previousMask;
3238 KMP_CPU_ALLOC(tempMask);
3239 KMP_CPU_ZERO(tempMask);
3240 KMP_CPU_ALLOC(previousMask);
3241 KMP_CPU_ZERO(previousMask);
3242 int setSize = 0;
3243
3244 for (;;) {
3245 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize);
3246
3247 // valid follow sets are ',' ':' and EOL
3248 SKIP_WS(scan);
3249 if (*scan == '\0' || *scan == ',') {
3250 if (setSize > 0) {
3251 ADD_MASK(tempMask);
3252 }
3253 KMP_CPU_ZERO(tempMask);
3254 setSize = 0;
3255 if (*scan == '\0') {
3256 break;
3257 }
3258 scan++; // skip ','
3259 continue;
3260 }
3261
3262 KMP_ASSERT2(*scan == ':', "bad explicit places list");
3263 scan++; // skip ':'
3264
3265 // Read count parameter
3266 SKIP_WS(scan);
3267 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
3268 next = scan;
3269 SKIP_DIGITS(next);
3270 count = __kmp_str_to_int(scan, *next);
3271 KMP_ASSERT(count >= 0);
3272 scan = next;
3273
3274 // valid follow sets are ',' ':' and EOL
3275 SKIP_WS(scan);
3276 if (*scan == '\0' || *scan == ',') {
3277 stride = +1;
3278 } else {
3279 KMP_ASSERT2(*scan == ':', "bad explicit places list");
3280 scan++; // skip ':'
3281
3282 // Read stride parameter
3283 sign = +1;
3284 for (;;) {
3285 SKIP_WS(scan);
3286 if (*scan == '+') {
3287 scan++; // skip '+'
3288 continue;
3289 }
3290 if (*scan == '-') {
3291 sign *= -1;
3292 scan++; // skip '-'
3293 continue;
3294 }
3295 break;
3296 }
3297 SKIP_WS(scan);
3298 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), "bad explicit places list");
3299 next = scan;
3300 SKIP_DIGITS(next);
3301 stride = __kmp_str_to_int(scan, *next);
3302 KMP_DEBUG_ASSERT(stride >= 0);
3303 scan = next;
3304 stride *= sign;
3305 }
3306
3307 // Add places determined by initial_place : count : stride
3308 for (i = 0; i < count; i++) {
3309 if (setSize == 0) {
3310 break;
3311 }
3312 // Add the current place, then build the next place (tempMask) from that
3313 KMP_CPU_COPY(previousMask, tempMask);
3314 ADD_MASK(previousMask);
3315 KMP_CPU_ZERO(tempMask);
3316 setSize = 0;
3317 KMP_CPU_SET_ITERATE(j, previousMask) {
3318 if (!KMP_CPU_ISSET(j, previousMask)) {
3319 continue;
3320 }
3321 if ((j + stride > maxOsId) || (j + stride < 0) ||
3322 (!KMP_CPU_ISSET(j, __kmp_affin_fullMask)) ||
3323 (!KMP_CPU_ISSET(j + stride,
3324 KMP_CPU_INDEX(osId2Mask, j + stride)))) {
3325 if ((__kmp_affinity_verbose ||
3326 (__kmp_affinity_warnings &&
3327 (__kmp_affinity_type != affinity_none))) &&
3328 i < count - 1) {
3329 KMP_WARNING(AffIgnoreInvalidProcID, j + stride);
3330 }
3331 continue;
3332 }
3333 KMP_CPU_SET(j + stride, tempMask);
3334 setSize++;
3335 }
3336 }
3337 KMP_CPU_ZERO(tempMask);
3338 setSize = 0;
3339
3340 // valid follow sets are ',' and EOL
3341 SKIP_WS(scan);
3342 if (*scan == '\0') {
3343 break;
3344 }
3345 if (*scan == ',') {
3346 scan++; // skip ','
3347 continue;
3348 }
3349
3350 KMP_ASSERT2(0, "bad explicit places list");
3351 }
3352
3353 *out_numMasks = nextNewMask;
3354 if (nextNewMask == 0) {
3355 *out_masks = NULL;
3356 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3357 return;
3358 }
3359 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask);
3360 KMP_CPU_FREE(tempMask);
3361 KMP_CPU_FREE(previousMask);
3362 for (i = 0; i < nextNewMask; i++) {
3363 kmp_affin_mask_t *src = KMP_CPU_INDEX(newMasks, i);
3364 kmp_affin_mask_t *dest = KMP_CPU_INDEX((*out_masks), i);
3365 KMP_CPU_COPY(dest, src);
3366 }
3367 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks);
3368 }
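// Minimal usage sketch for the place-list parser above (hypothetical values,
// shown only to illustrate the <place>:<count>:<stride> expansion; the real
// caller is __kmp_aux_affinity_initialize below):
//
//   kmp_affin_mask_t *masks;
//   unsigned num_masks;
//   __kmp_affinity_process_placelist(&masks, &num_masks, "{0,1}:4:2",
//                                    osId2Mask, maxOsId);
//   // num_masks == 4; the places are {0,1}, {2,3}, {4,5}, {6,7}, because
//   // each new place is built by shifting every OS proc of the previous
//   // place by the stride (the default stride is +1 when omitted).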
3369
3370 #undef ADD_MASK
3371 #undef ADD_MASK_OSID
3372
3373 #if KMP_USE_HWLOC
3374 static int __kmp_hwloc_skip_PUs_obj(hwloc_topology_t t, hwloc_obj_t o) {
3375 // skip PUs descendants of the object o
3376 int skipped = 0;
3377 hwloc_obj_t hT = NULL;
3378 int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT);
3379 for (int i = 0; i < N; ++i) {
3380 KMP_DEBUG_ASSERT(hT);
3381 unsigned idx = hT->os_index;
3382 if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
3383 KMP_CPU_CLR(idx, __kmp_affin_fullMask);
3384 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
3385 ++skipped;
3386 }
3387 hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT);
3388 }
3389 return skipped; // count number of skipped units
3390 }
3391
3392 static int __kmp_hwloc_obj_has_PUs(hwloc_topology_t t, hwloc_obj_t o) {
3393 // check if obj has PUs present in fullMask
3394 hwloc_obj_t hT = NULL;
3395 int N = __kmp_hwloc_count_children_by_type(t, o, HWLOC_OBJ_PU, &hT);
3396 for (int i = 0; i < N; ++i) {
3397 KMP_DEBUG_ASSERT(hT);
3398 unsigned idx = hT->os_index;
3399 if (KMP_CPU_ISSET(idx, __kmp_affin_fullMask))
3400 return 1; // found PU
3401 hT = hwloc_get_next_obj_by_type(t, HWLOC_OBJ_PU, hT);
3402 }
3403 return 0; // no PUs found
3404 }
3405 #endif // KMP_USE_HWLOC
3406
3407 static void __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) {
3408 AddrUnsPair *newAddr;
3409 if (__kmp_hws_requested == 0)
3410 goto _exit; // no topology limiting actions requested, exit
3411 #if KMP_USE_HWLOC
3412 if (__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
3413 // The number of sub-objects is calculated dynamically, so this works
3414 // for any non-uniform topology.
3415 // L2 cache objects are identified by depth; other objects by type.
3416 hwloc_topology_t tp = __kmp_hwloc_topology;
3417 int nS = 0, nN = 0, nL = 0, nC = 0,
3418 nT = 0; // logical index including skipped
3419 int nCr = 0, nTr = 0; // number of requested units
3420 int nPkg = 0, nCo = 0, n_new = 0, n_old = 0, nCpP = 0, nTpC = 0; // counters
3421 hwloc_obj_t hT, hC, hL, hN, hS; // hwloc objects (pointers to)
3422 int L2depth, idx;
3423
3424 // check support of extensions ----------------------------------
3425 int numa_support = 0, tile_support = 0;
3426 if (__kmp_pu_os_idx)
3427 hT = hwloc_get_pu_obj_by_os_index(tp,
3428 __kmp_pu_os_idx[__kmp_avail_proc - 1]);
3429 else
3430 hT = hwloc_get_obj_by_type(tp, HWLOC_OBJ_PU, __kmp_avail_proc - 1);
3431 if (hT == NULL) { // something's gone wrong
3432 KMP_WARNING(AffHWSubsetUnsupported);
3433 goto _exit;
3434 }
3435 // check NUMA node
3436 hN = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hT);
3437 hS = hwloc_get_ancestor_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hT);
3438 if (hN != NULL && hN->depth > hS->depth) {
3439 numa_support = 1; // the socket contains NUMA node(s)
3440 } else if (__kmp_hws_node.num > 0) {
3441 // don't support sockets inside NUMA node (no such HW found for testing)
3442 KMP_WARNING(AffHWSubsetUnsupported);
3443 goto _exit;
3444 }
3445 // check L2 cache, get object by depth because of multiple caches
3446 L2depth = hwloc_get_cache_type_depth(tp, 2, HWLOC_OBJ_CACHE_UNIFIED);
3447 hL = hwloc_get_ancestor_obj_by_depth(tp, L2depth, hT);
3448 if (hL != NULL &&
3449 __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC) > 1) {
3450 tile_support = 1; // counting L2 only makes sense if it holds more than one core
3451 } else if (__kmp_hws_tile.num > 0) {
3452 if (__kmp_hws_core.num == 0) {
3453 __kmp_hws_core = __kmp_hws_tile; // replace L2 with core
3454 __kmp_hws_tile.num = 0;
3455 } else {
3456 // L2 and core are both requested, but represent same object
3457 KMP_WARNING(AffHWSubsetInvalid);
3458 goto _exit;
3459 }
3460 }
3461 // end of check of extensions -----------------------------------
3462
3463 // fill in unset items, validate settings -----------------------
3464 if (__kmp_hws_socket.num == 0)
3465 __kmp_hws_socket.num = nPackages; // use all available sockets
3466 if (__kmp_hws_socket.offset >= nPackages) {
3467 KMP_WARNING(AffHWSubsetManySockets);
3468 goto _exit;
3469 }
3470 if (numa_support) {
3471 hN = NULL;
3472 int NN = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE,
3473 &hN); // num nodes in socket
3474 if (__kmp_hws_node.num == 0)
3475 __kmp_hws_node.num = NN; // use all available nodes
3476 if (__kmp_hws_node.offset >= NN) {
3477 KMP_WARNING(AffHWSubsetManyNodes);
3478 goto _exit;
3479 }
3480 if (tile_support) {
3481 // get num tiles in node
3482 int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL);
3483 if (__kmp_hws_tile.num == 0) {
3484 __kmp_hws_tile.num = NL + 1;
3485 } // use all available tiles; some nodes may have more tiles, hence the +1
3486 if (__kmp_hws_tile.offset >= NL) {
3487 KMP_WARNING(AffHWSubsetManyTiles);
3488 goto _exit;
3489 }
3490 int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE,
3491 &hC); // num cores in tile
3492 if (__kmp_hws_core.num == 0)
3493 __kmp_hws_core.num = NC; // use all available cores
3494 if (__kmp_hws_core.offset >= NC) {
3495 KMP_WARNING(AffHWSubsetManyCores);
3496 goto _exit;
3497 }
3498 } else { // tile_support
3499 int NC = __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE,
3500 &hC); // num cores in node
3501 if (__kmp_hws_core.num == 0)
3502 __kmp_hws_core.num = NC; // use all available cores
3503 if (__kmp_hws_core.offset >= NC) {
3504 KMP_WARNING(AffHWSubsetManyCores);
3505 goto _exit;
3506 }
3507 } // tile_support
3508 } else { // numa_support
3509 if (tile_support) {
3510 // get num tiles in socket
3511 int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL);
3512 if (__kmp_hws_tile.num == 0)
3513 __kmp_hws_tile.num = NL; // use all available tiles
3514 if (__kmp_hws_tile.offset >= NL) {
3515 KMP_WARNING(AffHWSubsetManyTiles);
3516 goto _exit;
3517 }
3518 int NC = __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE,
3519 &hC); // num cores in tile
3520 if (__kmp_hws_core.num == 0)
3521 __kmp_hws_core.num = NC; // use all available cores
3522 if (__kmp_hws_core.offset >= NC) {
3523 KMP_WARNING(AffHWSubsetManyCores);
3524 goto _exit;
3525 }
3526 } else { // tile_support
3527 int NC = __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE,
3528 &hC); // num cores in socket
3529 if (__kmp_hws_core.num == 0)
3530 __kmp_hws_core.num = NC; // use all available cores
3531 if (__kmp_hws_core.offset >= NC) {
3532 KMP_WARNING(AffHWSubsetManyCores);
3533 goto _exit;
3534 }
3535 } // tile_support
3536 }
3537 if (__kmp_hws_proc.num == 0)
3538 __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all available procs
3539 if (__kmp_hws_proc.offset >= __kmp_nThreadsPerCore) {
3540 KMP_WARNING(AffHWSubsetManyProcs);
3541 goto _exit;
3542 }
3543 // end of validation --------------------------------------------
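    // Illustrative example of the __kmp_hws_* settings validated above
    // (hypothetical values, typically derived from a KMP_HW_SUBSET spec such
    // as "1s@1,2c,1t" -- that syntax is assumed here; it is parsed elsewhere):
    //   __kmp_hws_socket = {num: 1, offset: 1}  -> keep the second socket
    //   __kmp_hws_core   = {num: 2, offset: 0}  -> keep its first two cores
    //   __kmp_hws_proc   = {num: 1, offset: 0}  -> keep one HW thread per core
    // The loops below walk the hwloc tree and clear every other PU from
    // __kmp_affin_fullMask.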
3544
3545 if (pAddr) // pAddr is NULL in case of affinity_none
3546 newAddr = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) *
3547 __kmp_avail_proc); // max size
3548 // main loop to form HW subset ----------------------------------
3549 hS = NULL;
3550 int NP = hwloc_get_nbobjs_by_type(tp, HWLOC_OBJ_PACKAGE);
3551 for (int s = 0; s < NP; ++s) {
3552 // Check Socket -----------------------------------------------
3553 hS = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PACKAGE, hS);
3554 if (!__kmp_hwloc_obj_has_PUs(tp, hS))
3555 continue; // skip socket if all PUs are out of fullMask
3556 ++nS; // only count objects that have PUs in the affinity mask
3557 if (nS <= __kmp_hws_socket.offset ||
3558 nS > __kmp_hws_socket.num + __kmp_hws_socket.offset) {
3559 n_old += __kmp_hwloc_skip_PUs_obj(tp, hS); // skip socket
3560 continue; // move to next socket
3561 }
3562 nCr = 0; // count number of cores per socket
3563 // socket requested, go down the topology tree
3564 // check 4 cases: (+NUMA+Tile), (+NUMA-Tile), (-NUMA+Tile), (-NUMA-Tile)
3565 if (numa_support) {
3566 nN = 0;
3567 hN = NULL;
3568 // num nodes in current socket
3569 int NN =
3570 __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_NUMANODE, &hN);
3571 for (int n = 0; n < NN; ++n) {
3572 // Check NUMA Node ----------------------------------------
3573 if (!__kmp_hwloc_obj_has_PUs(tp, hN)) {
3574 hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
3575 continue; // skip node if all PUs are out of fullMask
3576 }
3577 ++nN;
3578 if (nN <= __kmp_hws_node.offset ||
3579 nN > __kmp_hws_node.num + __kmp_hws_node.offset) {
3580 // skip node as not requested
3581 n_old += __kmp_hwloc_skip_PUs_obj(tp, hN); // skip node
3582 hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
3583 continue; // move to next node
3584 }
3585 // node requested, go down the topology tree
3586 if (tile_support) {
3587 nL = 0;
3588 hL = NULL;
3589 int NL = __kmp_hwloc_count_children_by_depth(tp, hN, L2depth, &hL);
3590 for (int l = 0; l < NL; ++l) {
3591 // Check L2 (tile) ------------------------------------
3592 if (!__kmp_hwloc_obj_has_PUs(tp, hL)) {
3593 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
3594 continue; // skip tile if all PUs are out of fullMask
3595 }
3596 ++nL;
3597 if (nL <= __kmp_hws_tile.offset ||
3598 nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) {
3599 // skip tile as not requested
3600 n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile
3601 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
3602 continue; // move to next tile
3603 }
3604 // tile requested, go down the topology tree
3605 nC = 0;
3606 hC = NULL;
3607 // num cores in current tile
3608 int NC = __kmp_hwloc_count_children_by_type(tp, hL,
3609 HWLOC_OBJ_CORE, &hC);
3610 for (int c = 0; c < NC; ++c) {
3611 // Check Core ---------------------------------------
3612 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
3613 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3614 continue; // skip core if all PUs are out of fullMask
3615 }
3616 ++nC;
3617 if (nC <= __kmp_hws_core.offset ||
3618 nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
3619 // skip core as not requested
3620 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
3621 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3622 continue; // move to next core
3623 }
3624 // core requested, go down to PUs
3625 nT = 0;
3626 nTr = 0;
3627 hT = NULL;
3628 // num procs in current core
3629 int NT = __kmp_hwloc_count_children_by_type(tp, hC,
3630 HWLOC_OBJ_PU, &hT);
3631 for (int t = 0; t < NT; ++t) {
3632 // Check PU ---------------------------------------
3633 idx = hT->os_index;
3634 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
3635 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3636 continue; // skip PU if not in fullMask
3637 }
3638 ++nT;
3639 if (nT <= __kmp_hws_proc.offset ||
3640 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
3641 // skip PU
3642 KMP_CPU_CLR(idx, __kmp_affin_fullMask);
3643 ++n_old;
3644 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
3645 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3646 continue; // move to next PU
3647 }
3648 ++nTr;
3649 if (pAddr) // collect requested thread's data
3650 newAddr[n_new] = (*pAddr)[n_old];
3651 ++n_new;
3652 ++n_old;
3653 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3654 } // threads loop
3655 if (nTr > 0) {
3656 ++nCr; // num cores per socket
3657 ++nCo; // total num cores
3658 if (nTr > nTpC)
3659 nTpC = nTr; // calc max threads per core
3660 }
3661 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3662 } // cores loop
3663 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
3664 } // tiles loop
3665 } else { // tile_support
3666 // no tiles, check cores
3667 nC = 0;
3668 hC = NULL;
3669 // num cores in current node
3670 int NC =
3671 __kmp_hwloc_count_children_by_type(tp, hN, HWLOC_OBJ_CORE, &hC);
3672 for (int c = 0; c < NC; ++c) {
3673 // Check Core ---------------------------------------
3674 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
3675 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3676 continue; // skip core if all PUs are out of fullMask
3677 }
3678 ++nC;
3679 if (nC <= __kmp_hws_core.offset ||
3680 nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
3681 // skip core as not requested
3682 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
3683 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3684 continue; // move to next core
3685 }
3686 // core requested, go down to PUs
3687 nT = 0;
3688 nTr = 0;
3689 hT = NULL;
3690 int NT =
3691 __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT);
3692 for (int t = 0; t < NT; ++t) {
3693 // Check PU ---------------------------------------
3694 idx = hT->os_index;
3695 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
3696 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3697 continue; // skip PU if not in fullMask
3698 }
3699 ++nT;
3700 if (nT <= __kmp_hws_proc.offset ||
3701 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
3702 // skip PU
3703 KMP_CPU_CLR(idx, __kmp_affin_fullMask);
3704 ++n_old;
3705 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
3706 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3707 continue; // move to next PU
3708 }
3709 ++nTr;
3710 if (pAddr) // collect requested thread's data
3711 newAddr[n_new] = (*pAddr)[n_old];
3712 ++n_new;
3713 ++n_old;
3714 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3715 } // threads loop
3716 if (nTr > 0) {
3717 ++nCr; // num cores per socket
3718 ++nCo; // total num cores
3719 if (nTr > nTpC)
3720 nTpC = nTr; // calc max threads per core
3721 }
3722 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3723 } // cores loop
3724 } // tiles support
3725 hN = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_NUMANODE, hN);
3726 } // nodes loop
3727 } else { // numa_support
3728 // no NUMA support
3729 if (tile_support) {
3730 nL = 0;
3731 hL = NULL;
3732 // num tiles in current socket
3733 int NL = __kmp_hwloc_count_children_by_depth(tp, hS, L2depth, &hL);
3734 for (int l = 0; l < NL; ++l) {
3735 // Check L2 (tile) ------------------------------------
3736 if (!__kmp_hwloc_obj_has_PUs(tp, hL)) {
3737 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
3738 continue; // skip tile if all PUs are out of fullMask
3739 }
3740 ++nL;
3741 if (nL <= __kmp_hws_tile.offset ||
3742 nL > __kmp_hws_tile.num + __kmp_hws_tile.offset) {
3743 // skip tile as not requested
3744 n_old += __kmp_hwloc_skip_PUs_obj(tp, hL); // skip tile
3745 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
3746 continue; // move to next tile
3747 }
3748 // tile requested, go down the topology tree
3749 nC = 0;
3750 hC = NULL;
3751 // num cores per tile
3752 int NC =
3753 __kmp_hwloc_count_children_by_type(tp, hL, HWLOC_OBJ_CORE, &hC);
3754 for (int c = 0; c < NC; ++c) {
3755 // Check Core ---------------------------------------
3756 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
3757 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3758 continue; // skip core if all PUs are out of fullMask
3759 }
3760 ++nC;
3761 if (nC <= __kmp_hws_core.offset ||
3762 nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
3763 // skip core as not requested
3764 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
3765 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3766 continue; // move to next core
3767 }
3768 // core requested, go down to PUs
3769 nT = 0;
3770 nTr = 0;
3771 hT = NULL;
3772 // num procs per core
3773 int NT =
3774 __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT);
3775 for (int t = 0; t < NT; ++t) {
3776 // Check PU ---------------------------------------
3777 idx = hT->os_index;
3778 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
3779 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3780 continue; // skip PU if not in fullMask
3781 }
3782 ++nT;
3783 if (nT <= __kmp_hws_proc.offset ||
3784 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
3785 // skip PU
3786 KMP_CPU_CLR(idx, __kmp_affin_fullMask);
3787 ++n_old;
3788 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
3789 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3790 continue; // move to next PU
3791 }
3792 ++nTr;
3793 if (pAddr) // collect requested thread's data
3794 newAddr[n_new] = (*pAddr)[n_old];
3795 ++n_new;
3796 ++n_old;
3797 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3798 } // threads loop
3799 if (nTr > 0) {
3800 ++nCr; // num cores per socket
3801 ++nCo; // total num cores
3802 if (nTr > nTpC)
3803 nTpC = nTr; // calc max threads per core
3804 }
3805 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3806 } // cores loop
3807 hL = hwloc_get_next_obj_by_depth(tp, L2depth, hL);
3808 } // tiles loop
3809 } else { // tile_support
3810 // no tiles, check cores
3811 nC = 0;
3812 hC = NULL;
3813 // num cores in socket
3814 int NC =
3815 __kmp_hwloc_count_children_by_type(tp, hS, HWLOC_OBJ_CORE, &hC);
3816 for (int c = 0; c < NC; ++c) {
3817 // Check Core -------------------------------------------
3818 if (!__kmp_hwloc_obj_has_PUs(tp, hC)) {
3819 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3820 continue; // skip core if all PUs are out of fullMask
3821 }
3822 ++nC;
3823 if (nC <= __kmp_hws_core.offset ||
3824 nC > __kmp_hws_core.num + __kmp_hws_core.offset) {
3825 // skip core as not requested
3826 n_old += __kmp_hwloc_skip_PUs_obj(tp, hC); // skip core
3827 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3828 continue; // move to next core
3829 }
3830 // core requested, go down to PUs
3831 nT = 0;
3832 nTr = 0;
3833 hT = NULL;
3834 // num procs per core
3835 int NT =
3836 __kmp_hwloc_count_children_by_type(tp, hC, HWLOC_OBJ_PU, &hT);
3837 for (int t = 0; t < NT; ++t) {
3838 // Check PU ---------------------------------------
3839 idx = hT->os_index;
3840 if (!KMP_CPU_ISSET(idx, __kmp_affin_fullMask)) {
3841 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3842 continue; // skip PU if not in fullMask
3843 }
3844 ++nT;
3845 if (nT <= __kmp_hws_proc.offset ||
3846 nT > __kmp_hws_proc.num + __kmp_hws_proc.offset) {
3847 // skip PU
3848 KMP_CPU_CLR(idx, __kmp_affin_fullMask);
3849 ++n_old;
3850 KC_TRACE(200, ("KMP_HW_SUBSET: skipped proc %d\n", idx));
3851 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3852 continue; // move to next PU
3853 }
3854 ++nTr;
3855 if (pAddr) // collect requested thread's data
3856 newAddr[n_new] = (*pAddr)[n_old];
3857 ++n_new;
3858 ++n_old;
3859 hT = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_PU, hT);
3860 } // threads loop
3861 if (nTr > 0) {
3862 ++nCr; // num cores per socket
3863 ++nCo; // total num cores
3864 if (nTr > nTpC)
3865 nTpC = nTr; // calc max threads per core
3866 }
3867 hC = hwloc_get_next_obj_by_type(tp, HWLOC_OBJ_CORE, hC);
3868 } // cores loop
3869 } // tiles support
3870 } // numa_support
3871 if (nCr > 0) { // found cores?
3872 ++nPkg; // num sockets
3873 if (nCr > nCpP)
3874 nCpP = nCr; // calc max cores per socket
3875 }
3876 } // sockets loop
3877
3878 // check the subset is valid
3879 KMP_DEBUG_ASSERT(n_old == __kmp_avail_proc);
3880 KMP_DEBUG_ASSERT(nPkg > 0);
3881 KMP_DEBUG_ASSERT(nCpP > 0);
3882 KMP_DEBUG_ASSERT(nTpC > 0);
3883 KMP_DEBUG_ASSERT(nCo > 0);
3884 KMP_DEBUG_ASSERT(nPkg <= nPackages);
3885 KMP_DEBUG_ASSERT(nCpP <= nCoresPerPkg);
3886 KMP_DEBUG_ASSERT(nTpC <= __kmp_nThreadsPerCore);
3887 KMP_DEBUG_ASSERT(nCo <= __kmp_ncores);
3888
3889 nPackages = nPkg; // correct num sockets
3890 nCoresPerPkg = nCpP; // correct num cores per socket
3891 __kmp_nThreadsPerCore = nTpC; // correct num threads per core
3892 __kmp_avail_proc = n_new; // correct num procs
3893 __kmp_ncores = nCo; // correct num cores
3894 // hwloc topology method end
3895 } else
3896 #endif // KMP_USE_HWLOC
3897 {
3898 int n_old = 0, n_new = 0, proc_num = 0;
3899 if (__kmp_hws_node.num > 0 || __kmp_hws_tile.num > 0) {
3900 KMP_WARNING(AffHWSubsetNoHWLOC);
3901 goto _exit;
3902 }
3903 if (__kmp_hws_socket.num == 0)
3904 __kmp_hws_socket.num = nPackages; // use all available sockets
3905 if (__kmp_hws_core.num == 0)
3906 __kmp_hws_core.num = nCoresPerPkg; // use all available cores
3907 if (__kmp_hws_proc.num == 0 || __kmp_hws_proc.num > __kmp_nThreadsPerCore)
3908 __kmp_hws_proc.num = __kmp_nThreadsPerCore; // use all HW contexts
3909 if (!__kmp_affinity_uniform_topology()) {
3910 KMP_WARNING(AffHWSubsetNonUniform);
3911 goto _exit; // don't support non-uniform topology
3912 }
3913 if (depth > 3) {
3914 KMP_WARNING(AffHWSubsetNonThreeLevel);
3915 goto _exit; // only 3-level topologies are supported here
3916 }
3917 if (__kmp_hws_socket.offset + __kmp_hws_socket.num > nPackages) {
3918 KMP_WARNING(AffHWSubsetManySockets);
3919 goto _exit;
3920 }
3921 if (__kmp_hws_core.offset + __kmp_hws_core.num > nCoresPerPkg) {
3922 KMP_WARNING(AffHWSubsetManyCores);
3923 goto _exit;
3924 }
3925 // Form the requested subset
3926 if (pAddr) // pAddr is NULL in case of affinity_none
3927 newAddr = (AddrUnsPair *)__kmp_allocate(
3928 sizeof(AddrUnsPair) * __kmp_hws_socket.num * __kmp_hws_core.num *
3929 __kmp_hws_proc.num);
3930 for (int i = 0; i < nPackages; ++i) {
3931 if (i < __kmp_hws_socket.offset ||
3932 i >= __kmp_hws_socket.offset + __kmp_hws_socket.num) {
3933 // skip not-requested socket
3934 n_old += nCoresPerPkg * __kmp_nThreadsPerCore;
3935 if (__kmp_pu_os_idx != NULL) {
3936 // walk through skipped socket
3937 for (int j = 0; j < nCoresPerPkg; ++j) {
3938 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
3939 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
3940 ++proc_num;
3941 }
3942 }
3943 }
3944 } else {
3945 // walk through requested socket
3946 for (int j = 0; j < nCoresPerPkg; ++j) {
3947 if (j < __kmp_hws_core.offset ||
3948 j >= __kmp_hws_core.offset +
3949 __kmp_hws_core.num) { // skip not-requested core
3950 n_old += __kmp_nThreadsPerCore;
3951 if (__kmp_pu_os_idx != NULL) {
3952 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
3953 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
3954 ++proc_num;
3955 }
3956 }
3957 } else {
3958 // walk through requested core
3959 for (int k = 0; k < __kmp_nThreadsPerCore; ++k) {
3960 if (k < __kmp_hws_proc.num) {
3961 if (pAddr) // collect requested thread's data
3962 newAddr[n_new] = (*pAddr)[n_old];
3963 n_new++;
3964 } else {
3965 if (__kmp_pu_os_idx != NULL)
3966 KMP_CPU_CLR(__kmp_pu_os_idx[proc_num], __kmp_affin_fullMask);
3967 }
3968 n_old++;
3969 ++proc_num;
3970 }
3971 }
3972 }
3973 }
3974 }
3975 KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * __kmp_nThreadsPerCore);
3976 KMP_DEBUG_ASSERT(n_new ==
3977 __kmp_hws_socket.num * __kmp_hws_core.num *
3978 __kmp_hws_proc.num);
3979 nPackages = __kmp_hws_socket.num; // correct nPackages
3980 nCoresPerPkg = __kmp_hws_core.num; // correct nCoresPerPkg
3981 __kmp_nThreadsPerCore = __kmp_hws_proc.num; // correct __kmp_nThreadsPerCore
3982 __kmp_avail_proc = n_new; // correct avail_proc
3983 __kmp_ncores = nPackages * __kmp_hws_core.num; // correct ncores
3984 } // non-hwloc topology method
3985 if (pAddr) {
3986 __kmp_free(*pAddr);
3987 *pAddr = newAddr; // replace old topology with new one
3988 }
3989 if (__kmp_affinity_verbose) {
3990 char m[KMP_AFFIN_MASK_PRINT_LEN];
3991 __kmp_affinity_print_mask(m, KMP_AFFIN_MASK_PRINT_LEN,
3992 __kmp_affin_fullMask);
3993 if (__kmp_affinity_respect_mask) {
3994 KMP_INFORM(InitOSProcSetRespect, "KMP_HW_SUBSET", m);
3995 } else {
3996 KMP_INFORM(InitOSProcSetNotRespect, "KMP_HW_SUBSET", m);
3997 }
3998 KMP_INFORM(AvailableOSProc, "KMP_HW_SUBSET", __kmp_avail_proc);
3999 kmp_str_buf_t buf;
4000 __kmp_str_buf_init(&buf);
4001 __kmp_str_buf_print(&buf, "%d", nPackages);
4002 KMP_INFORM(TopologyExtra, "KMP_HW_SUBSET", buf.str, nCoresPerPkg,
4003 __kmp_nThreadsPerCore, __kmp_ncores);
4004 __kmp_str_buf_free(&buf);
4005 }
4006 _exit:
4007 if (__kmp_pu_os_idx != NULL) {
4008 __kmp_free(__kmp_pu_os_idx);
4009 __kmp_pu_os_idx = NULL;
4010 }
4011 }
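// Worked example for the non-hwloc branch above (hypothetical machine): on a
// uniform 4-socket x 8-core x 2-thread topology, __kmp_hws_socket.num == 2,
// __kmp_hws_core.num == 2 and __kmp_hws_proc.num == 1 keep 2 * 2 * 1 == 4
// procs, so afterwards nPackages == 2, nCoresPerPkg == 2,
// __kmp_nThreadsPerCore == 1 and __kmp_avail_proc == 4.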
4012
4013 // This function figures out the deepest level at which there is at least one
4014 // cluster/core with more than one processing unit bound to it.
4015 static int __kmp_affinity_find_core_level(const AddrUnsPair *address2os,
4016 int nprocs, int bottom_level) {
4017 int core_level = 0;
4018
4019 for (int i = 0; i < nprocs; i++) {
4020 for (int j = bottom_level; j > 0; j--) {
4021 if (address2os[i].first.labels[j] > 0) {
4022 if (core_level < (j - 1)) {
4023 core_level = j - 1;
4024 }
4025 }
4026 }
4027 }
4028 return core_level;
4029 }
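// Example: with depth-3 addresses (package, core, thread) and bottom_level
// == 2, any address whose thread label is nonzero (i.e., SMT is present)
// pushes core_level to 1, the core layer; if every core exposes a single PU
// the function falls back to a shallower level.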
4030
4031 // This function counts number of clusters/cores at given level.
4032 static int __kmp_affinity_compute_ncores(const AddrUnsPair *address2os,
4033 int nprocs, int bottom_level,
4034 int core_level) {
4035 int ncores = 0;
4036 int i, j;
4037
4038 j = bottom_level;
4039 for (i = 0; i < nprocs; i++) {
4040 for (j = bottom_level; j > core_level; j--) {
4041 if ((i + 1) < nprocs) {
4042 if (address2os[i + 1].first.labels[j] > 0) {
4043 break;
4044 }
4045 }
4046 }
4047 if (j == core_level) {
4048 ncores++;
4049 }
4050 }
4051 if (j > core_level) {
4052 // In case of (nprocs < __kmp_avail_proc) we may end up too deep in the tree
4053 // and miss one core. This may occur when called from __kmp_affinity_find_core().
4054 ncores++;
4055 }
4056 return ncores;
4057 }
4058
4059 // This function finds to which cluster/core given processing unit is bound.
4060 static int __kmp_affinity_find_core(const AddrUnsPair *address2os, int proc,
4061 int bottom_level, int core_level) {
4062 return __kmp_affinity_compute_ncores(address2os, proc + 1, bottom_level,
4063 core_level) -
4064 1;
4065 }
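// Example (assuming the usual zero-based labels): if the sorted address2os
// holds (core 0: procs 0,1), (core 1: procs 2,3), then
// __kmp_affinity_find_core(address2os, 2, bottom_level, core_level) returns
// 1 -- the zero-based index of the core that entry 2 belongs to.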
4066
4067 // This function finds maximal number of processing units bound to a
4068 // cluster/core at given level.
4069 static int __kmp_affinity_max_proc_per_core(const AddrUnsPair *address2os,
4070 int nprocs, int bottom_level,
4071 int core_level) {
4072 int maxprocpercore = 0;
4073
4074 if (core_level < bottom_level) {
4075 for (int i = 0; i < nprocs; i++) {
4076 int percore = address2os[i].first.labels[core_level + 1] + 1;
4077
4078 if (percore > maxprocpercore) {
4079 maxprocpercore = percore;
4080 }
4081 }
4082 } else {
4083 maxprocpercore = 1;
4084 }
4085 return maxprocpercore;
4086 }
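// Example: labels[core_level + 1] is the zero-based index of a PU within its
// core, so on a machine with 2-way SMT the largest such label is 1 and the
// function returns 2; when core_level is already the bottom level there is
// exactly one PU per "core" and 1 is returned.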
4087
4088 static AddrUnsPair *address2os = NULL;
4089 static int *procarr = NULL;
4090 static int __kmp_aff_depth = 0;
4091
4092 #if KMP_USE_HIER_SCHED
4093 #define KMP_EXIT_AFF_NONE \
4094 KMP_ASSERT(__kmp_affinity_type == affinity_none); \
4095 KMP_ASSERT(address2os == NULL); \
4096 __kmp_apply_thread_places(NULL, 0); \
4097 __kmp_create_affinity_none_places(); \
4098 __kmp_dispatch_set_hierarchy_values(); \
4099 return;
4100 #else
4101 #define KMP_EXIT_AFF_NONE \
4102 KMP_ASSERT(__kmp_affinity_type == affinity_none); \
4103 KMP_ASSERT(address2os == NULL); \
4104 __kmp_apply_thread_places(NULL, 0); \
4105 __kmp_create_affinity_none_places(); \
4106 return;
4107 #endif
4108
4109 // Create a one element mask array (set of places) which only contains the
4110 // initial process's affinity mask
4111 static void __kmp_create_affinity_none_places() {
4112 KMP_ASSERT(__kmp_affin_fullMask != NULL);
4113 KMP_ASSERT(__kmp_affinity_type == affinity_none);
4114 __kmp_affinity_num_masks = 1;
4115 KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
4116 kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, 0);
4117 KMP_CPU_COPY(dest, __kmp_affin_fullMask);
4118 }
4119
4120 static int __kmp_affinity_cmp_Address_child_num(const void *a, const void *b) {
4121 const Address *aa = &(((const AddrUnsPair *)a)->first);
4122 const Address *bb = &(((const AddrUnsPair *)b)->first);
4123 unsigned depth = aa->depth;
4124 unsigned i;
4125 KMP_DEBUG_ASSERT(depth == bb->depth);
4126 KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
4127 KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
4128 for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
4129 int j = depth - i - 1;
4130 if (aa->childNums[j] < bb->childNums[j])
4131 return -1;
4132 if (aa->childNums[j] > bb->childNums[j])
4133 return 1;
4134 }
4135 for (; i < depth; i++) {
4136 int j = i - __kmp_affinity_compact;
4137 if (aa->childNums[j] < bb->childNums[j])
4138 return -1;
4139 if (aa->childNums[j] > bb->childNums[j])
4140 return 1;
4141 }
4142 return 0;
4143 }
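// Example of the permuted comparison above: with depth == 3 (package, core,
// thread) and __kmp_affinity_compact == 1, addresses are ordered first by
// childNums[2] (the thread level), then by childNums[0] (package) and
// childNums[1] (core). Larger compact values move more of the innermost
// levels to the front of the sort key, which is how the scatter/compact
// policies below spread or pack threads across the topology.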
4144
4145 static void __kmp_aux_affinity_initialize(void) {
4146 if (__kmp_affinity_masks != NULL) {
4147 KMP_ASSERT(__kmp_affin_fullMask != NULL);
4148 return;
4149 }
4150
4151 // Create the "full" mask - this defines all of the processors that we
4152 // consider to be in the machine model. If respect is set, then it is the
4153 // initialization thread's affinity mask. Otherwise, it is all processors that
4154 // we know about on the machine.
4155 if (__kmp_affin_fullMask == NULL) {
4156 KMP_CPU_ALLOC(__kmp_affin_fullMask);
4157 }
4158 if (KMP_AFFINITY_CAPABLE()) {
4159 if (__kmp_affinity_respect_mask) {
4160 __kmp_get_system_affinity(__kmp_affin_fullMask, TRUE);
4161
4162 // Count the number of available processors.
4163 unsigned i;
4164 __kmp_avail_proc = 0;
4165 KMP_CPU_SET_ITERATE(i, __kmp_affin_fullMask) {
4166 if (!KMP_CPU_ISSET(i, __kmp_affin_fullMask)) {
4167 continue;
4168 }
4169 __kmp_avail_proc++;
4170 }
4171 if (__kmp_avail_proc > __kmp_xproc) {
4172 if (__kmp_affinity_verbose ||
4173 (__kmp_affinity_warnings &&
4174 (__kmp_affinity_type != affinity_none))) {
4175 KMP_WARNING(ErrorInitializeAffinity);
4176 }
4177 __kmp_affinity_type = affinity_none;
4178 KMP_AFFINITY_DISABLE();
4179 return;
4180 }
4181 } else {
4182 __kmp_affinity_entire_machine_mask(__kmp_affin_fullMask);
4183 __kmp_avail_proc = __kmp_xproc;
4184 }
4185 }
4186
4187 if (__kmp_affinity_gran == affinity_gran_tile &&
4188 // check if user's request is valid
4189 __kmp_affinity_dispatch->get_api_type() == KMPAffinity::NATIVE_OS) {
4190 KMP_WARNING(AffTilesNoHWLOC, "KMP_AFFINITY");
4191 __kmp_affinity_gran = affinity_gran_package;
4192 }
4193
4194 int depth = -1;
4195 kmp_i18n_id_t msg_id = kmp_i18n_null;
4196
4197 // For backward compatibility, setting KMP_CPUINFO_FILE =>
4198 // KMP_TOPOLOGY_METHOD=cpuinfo
4199 if ((__kmp_cpuinfo_file != NULL) &&
4200 (__kmp_affinity_top_method == affinity_top_method_all)) {
4201 __kmp_affinity_top_method = affinity_top_method_cpuinfo;
4202 }
4203
4204 if (__kmp_affinity_top_method == affinity_top_method_all) {
4205 // In the default code path, errors are not fatal - we just try using
4206 // another method. We only emit a warning message if affinity is on, or the
4207 // verbose flag is set, and the nowarnings flag was not set.
4208 const char *file_name = NULL;
4209 int line = 0;
4210 #if KMP_USE_HWLOC
4211 if (depth < 0 &&
4212 __kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC) {
4213 if (__kmp_affinity_verbose) {
4214 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
4215 }
4216 if (!__kmp_hwloc_error) {
4217 depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
4218 if (depth == 0) {
4219 KMP_EXIT_AFF_NONE;
4220 } else if (depth < 0 && __kmp_affinity_verbose) {
4221 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
4222 }
4223 } else if (__kmp_affinity_verbose) {
4224 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY");
4225 }
4226 }
4227 #endif
4228
4229 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4230
4231 if (depth < 0) {
4232 if (__kmp_affinity_verbose) {
4233 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
4234 }
4235
4236 file_name = NULL;
4237 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
4238 if (depth == 0) {
4239 KMP_EXIT_AFF_NONE;
4240 }
4241
4242 if (depth < 0) {
4243 if (__kmp_affinity_verbose) {
4244 if (msg_id != kmp_i18n_null) {
4245 KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY",
4246 __kmp_i18n_catgets(msg_id),
4247 KMP_I18N_STR(DecodingLegacyAPIC));
4248 } else {
4249 KMP_INFORM(AffInfoStr, "KMP_AFFINITY",
4250 KMP_I18N_STR(DecodingLegacyAPIC));
4251 }
4252 }
4253
4254 file_name = NULL;
4255 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
4256 if (depth == 0) {
4257 KMP_EXIT_AFF_NONE;
4258 }
4259 }
4260 }
4261
4262 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4263
4264 #if KMP_OS_LINUX
4265
4266 if (depth < 0) {
4267 if (__kmp_affinity_verbose) {
4268 if (msg_id != kmp_i18n_null) {
4269 KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY",
4270 __kmp_i18n_catgets(msg_id), "/proc/cpuinfo");
4271 } else {
4272 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo");
4273 }
4274 }
4275
4276 FILE *f = fopen("/proc/cpuinfo", "r");
4277 if (f == NULL) {
4278 msg_id = kmp_i18n_str_CantOpenCpuinfo;
4279 } else {
4280 file_name = "/proc/cpuinfo";
4281 depth =
4282 __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
4283 fclose(f);
4284 if (depth == 0) {
4285 KMP_EXIT_AFF_NONE;
4286 }
4287 }
4288 }
4289
4290 #endif /* KMP_OS_LINUX */
4291
4292 #if KMP_GROUP_AFFINITY
4293
4294 if ((depth < 0) && (__kmp_num_proc_groups > 1)) {
4295 if (__kmp_affinity_verbose) {
4296 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
4297 }
4298
4299 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
4300 KMP_ASSERT(depth != 0);
4301 }
4302
4303 #endif /* KMP_GROUP_AFFINITY */
4304
4305 if (depth < 0) {
4306 if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) {
4307 if (file_name == NULL) {
4308 KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id));
4309 } else if (line == 0) {
4310 KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id));
4311 } else {
4312 KMP_INFORM(UsingFlatOSFileLine, file_name, line,
4313 __kmp_i18n_catgets(msg_id));
4314 }
4315 }
4316 // FIXME - print msg if msg_id = kmp_i18n_null ???
4317
4318 file_name = "";
4319 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
4320 if (depth == 0) {
4321 KMP_EXIT_AFF_NONE;
4322 }
4323 KMP_ASSERT(depth > 0);
4324 KMP_ASSERT(address2os != NULL);
4325 }
4326 }
4327
4328 #if KMP_USE_HWLOC
4329 else if (__kmp_affinity_top_method == affinity_top_method_hwloc) {
4330 KMP_ASSERT(__kmp_affinity_dispatch->get_api_type() == KMPAffinity::HWLOC);
4331 if (__kmp_affinity_verbose) {
4332 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
4333 }
4334 depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id);
4335 if (depth == 0) {
4336 KMP_EXIT_AFF_NONE;
4337 }
4338 }
4339 #endif // KMP_USE_HWLOC
4340
4341 // If the user has specified that a particular topology discovery method is
4342 // to be used, then we abort if that method fails. The exception is group
4343 // affinity, which might have been implicitly set.
4344
4345 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4346
4347 else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) {
4348 if (__kmp_affinity_verbose) {
4349 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC));
4350 }
4351
4352 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id);
4353 if (depth == 0) {
4354 KMP_EXIT_AFF_NONE;
4355 }
4356 if (depth < 0) {
4357 KMP_ASSERT(msg_id != kmp_i18n_null);
4358 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4359 }
4360 } else if (__kmp_affinity_top_method == affinity_top_method_apicid) {
4361 if (__kmp_affinity_verbose) {
4362 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC));
4363 }
4364
4365 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id);
4366 if (depth == 0) {
4367 KMP_EXIT_AFF_NONE;
4368 }
4369 if (depth < 0) {
4370 KMP_ASSERT(msg_id != kmp_i18n_null);
4371 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4372 }
4373 }
4374
4375 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4376
4377 else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) {
4378 const char *filename;
4379 if (__kmp_cpuinfo_file != NULL) {
4380 filename = __kmp_cpuinfo_file;
4381 } else {
4382 filename = "/proc/cpuinfo";
4383 }
4384
4385 if (__kmp_affinity_verbose) {
4386 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename);
4387 }
4388
4389 FILE *f = fopen(filename, "r");
4390 if (f == NULL) {
4391 int code = errno;
4392 if (__kmp_cpuinfo_file != NULL) {
4393 __kmp_fatal(KMP_MSG(CantOpenFileForReading, filename), KMP_ERR(code),
4394 KMP_HNT(NameComesFrom_CPUINFO_FILE), __kmp_msg_null);
4395 } else {
4396 __kmp_fatal(KMP_MSG(CantOpenFileForReading, filename), KMP_ERR(code),
4397 __kmp_msg_null);
4398 }
4399 }
4400 int line = 0;
4401 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f);
4402 fclose(f);
4403 if (depth < 0) {
4404 KMP_ASSERT(msg_id != kmp_i18n_null);
4405 if (line > 0) {
4406 KMP_FATAL(FileLineMsgExiting, filename, line,
4407 __kmp_i18n_catgets(msg_id));
4408 } else {
4409 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id));
4410 }
4411 }
4412 if (__kmp_affinity_type == affinity_none) {
4413 KMP_ASSERT(depth == 0);
4414 KMP_EXIT_AFF_NONE;
4415 }
4416 }
4417
4418 #if KMP_GROUP_AFFINITY
4419
4420 else if (__kmp_affinity_top_method == affinity_top_method_group) {
4421 if (__kmp_affinity_verbose) {
4422 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY");
4423 }
4424
4425 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id);
4426 KMP_ASSERT(depth != 0);
4427 if (depth < 0) {
4428 KMP_ASSERT(msg_id != kmp_i18n_null);
4429 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id));
4430 }
4431 }
4432
4433 #endif /* KMP_GROUP_AFFINITY */
4434
4435 else if (__kmp_affinity_top_method == affinity_top_method_flat) {
4436 if (__kmp_affinity_verbose) {
4437 KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY");
4438 }
4439
4440 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id);
4441 if (depth == 0) {
4442 KMP_EXIT_AFF_NONE;
4443 }
4444 // should not fail
4445 KMP_ASSERT(depth > 0);
4446 KMP_ASSERT(address2os != NULL);
4447 }
4448
4449 #if KMP_USE_HIER_SCHED
4450 __kmp_dispatch_set_hierarchy_values();
4451 #endif
4452
4453 if (address2os == NULL) {
4454 if (KMP_AFFINITY_CAPABLE() &&
4455 (__kmp_affinity_verbose ||
4456 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none)))) {
4457 KMP_WARNING(ErrorInitializeAffinity);
4458 }
4459 __kmp_affinity_type = affinity_none;
4460 __kmp_create_affinity_none_places();
4461 KMP_AFFINITY_DISABLE();
4462 return;
4463 }
4464
4465 if (__kmp_affinity_gran == affinity_gran_tile
4466 #if KMP_USE_HWLOC
4467 && __kmp_tile_depth == 0
4468 #endif
4469 ) {
4470 // tiles were requested but not detected; warn the user
4471 KMP_WARNING(AffTilesNoTiles, "KMP_AFFINITY");
4472 }
4473
4474 __kmp_apply_thread_places(&address2os, depth);
4475
4476 // Create the table of masks, indexed by thread Id.
4477 unsigned maxIndex;
4478 unsigned numUnique;
4479 kmp_affin_mask_t *osId2Mask =
4480 __kmp_create_masks(&maxIndex, &numUnique, address2os, __kmp_avail_proc);
4481 if (__kmp_affinity_gran_levels == 0) {
4482 KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc);
4483 }
4484
4485 // Set the childNums vector in all Address objects. This must be done before
4486 // we can sort using __kmp_affinity_cmp_Address_child_num(), which takes into
4487 // account the setting of __kmp_affinity_compact.
4488 __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);
4489
4490 switch (__kmp_affinity_type) {
4491
4492 case affinity_explicit:
4493 KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
4494 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) {
4495 __kmp_affinity_process_proclist(
4496 &__kmp_affinity_masks, &__kmp_affinity_num_masks,
4497 __kmp_affinity_proclist, osId2Mask, maxIndex);
4498 } else {
4499 __kmp_affinity_process_placelist(
4500 &__kmp_affinity_masks, &__kmp_affinity_num_masks,
4501 __kmp_affinity_proclist, osId2Mask, maxIndex);
4502 }
4503 if (__kmp_affinity_num_masks == 0) {
4504 if (__kmp_affinity_verbose ||
4505 (__kmp_affinity_warnings && (__kmp_affinity_type != affinity_none))) {
4506 KMP_WARNING(AffNoValidProcID);
4507 }
4508 __kmp_affinity_type = affinity_none;
4509 __kmp_create_affinity_none_places();
4510 return;
4511 }
4512 break;
4513
4514 // The other affinity types rely on sorting the Addresses according to some
4515 // permutation of the machine topology tree. Set __kmp_affinity_compact and
4516 // __kmp_affinity_offset appropriately, then jump to a common code fragment
4517 // to do the sort and create the array of affinity masks.
4518
4519 case affinity_logical:
4520 __kmp_affinity_compact = 0;
4521 if (__kmp_affinity_offset) {
4522 __kmp_affinity_offset =
4523 __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
4524 }
4525 goto sortAddresses;
4526
4527 case affinity_physical:
4528 if (__kmp_nThreadsPerCore > 1) {
4529 __kmp_affinity_compact = 1;
4530 if (__kmp_affinity_compact >= depth) {
4531 __kmp_affinity_compact = 0;
4532 }
4533 } else {
4534 __kmp_affinity_compact = 0;
4535 }
4536 if (__kmp_affinity_offset) {
4537 __kmp_affinity_offset =
4538 __kmp_nThreadsPerCore * __kmp_affinity_offset % __kmp_avail_proc;
4539 }
4540 goto sortAddresses;
4541
4542 case affinity_scatter:
4543 if (__kmp_affinity_compact >= depth) {
4544 __kmp_affinity_compact = 0;
4545 } else {
4546 __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
4547 }
4548 goto sortAddresses;
4549
4550 case affinity_compact:
4551 if (__kmp_affinity_compact >= depth) {
4552 __kmp_affinity_compact = depth - 1;
4553 }
4554 goto sortAddresses;
4555
4556 case affinity_balanced:
4557 if (depth <= 1) {
4558 if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
4559 KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
4560 }
4561 __kmp_affinity_type = affinity_none;
4562 __kmp_create_affinity_none_places();
4563 return;
4564 } else if (!__kmp_affinity_uniform_topology()) {
4565 // Save the depth for further usage
4566 __kmp_aff_depth = depth;
4567
4568 int core_level = __kmp_affinity_find_core_level(
4569 address2os, __kmp_avail_proc, depth - 1);
4570 int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc,
4571 depth - 1, core_level);
4572 int maxprocpercore = __kmp_affinity_max_proc_per_core(
4573 address2os, __kmp_avail_proc, depth - 1, core_level);
4574
4575 int nproc = ncores * maxprocpercore;
4576 if ((nproc < 2) || (nproc < __kmp_avail_proc)) {
4577 if (__kmp_affinity_verbose || __kmp_affinity_warnings) {
4578 KMP_WARNING(AffBalancedNotAvail, "KMP_AFFINITY");
4579 }
4580 __kmp_affinity_type = affinity_none;
4581 return;
4582 }
4583
4584 procarr = (int *)__kmp_allocate(sizeof(int) * nproc);
4585 for (int i = 0; i < nproc; i++) {
4586 procarr[i] = -1;
4587 }
4588
4589 int lastcore = -1;
4590 int inlastcore = 0;
4591 for (int i = 0; i < __kmp_avail_proc; i++) {
4592 int proc = address2os[i].second;
4593 int core =
4594 __kmp_affinity_find_core(address2os, i, depth - 1, core_level);
4595
4596 if (core == lastcore) {
4597 inlastcore++;
4598 } else {
4599 inlastcore = 0;
4600 }
4601 lastcore = core;
4602
4603 procarr[core * maxprocpercore + inlastcore] = proc;
4604 }
4605 }
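      // Illustrative procarr layout (hypothetical OS proc ids): with
      // ncores == 2 and maxprocpercore == 2, a machine whose core 0 holds
      // procs 3 and 7 while core 1 holds only proc 12 produces
      //   procarr = { 3, 7, 12, -1 }
      // i.e., a dense ncores x maxprocpercore grid padded with -1 entries,
      // consumed later by the balanced-affinity placement code.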
4606 if (__kmp_affinity_compact >= depth) {
4607 __kmp_affinity_compact = depth - 1;
4608 }
4609
4610 sortAddresses:
4611 // Allocate the gtid->affinity mask table.
4612 if (__kmp_affinity_dups) {
4613 __kmp_affinity_num_masks = __kmp_avail_proc;
4614 } else {
4615 __kmp_affinity_num_masks = numUnique;
4616 }
4617
4618 if ((__kmp_nested_proc_bind.bind_types[0] != proc_bind_intel) &&
4619 (__kmp_affinity_num_places > 0) &&
4620 ((unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks)) {
4621 __kmp_affinity_num_masks = __kmp_affinity_num_places;
4622 }
4623
4624 KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
4625
4626 // Sort the address2os table according to the current setting of
4627 // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
4628 qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
4629 __kmp_affinity_cmp_Address_child_num);
4630 {
4631 int i;
4632 unsigned j;
4633 for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
4634 if ((!__kmp_affinity_dups) && (!address2os[i].first.leader)) {
4635 continue;
4636 }
4637 unsigned osId = address2os[i].second;
4638 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
4639 kmp_affin_mask_t *dest = KMP_CPU_INDEX(__kmp_affinity_masks, j);
4640 KMP_ASSERT(KMP_CPU_ISSET(osId, src));
4641 KMP_CPU_COPY(dest, src);
4642 if (++j >= __kmp_affinity_num_masks) {
4643 break;
4644 }
4645 }
4646 KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
4647 }
4648 break;
4649
4650 default:
4651 KMP_ASSERT2(0, "Unexpected affinity setting");
4652 }
4653
4654 KMP_CPU_FREE_ARRAY(osId2Mask, maxIndex + 1);
4655 machine_hierarchy.init(address2os, __kmp_avail_proc);
4656 }
4657 #undef KMP_EXIT_AFF_NONE
4658
4659 void __kmp_affinity_initialize(void) {
4660 // Much of the code above was written assuming that if a machine was not
4661 // affinity capable, then __kmp_affinity_type == affinity_none. We now
4662 // explicitly represent this as __kmp_affinity_type == affinity_disabled.
4663 // There are too many checks for __kmp_affinity_type == affinity_none
4664 // in this code. Instead of trying to change them all, check if
4665 // __kmp_affinity_type == affinity_disabled, and if so, slam it with
4666 // affinity_none, call the real initialization routine, then restore
4667 // __kmp_affinity_type to affinity_disabled.
4668 int disabled = (__kmp_affinity_type == affinity_disabled);
4669 if (!KMP_AFFINITY_CAPABLE()) {
4670 KMP_ASSERT(disabled);
4671 }
4672 if (disabled) {
4673 __kmp_affinity_type = affinity_none;
4674 }
4675 __kmp_aux_affinity_initialize();
4676 if (disabled) {
4677 __kmp_affinity_type = affinity_disabled;
4678 }
4679 }
4680
4681 void __kmp_affinity_uninitialize(void) {
4682 if (__kmp_affinity_masks != NULL) {
4683 KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
4684 __kmp_affinity_masks = NULL;
4685 }
4686 if (__kmp_affin_fullMask != NULL) {
4687 KMP_CPU_FREE(__kmp_affin_fullMask);
4688 __kmp_affin_fullMask = NULL;
4689 }
4690 __kmp_affinity_num_masks = 0;
4691 __kmp_affinity_type = affinity_default;
4692 __kmp_affinity_num_places = 0;
4693 if (__kmp_affinity_proclist != NULL) {
4694 __kmp_free(__kmp_affinity_proclist);
4695 __kmp_affinity_proclist = NULL;
4696 }
4697 if (address2os != NULL) {
4698 __kmp_free(address2os);
4699 address2os = NULL;
4700 }
4701 if (procarr != NULL) {
4702 __kmp_free(procarr);
4703 procarr = NULL;
4704 }
4705 #if KMP_USE_HWLOC
4706 if (__kmp_hwloc_topology != NULL) {
4707 hwloc_topology_destroy(__kmp_hwloc_topology);
4708 __kmp_hwloc_topology = NULL;
4709 }
4710 #endif
4711 KMPAffinity::destroy_api();
4712 }
4713
4714 void __kmp_affinity_set_init_mask(int gtid, int isa_root) {
4715 if (!KMP_AFFINITY_CAPABLE()) {
4716 return;
4717 }
4718
4719 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4720 if (th->th.th_affin_mask == NULL) {
4721 KMP_CPU_ALLOC(th->th.th_affin_mask);
4722 } else {
4723 KMP_CPU_ZERO(th->th.th_affin_mask);
4724 }
4725
4726 // Copy the thread mask to the kmp_info_t structure. If
4727 // __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one that
4728 // has all of the OS proc ids set, or if __kmp_affinity_respect_mask is set,
4729 // then the full mask is the same as the mask of the initialization thread.
4730 kmp_affin_mask_t *mask;
4731 int i;
4732
4733 if (KMP_AFFINITY_NON_PROC_BIND) {
4734 if ((__kmp_affinity_type == affinity_none) ||
4735 (__kmp_affinity_type == affinity_balanced)) {
4736 #if KMP_GROUP_AFFINITY
4737 if (__kmp_num_proc_groups > 1) {
4738 return;
4739 }
4740 #endif
4741 KMP_ASSERT(__kmp_affin_fullMask != NULL);
4742 i = 0;
4743 mask = __kmp_affin_fullMask;
4744 } else {
4745 KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
4746 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4747 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4748 }
4749 } else {
4750 if ((!isa_root) ||
4751 (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
4752 #if KMP_GROUP_AFFINITY
4753 if (__kmp_num_proc_groups > 1) {
4754 return;
4755 }
4756 #endif
4757 KMP_ASSERT(__kmp_affin_fullMask != NULL);
4758 i = KMP_PLACE_ALL;
4759 mask = __kmp_affin_fullMask;
4760 } else {
4761 // int i = some hash function or just a counter that doesn't
4762 // always start at 0. Use gtid for now.
4763 KMP_DEBUG_ASSERT(__kmp_affinity_num_masks > 0);
4764 i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
4765 mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
4766 }
4767 }
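  // Example of the place assignment above (hypothetical counts): with
  // __kmp_affinity_num_masks == 4 and __kmp_affinity_offset == 1, gtids
  // 0, 1, 2, 3 are bound to places 1, 2, 3, 0 respectively, since
  // i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks.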
4768
4769 th->th.th_current_place = i;
4770 if (isa_root) {
4771 th->th.th_new_place = i;
4772 th->th.th_first_place = 0;
4773 th->th.th_last_place = __kmp_affinity_num_masks - 1;
4774 } else if (KMP_AFFINITY_NON_PROC_BIND) {
4775 // When using a Non-OMP_PROC_BIND affinity method,
4776 // set all threads' place-partition-var to the entire place list
4777 th->th.th_first_place = 0;
4778 th->th.th_last_place = __kmp_affinity_num_masks - 1;
4779 }
4780
4781 if (i == KMP_PLACE_ALL) {
4782 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
4783 gtid));
4784 } else {
4785 KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
4786 gtid, i));
4787 }
4788
4789 KMP_CPU_COPY(th->th.th_affin_mask, mask);
4790
4791 if (__kmp_affinity_verbose
4792 /* to avoid duplicate printing (will be correctly printed on barrier) */
4793 && (__kmp_affinity_type == affinity_none ||
4794 (i != KMP_PLACE_ALL && __kmp_affinity_type != affinity_balanced))) {
4795 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4796 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4797 th->th.th_affin_mask);
4798 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
4799 __kmp_gettid(), gtid, buf);
4800 }
4801
4802 #if KMP_OS_WINDOWS
4803 // On Windows* OS, the process affinity mask might have changed. If the user
4804 // didn't request affinity and this call fails, just continue silently.
4805 // See CQ171393.
4806 if (__kmp_affinity_type == affinity_none) {
4807 __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
4808 } else
4809 #endif
4810 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4811 }
4812
4813 void __kmp_affinity_set_place(int gtid) {
4814 if (!KMP_AFFINITY_CAPABLE()) {
4815 return;
4816 }
4817
4818 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
4819
4820 KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current "
4821 "place = %d)\n",
4822 gtid, th->th.th_new_place, th->th.th_current_place));
4823
4824 // Check that the new place is within this thread's partition.
4825 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4826 KMP_ASSERT(th->th.th_new_place >= 0);
4827 KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
4828 if (th->th.th_first_place <= th->th.th_last_place) {
4829 KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) &&
4830 (th->th.th_new_place <= th->th.th_last_place));
4831 } else {
4832 KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) ||
4833 (th->th.th_new_place >= th->th.th_last_place));
4834 }
4835
4836 // Copy the thread mask to the kmp_info_t structure,
4837 // and set this thread's affinity.
4838 kmp_affin_mask_t *mask =
4839 KMP_CPU_INDEX(__kmp_affinity_masks, th->th.th_new_place);
4840 KMP_CPU_COPY(th->th.th_affin_mask, mask);
4841 th->th.th_current_place = th->th.th_new_place;
4842
4843 if (__kmp_affinity_verbose) {
4844 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4845 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4846 th->th.th_affin_mask);
4847 KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
4848 __kmp_gettid(), gtid, buf);
4849 }
4850 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
4851 }
4852
4853 int __kmp_aux_set_affinity(void **mask) {
4854 int gtid;
4855 kmp_info_t *th;
4856 int retval;
4857
4858 if (!KMP_AFFINITY_CAPABLE()) {
4859 return -1;
4860 }
4861
4862 gtid = __kmp_entry_gtid();
4863 KA_TRACE(1000, (""); {
4864 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4865 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4866 (kmp_affin_mask_t *)(*mask));
4867 __kmp_debug_printf(
4868 "kmp_set_affinity: setting affinity mask for thread %d = %s\n", gtid,
4869 buf);
4870 });
4871
4872 if (__kmp_env_consistency_check) {
4873 if ((mask == NULL) || (*mask == NULL)) {
4874 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4875 } else {
4876 unsigned proc;
4877 int num_procs = 0;
4878
4879 KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t *)(*mask))) {
4880 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
4881 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4882 }
4883 if (!KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
4884 continue;
4885 }
4886 num_procs++;
4887 }
4888 if (num_procs == 0) {
4889 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4890 }
4891
4892 #if KMP_GROUP_AFFINITY
4893 if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
4894 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
4895 }
4896 #endif /* KMP_GROUP_AFFINITY */
4897 }
4898 }
4899
4900 th = __kmp_threads[gtid];
4901 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4902 retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4903 if (retval == 0) {
4904 KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
4905 }
4906
4907 th->th.th_current_place = KMP_PLACE_UNDEFINED;
4908 th->th.th_new_place = KMP_PLACE_UNDEFINED;
4909 th->th.th_first_place = 0;
4910 th->th.th_last_place = __kmp_affinity_num_masks - 1;
4911
4912 // Turn off OpenMP 4.0 affinity for the current thread at this parallel level.
4913 th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
4914
4915 return retval;
4916 }
4917
4918 int __kmp_aux_get_affinity(void **mask) {
4919 int gtid;
4920 int retval;
4921 kmp_info_t *th;
4922
4923 if (!KMP_AFFINITY_CAPABLE()) {
4924 return -1;
4925 }
4926
4927 gtid = __kmp_entry_gtid();
4928 th = __kmp_threads[gtid];
4929 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
4930
4931 KA_TRACE(1000, (""); {
4932 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4933 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4934 th->th.th_affin_mask);
4935 __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n",
4936 gtid, buf);
4937 });
4938
4939 if (__kmp_env_consistency_check) {
4940 if ((mask == NULL) || (*mask == NULL)) {
4941 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
4942 }
4943 }
4944
4945 #if !KMP_OS_WINDOWS
4946
4947 retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
4948 KA_TRACE(1000, (""); {
4949 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4950 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4951 (kmp_affin_mask_t *)(*mask));
4952 __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n",
4953 gtid, buf);
4954 });
4955 return retval;
4956
4957 #else
4958
4959 KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
4960 return 0;
4961
4962 #endif /* KMP_OS_WINDOWS */
4963 }
4964
4965 int __kmp_aux_get_affinity_max_proc() {
4966 if (!KMP_AFFINITY_CAPABLE()) {
4967 return 0;
4968 }
4969 #if KMP_GROUP_AFFINITY
4970 if (__kmp_num_proc_groups > 1) {
4971 return (int)(__kmp_num_proc_groups * sizeof(DWORD_PTR) * CHAR_BIT);
4972 }
4973 #endif
4974 return __kmp_xproc;
4975 }
4976
4977 int __kmp_aux_set_affinity_mask_proc(int proc, void **mask) {
4978 if (!KMP_AFFINITY_CAPABLE()) {
4979 return -1;
4980 }
4981
4982 KA_TRACE(1000, (""); {
4983 int gtid = __kmp_entry_gtid();
4984 char buf[KMP_AFFIN_MASK_PRINT_LEN];
4985 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
4986 (kmp_affin_mask_t *)(*mask));
4987 __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in "
4988 "affinity mask for thread %d = %s\n",
4989 proc, gtid, buf);
4990 });
4991
4992 if (__kmp_env_consistency_check) {
4993 if ((mask == NULL) || (*mask == NULL)) {
4994 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
4995 }
4996 }
4997
4998 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
4999 return -1;
5000 }
5001 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
5002 return -2;
5003 }
5004
5005 KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
5006 return 0;
5007 }
5008
5009 int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask) {
5010 if (!KMP_AFFINITY_CAPABLE()) {
5011 return -1;
5012 }
5013
5014 KA_TRACE(1000, (""); {
5015 int gtid = __kmp_entry_gtid();
5016 char buf[KMP_AFFIN_MASK_PRINT_LEN];
5017 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5018 (kmp_affin_mask_t *)(*mask));
5019 __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in "
5020 "affinity mask for thread %d = %s\n",
5021 proc, gtid, buf);
5022 });
5023
5024 if (__kmp_env_consistency_check) {
5025 if ((mask == NULL) || (*mask == NULL)) {
5026 KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
5027 }
5028 }
5029
5030 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
5031 return -1;
5032 }
5033 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
5034 return -2;
5035 }
5036
5037 KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
5038 return 0;
5039 }
5040
5041 int __kmp_aux_get_affinity_mask_proc(int proc, void **mask) {
5042 if (!KMP_AFFINITY_CAPABLE()) {
5043 return -1;
5044 }
5045
5046 KA_TRACE(1000, (""); {
5047 int gtid = __kmp_entry_gtid();
5048 char buf[KMP_AFFIN_MASK_PRINT_LEN];
5049 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
5050 (kmp_affin_mask_t *)(*mask));
5051 __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in "
5052 "affinity mask for thread %d = %s\n",
5053 proc, gtid, buf);
5054 });
5055
5056 if (__kmp_env_consistency_check) {
5057 if ((mask == NULL) || (*mask == NULL)) {
5058 KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
5059 }
5060 }
5061
5062 if ((proc < 0) || (proc >= __kmp_aux_get_affinity_max_proc())) {
5063 return -1;
5064 }
5065 if (!KMP_CPU_ISSET(proc, __kmp_affin_fullMask)) {
5066 return 0;
5067 }
5068
5069 return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
5070 }
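
// Illustrative sketch (hypothetical user code, not part of the runtime):
// the __kmp_aux_* routines above back the user-visible kmp_* affinity
// extensions. Assuming the declarations from omp.h (kmp_affinity_mask_t,
// kmp_create_affinity_mask, kmp_set_affinity_mask_proc, kmp_set_affinity,
// kmp_destroy_affinity_mask), a caller could pin itself to logical
// processor 2 roughly like this:
//
//   kmp_affinity_mask_t m;
//   kmp_create_affinity_mask(&m);
//   if (kmp_set_affinity_mask_proc(2, &m) != 0) {
//     /* proc out of range (-1) or not in the full mask (-2) */
//   }
//   if (kmp_set_affinity(&m) != 0) {
//     /* affinity not capable, or the underlying system call failed */
//   }
//   kmp_destroy_affinity_mask(&m);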
5071
5072 // Dynamic affinity settings - Affinity balanced
5073 void __kmp_balanced_affinity(kmp_info_t *th, int nthreads) {
5074 KMP_DEBUG_ASSERT(th);
5075 bool fine_gran = true;
5076 int tid = th->th.th_info.ds.ds_tid;
5077
5078 switch (__kmp_affinity_gran) {
5079 case affinity_gran_fine:
5080 case affinity_gran_thread:
5081 break;
5082 case affinity_gran_core:
5083 if (__kmp_nThreadsPerCore > 1) {
5084 fine_gran = false;
5085 }
5086 break;
5087 case affinity_gran_package:
5088 if (nCoresPerPkg > 1) {
5089 fine_gran = false;
5090 }
5091 break;
5092 default:
5093 fine_gran = false;
5094 }
5095
5096 if (__kmp_affinity_uniform_topology()) {
5097 int coreID;
5098 int threadID;
5099 // Number of hardware threads per core on an SMT machine
5100 int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
5101 // Number of cores
5102 int ncores = __kmp_ncores;
5103 if ((nPackages > 1) && (__kmp_nth_per_core <= 1)) {
5104 __kmp_nth_per_core = __kmp_avail_proc / nPackages;
5105 ncores = nPackages;
5106 }
5107 // How many threads will be bound to each core
5108 int chunk = nthreads / ncores;
5109 // How many cores will have an additional thread bound to them ("big cores")
5110 int big_cores = nthreads % ncores;
5111 // Number of threads on the big cores
5112 int big_nth = (chunk + 1) * big_cores;
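// Worked example with hypothetical numbers: nthreads = 10 on ncores = 4
// gives chunk = 2, big_cores = 2, big_nth = 6. Threads 0-5 land on the two
// "big" cores (3 threads each); e.g. tid = 7 takes the else branch below
// and gets coreID = (7 - 2) / 2 = 2.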
5113 if (tid < big_nth) {
5114 coreID = tid / (chunk + 1);
5115 threadID = (tid % (chunk + 1)) % __kmp_nth_per_core;
5116 } else { // tid >= big_nth
5117 coreID = (tid - big_cores) / chunk;
5118 threadID = ((tid - big_cores) % chunk) % __kmp_nth_per_core;
5119 }
5120
5121 KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
5122 "Illegal set affinity operation when not capable");
5123
5124 kmp_affin_mask_t *mask = th->th.th_affin_mask;
5125 KMP_CPU_ZERO(mask);
5126
5127 if (fine_gran) {
5128 int osID = address2os[coreID * __kmp_nth_per_core + threadID].second;
5129 KMP_CPU_SET(osID, mask);
5130 } else {
5131 for (int i = 0; i < __kmp_nth_per_core; i++) {
5132 int osID;
5133 osID = address2os[coreID * __kmp_nth_per_core + i].second;
5134 KMP_CPU_SET(osID, mask);
5135 }
5136 }
5137 if (__kmp_affinity_verbose) {
5138 char buf[KMP_AFFIN_MASK_PRINT_LEN];
5139 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
5140 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
5141 __kmp_gettid(), tid, buf);
5142 }
5143 __kmp_set_system_affinity(mask, TRUE);
5144 } else { // Non-uniform topology
5145
5146 kmp_affin_mask_t *mask = th->th.th_affin_mask;
5147 KMP_CPU_ZERO(mask);
5148
5149 int core_level = __kmp_affinity_find_core_level(
5150 address2os, __kmp_avail_proc, __kmp_aff_depth - 1);
5151 int ncores = __kmp_affinity_compute_ncores(address2os, __kmp_avail_proc,
5152 __kmp_aff_depth - 1, core_level);
5153 int nth_per_core = __kmp_affinity_max_proc_per_core(
5154 address2os, __kmp_avail_proc, __kmp_aff_depth - 1, core_level);
5155
5156 // For performance, handle the special case nthreads ==
5157 // __kmp_avail_proc separately
5158 if (nthreads == __kmp_avail_proc) {
5159 if (fine_gran) {
5160 int osID = address2os[tid].second;
5161 KMP_CPU_SET(osID, mask);
5162 } else {
5163 int core = __kmp_affinity_find_core(address2os, tid,
5164 __kmp_aff_depth - 1, core_level);
5165 for (int i = 0; i < __kmp_avail_proc; i++) {
5166 int osID = address2os[i].second;
5167 if (__kmp_affinity_find_core(address2os, i, __kmp_aff_depth - 1,
5168 core_level) == core) {
5169 KMP_CPU_SET(osID, mask);
5170 }
5171 }
5172 }
5173 } else if (nthreads <= ncores) {
5174
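// Each thread takes the tid-th core that still has at least one available
// processor in procarr[]; the "core" counter below advances only past such
// cores.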
5175 int core = 0;
5176 for (int i = 0; i < ncores; i++) {
5177 // Check if this core from procarr[] is in the mask
5178 int in_mask = 0;
5179 for (int j = 0; j < nth_per_core; j++) {
5180 if (procarr[i * nth_per_core + j] != -1) {
5181 in_mask = 1;
5182 break;
5183 }
5184 }
5185 if (in_mask) {
5186 if (tid == core) {
5187 for (int j = 0; j < nth_per_core; j++) {
5188 int osID = procarr[i * nth_per_core + j];
5189 if (osID != -1) {
5190 KMP_CPU_SET(osID, mask);
5191 // For fine granularity it is enough to set the first available
5192 // osID for this core
5193 if (fine_gran) {
5194 break;
5195 }
5196 }
5197 }
5198 break;
5199 } else {
5200 core++;
5201 }
5202 }
5203 }
5204 } else { // nthreads > ncores
5205 // Array to save the number of processors at each core
5206 int *nproc_at_core = (int *)KMP_ALLOCA(sizeof(int) * ncores);
5207 // Array to save the number of cores with exactly x available processors
5208 int *ncores_with_x_procs =
5209 (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
5210 // Array to save the number of cores with at least x (up to nth_per_core) available processors
5211 int *ncores_with_x_to_max_procs =
5212 (int *)KMP_ALLOCA(sizeof(int) * (nth_per_core + 1));
5213
5214 for (int i = 0; i <= nth_per_core; i++) {
5215 ncores_with_x_procs[i] = 0;
5216 ncores_with_x_to_max_procs[i] = 0;
5217 }
5218
5219 for (int i = 0; i < ncores; i++) {
5220 int cnt = 0;
5221 for (int j = 0; j < nth_per_core; j++) {
5222 if (procarr[i * nth_per_core + j] != -1) {
5223 cnt++;
5224 }
5225 }
5226 nproc_at_core[i] = cnt;
5227 ncores_with_x_procs[cnt]++;
5228 }
5229
5230 for (int i = 0; i <= nth_per_core; i++) {
5231 for (int j = i; j <= nth_per_core; j++) {
5232 ncores_with_x_to_max_procs[i] += ncores_with_x_procs[j];
5233 }
5234 }
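// Small hypothetical example: with nth_per_core = 2 and
// ncores_with_x_procs = {0, 2, 3}, the suffix sums above give
// ncores_with_x_to_max_procs = {5, 5, 3}, i.e. the number of cores with at
// least x available processors.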
5235
5236 // Max number of processors
5237 int nproc = nth_per_core * ncores;
5238 // Array to keep the number of threads assigned to each hardware context
5239 int *newarr = (int *)__kmp_allocate(sizeof(int) * nproc);
5240 for (int i = 0; i < nproc; i++) {
5241 newarr[i] = 0;
5242 }
5243
5244 int nth = nthreads;
5245 int flag = 0;
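// Spread the threads over the available hardware contexts: the first sweep
// (flag == 0) places at most one thread per context, later sweeps
// (flag == 1) stack additional threads onto already-used contexts.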
5246 while (nth > 0) {
5247 for (int j = 1; j <= nth_per_core; j++) {
5248 int cnt = ncores_with_x_to_max_procs[j];
5249 for (int i = 0; i < ncores; i++) {
5250 // Skip this core if it has no available processors
5251 if (nproc_at_core[i] == 0) {
5252 continue;
5253 }
5254 for (int k = 0; k < nth_per_core; k++) {
5255 if (procarr[i * nth_per_core + k] != -1) {
5256 if (newarr[i * nth_per_core + k] == 0) {
5257 newarr[i * nth_per_core + k] = 1;
5258 cnt--;
5259 nth--;
5260 break;
5261 } else {
5262 if (flag != 0) {
5263 newarr[i * nth_per_core + k]++;
5264 cnt--;
5265 nth--;
5266 break;
5267 }
5268 }
5269 }
5270 }
5271 if (cnt == 0 || nth == 0) {
5272 break;
5273 }
5274 }
5275 if (nth == 0) {
5276 break;
5277 }
5278 }
5279 flag = 1;
5280 }
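// Pick this thread's context: walk the per-context counts in newarr[] and
// stop at the first context whose running total exceeds tid.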
5281 int sum = 0;
5282 for (int i = 0; i < nproc; i++) {
5283 sum += newarr[i];
5284 if (sum > tid) {
5285 if (fine_gran) {
5286 int osID = procarr[i];
5287 KMP_CPU_SET(osID, mask);
5288 } else {
5289 int coreID = i / nth_per_core;
5290 for (int ii = 0; ii < nth_per_core; ii++) {
5291 int osID = procarr[coreID * nth_per_core + ii];
5292 if (osID != -1) {
5293 KMP_CPU_SET(osID, mask);
5294 }
5295 }
5296 }
5297 break;
5298 }
5299 }
5300 __kmp_free(newarr);
5301 }
5302
5303 if (__kmp_affinity_verbose) {
5304 char buf[KMP_AFFIN_MASK_PRINT_LEN];
5305 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
5306 KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
5307 __kmp_gettid(), tid, buf);
5308 }
5309 __kmp_set_system_affinity(mask, TRUE);
5310 }
5311 }
5312
5313 #if KMP_OS_LINUX || KMP_OS_FREEBSD
5314 // We don't need this entry for Windows because the
5315 // GetProcessAffinityMask() API is available there.
5316 //
5317 // The intended usage is:
5318 // 1) The user saves the current affinity mask.
5319 // 2) Then sets the affinity by calling this function.
5320 // 3) Error-checks the return value.
5321 // 4) Runs the non-OpenMP parallelization.
5322 // 5) Resets the affinity to the mask saved in step 1) (see the usage sketch after this function).
5323 #ifdef __cplusplus
5324 extern "C"
5325 #endif
5326 int
5327 kmp_set_thread_affinity_mask_initial()
5328 // The function returns 0 on success,
5329 // -1 if the thread cannot be bound,
5330 // >0 (errno) if an error happened during binding.
5331 {
5332 int gtid = __kmp_get_gtid();
5333 if (gtid < 0) {
5334 // Do not touch non-omp threads
5335 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
5336 "non-omp thread, returning\n"));
5337 return -1;
5338 }
5339 if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
5340 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
5341 "affinity not initialized, returning\n"));
5342 return -1;
5343 }
5344 KA_TRACE(30, ("kmp_set_thread_affinity_mask_initial: "
5345 "set full mask for thread %d\n",
5346 gtid));
5347 KMP_DEBUG_ASSERT(__kmp_affin_fullMask != NULL);
5348 return __kmp_set_system_affinity(__kmp_affin_fullMask, FALSE);
5349 }
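
// Illustrative usage sketch on Linux (hypothetical user code, not part of
// the runtime), following steps 1)-5) above; run_non_openmp_work() is a
// placeholder for the user's non-OpenMP parallel code:
//
//   cpu_set_t saved;                                   // step 1: save mask
//   CPU_ZERO(&saved);
//   if (sched_getaffinity(0, sizeof(saved), &saved) != 0) { /* handle */ }
//   int rc = kmp_set_thread_affinity_mask_initial();   // step 2
//   if (rc != 0) { /* step 3: -1 = not bound, >0 = errno from binding */ }
//   run_non_openmp_work();                             // step 4
//   sched_setaffinity(0, sizeof(saved), &saved);       // step 5: restore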
5350 #endif
5351
5352 #endif // KMP_AFFINITY_SUPPORTED
5353