1 /*
2 * kmp_runtime.cpp -- KPTS runtime support library
3 */
4
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "kmp.h"
14 #include "kmp_affinity.h"
15 #include "kmp_atomic.h"
16 #include "kmp_environment.h"
17 #include "kmp_error.h"
18 #include "kmp_i18n.h"
19 #include "kmp_io.h"
20 #include "kmp_itt.h"
21 #include "kmp_settings.h"
22 #include "kmp_stats.h"
23 #include "kmp_str.h"
24 #include "kmp_wait_release.h"
25 #include "kmp_wrapper_getpid.h"
26 #include "kmp_dispatch.h"
27 #if KMP_USE_HIER_SCHED
28 #include "kmp_dispatch_hier.h"
29 #endif
30
31 #if OMPT_SUPPORT
32 #include "ompt-specific.h"
33 #endif
34
35 /* these are temporary issues to be dealt with */
36 #define KMP_USE_PRCTL 0
37
38 #if KMP_OS_WINDOWS
39 #include <process.h>
40 #endif
41
42 #include "tsan_annotations.h"
43
44 #if KMP_OS_WINDOWS
45 // Windows does not need these include files because it doesn't use shared memory
46 #else
47 #include <sys/mman.h>
48 #include <sys/stat.h>
49 #include <fcntl.h>
50 #define SHM_SIZE 1024
51 #endif
52
53 #if defined(KMP_GOMP_COMPAT)
54 char const __kmp_version_alt_comp[] =
55 KMP_VERSION_PREFIX "alternative compiler support: yes";
56 #endif /* defined(KMP_GOMP_COMPAT) */
57
58 char const __kmp_version_omp_api[] =
59 KMP_VERSION_PREFIX "API version: 5.0 (201611)";
60
61 #ifdef KMP_DEBUG
62 char const __kmp_version_lock[] =
63 KMP_VERSION_PREFIX "lock type: run time selectable";
64 #endif /* KMP_DEBUG */
65
66 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))
67
68 /* ------------------------------------------------------------------------ */
69
70 #if KMP_USE_MONITOR
71 kmp_info_t __kmp_monitor;
72 #endif
73
74 /* Forward declarations */
75
76 void __kmp_cleanup(void);
77
78 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
79 int gtid);
80 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
81 kmp_internal_control_t *new_icvs,
82 ident_t *loc);
83 #if KMP_AFFINITY_SUPPORTED
84 static void __kmp_partition_places(kmp_team_t *team,
85 int update_master_only = 0);
86 #endif
87 static void __kmp_do_serial_initialize(void);
88 void __kmp_fork_barrier(int gtid, int tid);
89 void __kmp_join_barrier(int gtid);
90 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
91 kmp_internal_control_t *new_icvs, ident_t *loc);
92
93 #ifdef USE_LOAD_BALANCE
94 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
95 #endif
96
97 static int __kmp_expand_threads(int nNeed);
98 #if KMP_OS_WINDOWS
99 static int __kmp_unregister_root_other_thread(int gtid);
100 #endif
101 static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
102 kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
103
104 /* Calculate the identifier of the current thread */
105 /* fast (and somewhat portable) way to get unique identifier of executing
106 thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
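/* Note (summary of the logic below): when __kmp_gtid_mode >= 3 the gtid is read
   from a native TLS variable (KMP_TDATA_GTID); when it is >= 2, from keyed
   OS-specific TLS; otherwise the address of a local variable is matched against
   the recorded stack ranges of all registered threads. */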
107 int __kmp_get_global_thread_id() {
108 int i;
109 kmp_info_t **other_threads;
110 size_t stack_data;
111 char *stack_addr;
112 size_t stack_size;
113 char *stack_base;
114
115 KA_TRACE(
116 1000,
117 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
118 __kmp_nth, __kmp_all_nth));
119
120 /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
121 a parallel region, made it return KMP_GTID_DNE to force serial_initialize
122 by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
123 __kmp_init_gtid for this to work. */
124
125 if (!TCR_4(__kmp_init_gtid))
126 return KMP_GTID_DNE;
127
128 #ifdef KMP_TDATA_GTID
129 if (TCR_4(__kmp_gtid_mode) >= 3) {
130 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
131 return __kmp_gtid;
132 }
133 #endif
134 if (TCR_4(__kmp_gtid_mode) >= 2) {
135 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
136 return __kmp_gtid_get_specific();
137 }
138 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));
139
140 stack_addr = (char *)&stack_data;
141 other_threads = __kmp_threads;
142
143 /* ATT: The code below is a source of potential bugs due to unsynchronized
144 access to __kmp_threads array. For example:
145 1. Current thread loads other_threads[i] to thr and checks it, it is
146 non-NULL.
147 2. Current thread is suspended by OS.
148 3. Another thread unregisters and finishes (debug versions of free()
149 may fill memory with something like 0xEF).
150 4. Current thread is resumed.
151 5. Current thread reads junk from *thr.
152 TODO: Fix it. --ln */
153
154 for (i = 0; i < __kmp_threads_capacity; i++) {
155
156 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
157 if (!thr)
158 continue;
159
160 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
161 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
162
163 /* stack grows down -- search through all of the active threads */
164
165 if (stack_addr <= stack_base) {
166 size_t stack_diff = stack_base - stack_addr;
167
168 if (stack_diff <= stack_size) {
169 /* The only way we can be closer than the allocated */
170 /* stack size is if we are running on this thread. */
171 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
172 return i;
173 }
174 }
175 }
176
177 /* get specific to try and determine our gtid */
178 KA_TRACE(1000,
179 ("*** __kmp_get_global_thread_id: internal alg. failed to find "
180 "thread, using TLS\n"));
181 i = __kmp_gtid_get_specific();
182
183 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */
184
185 /* if we haven't been assigned a gtid, then return that code */
186 if (i < 0)
187 return i;
188
189 /* dynamically updated stack window for uber threads to avoid get_specific
190 call */
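// Note: we only get here when the stack search above failed but TLS still knows
// our gtid. If this thread's stack window may no longer grow (ds_stackgrow is
// clear), the mismatch is treated as a stack overflow; otherwise the recorded
// window is widened below to cover the current stack address.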
191 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
192 KMP_FATAL(StackOverflow, i);
193 }
194
195 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
196 if (stack_addr > stack_base) {
197 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
198 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
199 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
200 stack_base);
201 } else {
202 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
203 stack_base - stack_addr);
204 }
205
206 /* Reprint stack bounds for ubermaster since they have been refined */
207 if (__kmp_storage_map) {
208 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
209 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
210 __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
211 other_threads[i]->th.th_info.ds.ds_stacksize,
212 "th_%d stack (refinement)", i);
213 }
214 return i;
215 }
216
217 int __kmp_get_global_thread_id_reg() {
218 int gtid;
219
220 if (!__kmp_init_serial) {
221 gtid = KMP_GTID_DNE;
222 } else
223 #ifdef KMP_TDATA_GTID
224 if (TCR_4(__kmp_gtid_mode) >= 3) {
225 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
226 gtid = __kmp_gtid;
227 } else
228 #endif
229 if (TCR_4(__kmp_gtid_mode) >= 2) {
230 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
231 gtid = __kmp_gtid_get_specific();
232 } else {
233 KA_TRACE(1000,
234 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
235 gtid = __kmp_get_global_thread_id();
236 }
237
238 /* we must be a new uber master sibling thread */
239 if (gtid == KMP_GTID_DNE) {
240 KA_TRACE(10,
241 ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
242 "Registering a new gtid.\n"));
243 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
244 if (!__kmp_init_serial) {
245 __kmp_do_serial_initialize();
246 gtid = __kmp_gtid_get_specific();
247 } else {
248 gtid = __kmp_register_root(FALSE);
249 }
250 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
251 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
252 }
253
254 KMP_DEBUG_ASSERT(gtid >= 0);
255
256 return gtid;
257 }
258
259 /* caller must hold forkjoin_lock */
260 void __kmp_check_stack_overlap(kmp_info_t *th) {
261 int f;
262 char *stack_beg = NULL;
263 char *stack_end = NULL;
264 int gtid;
265
266 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
267 if (__kmp_storage_map) {
268 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
269 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
270
271 gtid = __kmp_gtid_from_thread(th);
272
273 if (gtid == KMP_GTID_MONITOR) {
274 __kmp_print_storage_map_gtid(
275 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
276 "th_%s stack (%s)", "mon",
277 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
278 } else {
279 __kmp_print_storage_map_gtid(
280 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
281 "th_%d stack (%s)", gtid,
282 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
283 }
284 }
285
286 /* No point in checking ubermaster threads since they use refinement and
287 * cannot overlap */
288 gtid = __kmp_gtid_from_thread(th);
289 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) {
290 KA_TRACE(10,
291 ("__kmp_check_stack_overlap: performing extensive checking\n"));
292 if (stack_beg == NULL) {
293 stack_end = (char *)th->th.th_info.ds.ds_stackbase;
294 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
295 }
296
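// Note: two stacks are considered overlapping if either boundary of this
// thread's [stack_beg, stack_end] range falls strictly inside another
// registered thread's stack range (checked pairwise in the loop below).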
297 for (f = 0; f < __kmp_threads_capacity; f++) {
298 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
299
300 if (f_th && f_th != th) {
301 char *other_stack_end =
302 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
303 char *other_stack_beg =
304 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
305 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
306 (stack_end > other_stack_beg && stack_end < other_stack_end)) {
307
308 /* Print the other stack values before the abort */
309 if (__kmp_storage_map)
310 __kmp_print_storage_map_gtid(
311 -1, other_stack_beg, other_stack_end,
312 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
313 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th));
314
315 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit),
316 __kmp_msg_null);
317 }
318 }
319 }
320 }
321 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n"));
322 }
323
324 /* ------------------------------------------------------------------------ */
325
326 void __kmp_infinite_loop(void) {
327 static int done = FALSE;
328
329 while (!done) {
330 KMP_YIELD(TRUE);
331 }
332 }
333
334 #define MAX_MESSAGE 512
335
336 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
337 char const *format, ...) {
338 char buffer[MAX_MESSAGE];
339 va_list ap;
340
341 va_start(ap, format);
342 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
343 p2, (unsigned long)size, format);
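// Note: the caller-supplied format string is appended after the fixed prefix
// above, so the variadic arguments are expanded against it by the
// __kmp_vprintf() call below.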
344 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
345 __kmp_vprintf(kmp_err, buffer, ap);
346 #if KMP_PRINT_DATA_PLACEMENT
347 int node;
348 if (gtid >= 0) {
349 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
350 if (__kmp_storage_map_verbose) {
351 node = __kmp_get_host_node(p1);
352 if (node < 0) /* doesn't work, so don't try this next time */
353 __kmp_storage_map_verbose = FALSE;
354 else {
355 char *last;
356 int lastNode;
357 int localProc = __kmp_get_cpu_from_gtid(gtid);
358
359 const int page_size = KMP_GET_PAGE_SIZE();
360
361 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
362 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
363 if (localProc >= 0)
364 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
365 localProc >> 1);
366 else
367 __kmp_printf_no_lock(" GTID %d\n", gtid);
368 #if KMP_USE_PRCTL
369 /* The more elaborate format is disabled for now because of the prctl
370 * hanging bug. */
371 do {
372 last = p1;
373 lastNode = node;
374 /* This loop collates adjacent pages with the same host node. */
375 do {
376 (char *)p1 += page_size;
377 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
378 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
379 lastNode);
380 } while (p1 <= p2);
381 #else
382 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
383 (char *)p1 + (page_size - 1),
384 __kmp_get_host_node(p1));
385 if (p1 < p2) {
386 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
387 (char *)p2 + (page_size - 1),
388 __kmp_get_host_node(p2));
389 }
390 #endif
391 }
392 }
393 } else
394 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
395 }
396 #endif /* KMP_PRINT_DATA_PLACEMENT */
397 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
398 }
399
400 void __kmp_warn(char const *format, ...) {
401 char buffer[MAX_MESSAGE];
402 va_list ap;
403
404 if (__kmp_generate_warnings == kmp_warnings_off) {
405 return;
406 }
407
408 va_start(ap, format);
409
410 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
411 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
412 __kmp_vprintf(kmp_err, buffer, ap);
413 __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
414
415 va_end(ap);
416 }
417
418 void __kmp_abort_process() {
419 // Later threads may stall here, but that's ok because abort() will kill them.
420 __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);
421
422 if (__kmp_debug_buf) {
423 __kmp_dump_debug_buffer();
424 }
425
426 if (KMP_OS_WINDOWS) {
427 // Let other threads know of abnormal termination and prevent deadlock
428 // if abort happened during library initialization or shutdown
429 __kmp_global.g.g_abort = SIGABRT;
430
431 /* On Windows* OS, by default abort() causes a pop-up error box, which stalls
432 nightly testing. Unfortunately, we cannot reliably suppress pop-up error
433 boxes. _set_abort_behavior() works well, but this function is not
434 available in VS7 (this is not a problem for the DLL, but it is a problem
435 for the static OpenMP RTL). SetErrorMode (and so, the timelimit utility)
436 does not help, at least in some versions of the MS C RTL.
437
438 It seems the following sequence is the only way to simulate abort() and
439 avoid the pop-up error box. */
440 raise(SIGABRT);
441 _exit(3); // Just in case, if signal ignored, exit anyway.
442 } else {
443 __kmp_unregister_library();
444 abort();
445 }
446
447 __kmp_infinite_loop();
448 __kmp_release_bootstrap_lock(&__kmp_exit_lock);
449
450 } // __kmp_abort_process
451
452 void __kmp_abort_thread(void) {
453 // TODO: Eliminate g_abort global variable and this function.
454 // In case of abort just call abort(), it will kill all the threads.
455 __kmp_infinite_loop();
456 } // __kmp_abort_thread
457
458 /* Print out the storage map for the major kmp_info_t thread data structures
459 that are allocated together. */
460
461 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
462 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
463 gtid);
464
465 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
466 sizeof(kmp_desc_t), "th_%d.th_info", gtid);
467
468 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
469 sizeof(kmp_local_t), "th_%d.th_local", gtid);
470
471 __kmp_print_storage_map_gtid(
472 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
473 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);
474
475 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
476 &thr->th.th_bar[bs_plain_barrier + 1],
477 sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
478 gtid);
479
480 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
481 &thr->th.th_bar[bs_forkjoin_barrier + 1],
482 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
483 gtid);
484
485 #if KMP_FAST_REDUCTION_BARRIER
486 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
487 &thr->th.th_bar[bs_reduction_barrier + 1],
488 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
489 gtid);
490 #endif // KMP_FAST_REDUCTION_BARRIER
491 }
492
493 /* Print out the storage map for the major kmp_team_t team data structures
494 that are allocated together. */
495
496 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
497 int team_id, int num_thr) {
498 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
499 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
500 header, team_id);
501
502 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
503 &team->t.t_bar[bs_last_barrier],
504 sizeof(kmp_balign_team_t) * bs_last_barrier,
505 "%s_%d.t_bar", header, team_id);
506
507 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
508 &team->t.t_bar[bs_plain_barrier + 1],
509 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
510 header, team_id);
511
512 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
513 &team->t.t_bar[bs_forkjoin_barrier + 1],
514 sizeof(kmp_balign_team_t),
515 "%s_%d.t_bar[forkjoin]", header, team_id);
516
517 #if KMP_FAST_REDUCTION_BARRIER
518 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
519 &team->t.t_bar[bs_reduction_barrier + 1],
520 sizeof(kmp_balign_team_t),
521 "%s_%d.t_bar[reduction]", header, team_id);
522 #endif // KMP_FAST_REDUCTION_BARRIER
523
524 __kmp_print_storage_map_gtid(
525 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
526 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);
527
528 __kmp_print_storage_map_gtid(
529 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
530 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);
531
532 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
533 &team->t.t_disp_buffer[num_disp_buff],
534 sizeof(dispatch_shared_info_t) * num_disp_buff,
535 "%s_%d.t_disp_buffer", header, team_id);
536 }
537
538 static void __kmp_init_allocator() { __kmp_init_memkind(); }
539 static void __kmp_fini_allocator() { __kmp_fini_memkind(); }
540
541 /* ------------------------------------------------------------------------ */
542
543 #if KMP_DYNAMIC_LIB
544 #if KMP_OS_WINDOWS
545
546 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
547 // TODO: Change to __kmp_break_bootstrap_lock().
548 __kmp_init_bootstrap_lock(lck); // make the lock released
549 }
550
551 static void __kmp_reset_locks_on_process_detach(int gtid_req) {
552 int i;
553 int thread_count;
554
555 // PROCESS_DETACH is expected to be called by a thread that executes
556 // ProcessExit() or FreeLibrary(). The OS terminates the other threads (except
557 // the one calling ProcessExit or FreeLibrary), so it might seem safe to access
558 // __kmp_threads[] without taking the forkjoin_lock. In fact, however, some
559 // threads can still be alive here, although they are about to be terminated.
560 // The entries in the array with ds_thread==0 are the most suspicious, so
561 // accessing __kmp_threads[] may not be safe.
562
563 // TODO: does it make sense to check __kmp_roots[] ?
564
565 // Let's check that there are no other alive threads registered with the OMP
566 // lib.
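// Note: this loop spins until no other registered thread (besides the one
// handling the detach) reports itself alive; only then are the bootstrap locks
// below considered safe to reset.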
567 while (1) {
568 thread_count = 0;
569 for (i = 0; i < __kmp_threads_capacity; ++i) {
570 if (!__kmp_threads)
571 continue;
572 kmp_info_t *th = __kmp_threads[i];
573 if (th == NULL)
574 continue;
575 int gtid = th->th.th_info.ds.ds_gtid;
576 if (gtid == gtid_req)
577 continue;
578 if (gtid < 0)
579 continue;
580 DWORD exit_val;
581 int alive = __kmp_is_thread_alive(th, &exit_val);
582 if (alive) {
583 ++thread_count;
584 }
585 }
586 if (thread_count == 0)
587 break; // success
588 }
589
590 // Assume that I'm alone. Now it might be safe to check and reset locks.
591 // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
592 __kmp_reset_lock(&__kmp_forkjoin_lock);
593 #ifdef KMP_DEBUG
594 __kmp_reset_lock(&__kmp_stdio_lock);
595 #endif // KMP_DEBUG
596 }
597
598 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
599 //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
600
601 switch (fdwReason) {
602
603 case DLL_PROCESS_ATTACH:
604 KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));
605
606 return TRUE;
607
608 case DLL_PROCESS_DETACH:
609 KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));
610
611 if (lpReserved != NULL) {
612 // lpReserved is used for telling the difference:
613 // lpReserved == NULL when FreeLibrary() was called,
614 // lpReserved != NULL when the process terminates.
615 // When FreeLibrary() is called, worker threads remain alive. So they will
616 // release the forkjoin lock by themselves. When the process terminates,
617 // worker threads disappear triggering the problem of unreleased forkjoin
618 // lock as described below.
619
620 // A worker thread can take the forkjoin lock. The problem comes up if
621 // that worker thread becomes dead before it releases the forkjoin lock.
622 // The forkjoin lock remains taken, while the thread executing
623 // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
624 // to take the forkjoin lock and will always fail, so that the application
625 // will never finish [normally]. This scenario is possible if
626 // __kmpc_end() has not been executed. This is not just a corner
627 // case; it covers common cases:
628 // - the main function was compiled by an alternative compiler;
629 // - the main function was compiled by icl but without /Qopenmp
630 // (application with plugins);
631 // - application terminates by calling C exit(), Fortran CALL EXIT() or
632 // Fortran STOP.
633 // - alive foreign thread prevented __kmpc_end from doing cleanup.
634 //
635 // This is a hack to work around the problem.
636 // TODO: !!! figure out something better.
637 __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
638 }
639
640 __kmp_internal_end_library(__kmp_gtid_get_specific());
641
642 return TRUE;
643
644 case DLL_THREAD_ATTACH:
645 KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));
646
647 /* if we want to register new siblings all the time here call
648 * __kmp_get_gtid(); */
649 return TRUE;
650
651 case DLL_THREAD_DETACH:
652 KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));
653
654 __kmp_internal_end_thread(__kmp_gtid_get_specific());
655 return TRUE;
656 }
657
658 return TRUE;
659 }
660
661 #endif /* KMP_OS_WINDOWS */
662 #endif /* KMP_DYNAMIC_LIB */
663
664 /* __kmp_parallel_deo -- Wait until it's our turn. */
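/* Roughly: each thread spins (KMP_WAIT) until the team's t_ordered value equals
   its own tid; __kmp_parallel_dxo() below then advances that value to the next
   tid, releasing the next thread in the ordered sequence. */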
665 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
666 int gtid = *gtid_ref;
667 #ifdef BUILD_PARALLEL_ORDERED
668 kmp_team_t *team = __kmp_team_from_gtid(gtid);
669 #endif /* BUILD_PARALLEL_ORDERED */
670
671 if (__kmp_env_consistency_check) {
672 if (__kmp_threads[gtid]->th.th_root->r.r_active)
673 #if KMP_USE_DYNAMIC_LOCK
674 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
675 #else
676 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
677 #endif
678 }
679 #ifdef BUILD_PARALLEL_ORDERED
680 if (!team->t.t_serialized) {
681 KMP_MB();
682 KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
683 NULL);
684 KMP_MB();
685 }
686 #endif /* BUILD_PARALLEL_ORDERED */
687 }
688
689 /* __kmp_parallel_dxo -- Signal the next task. */
690 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
691 int gtid = *gtid_ref;
692 #ifdef BUILD_PARALLEL_ORDERED
693 int tid = __kmp_tid_from_gtid(gtid);
694 kmp_team_t *team = __kmp_team_from_gtid(gtid);
695 #endif /* BUILD_PARALLEL_ORDERED */
696
697 if (__kmp_env_consistency_check) {
698 if (__kmp_threads[gtid]->th.th_root->r.r_active)
699 __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
700 }
701 #ifdef BUILD_PARALLEL_ORDERED
702 if (!team->t.t_serialized) {
703 KMP_MB(); /* Flush all pending memory write invalidates. */
704
705 /* use the tid of the next thread in this team */
706 /* TODO replace with general release procedure */
707 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);
708
709 KMP_MB(); /* Flush all pending memory write invalidates. */
710 }
711 #endif /* BUILD_PARALLEL_ORDERED */
712 }
713
714 /* ------------------------------------------------------------------------ */
715 /* The BARRIER for a SINGLE process section is always explicit */
716
717 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
718 int status;
719 kmp_info_t *th;
720 kmp_team_t *team;
721
722 if (!TCR_4(__kmp_init_parallel))
723 __kmp_parallel_initialize();
724 __kmp_resume_if_soft_paused();
725
726 th = __kmp_threads[gtid];
727 team = th->th.th_team;
728 status = 0;
729
730 th->th.th_ident = id_ref;
731
732 if (team->t.t_serialized) {
733 status = 1;
734 } else {
735 kmp_int32 old_this = th->th.th_local.this_construct;
736
737 ++th->th.th_local.this_construct;
738 /* try to set team count to thread count--success means thread got the
739 single block */
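/* Note: exactly one thread per single construct can win the compare-and-swap on
   team->t.t_construct below (old value -> this thread's construct count); the
   winner gets status == 1 and executes the single block. */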
740 /* TODO: Should this be acquire or release? */
741 if (team->t.t_construct == old_this) {
742 status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
743 th->th.th_local.this_construct);
744 }
745 #if USE_ITT_BUILD
746 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
747 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
748 team->t.t_active_level ==
749 1) { // Only report metadata by master of active team at level 1
750 __kmp_itt_metadata_single(id_ref);
751 }
752 #endif /* USE_ITT_BUILD */
753 }
754
755 if (__kmp_env_consistency_check) {
756 if (status && push_ws) {
757 __kmp_push_workshare(gtid, ct_psingle, id_ref);
758 } else {
759 __kmp_check_workshare(gtid, ct_psingle, id_ref);
760 }
761 }
762 #if USE_ITT_BUILD
763 if (status) {
764 __kmp_itt_single_start(gtid);
765 }
766 #endif /* USE_ITT_BUILD */
767 return status;
768 }
769
770 void __kmp_exit_single(int gtid) {
771 #if USE_ITT_BUILD
772 __kmp_itt_single_end(gtid);
773 #endif /* USE_ITT_BUILD */
774 if (__kmp_env_consistency_check)
775 __kmp_pop_workshare(gtid, ct_psingle, NULL);
776 }
777
778 /* determine if we can go parallel or must use a serialized parallel region and
779 * how many threads we can use
780 * set_nthreads is the number of threads requested for the team
781 * returns 0 if we should serialize or only use one thread,
782 * otherwise the number of threads to use
783 * The forkjoin lock is held by the caller. */
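/* In effect: start from the requested thread count, optionally shrink it
   according to the dynamic adjustment mode (load balance, thread limit, or
   random), then clip it against KMP_DEVICE_THREAD_LIMIT, OMP_THREAD_LIMIT and
   the capacity of the __kmp_threads array. */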
784 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
785 int master_tid, int set_nthreads,
786 int enter_teams) {
787 int capacity;
788 int new_nthreads;
789 KMP_DEBUG_ASSERT(__kmp_init_serial);
790 KMP_DEBUG_ASSERT(root && parent_team);
791 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];
792
793 // If dyn-var is set, dynamically adjust the number of desired threads,
794 // according to the method specified by dynamic_mode.
795 new_nthreads = set_nthreads;
796 if (!get__dynamic_2(parent_team, master_tid)) {
797 ;
798 }
799 #ifdef USE_LOAD_BALANCE
800 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
801 new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
802 if (new_nthreads == 1) {
803 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
804 "reservation to 1 thread\n",
805 master_tid));
806 return 1;
807 }
808 if (new_nthreads < set_nthreads) {
809 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
810 "reservation to %d threads\n",
811 master_tid, new_nthreads));
812 }
813 }
814 #endif /* USE_LOAD_BALANCE */
815 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
816 new_nthreads = __kmp_avail_proc - __kmp_nth +
817 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
818 if (new_nthreads <= 1) {
819 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
820 "reservation to 1 thread\n",
821 master_tid));
822 return 1;
823 }
824 if (new_nthreads < set_nthreads) {
825 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
826 "reservation to %d threads\n",
827 master_tid, new_nthreads));
828 } else {
829 new_nthreads = set_nthreads;
830 }
831 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
832 if (set_nthreads > 2) {
833 new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
834 new_nthreads = (new_nthreads % set_nthreads) + 1;
835 if (new_nthreads == 1) {
836 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
837 "reservation to 1 thread\n",
838 master_tid));
839 return 1;
840 }
841 if (new_nthreads < set_nthreads) {
842 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
843 "reservation to %d threads\n",
844 master_tid, new_nthreads));
845 }
846 }
847 } else {
848 KMP_ASSERT(0);
849 }
850
851 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
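// Note (interpretation): the master -- or, when the root is not active, the
// idle hot team whose threads will be reused -- is already counted in
// __kmp_nth, hence the subtraction before comparing against __kmp_max_nth.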
852 if (__kmp_nth + new_nthreads -
853 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
854 __kmp_max_nth) {
855 int tl_nthreads = __kmp_max_nth - __kmp_nth +
856 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
857 if (tl_nthreads <= 0) {
858 tl_nthreads = 1;
859 }
860
861 // If dyn-var is false, emit a 1-time warning.
862 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
863 __kmp_reserve_warn = 1;
864 __kmp_msg(kmp_ms_warning,
865 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
866 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
867 }
868 if (tl_nthreads == 1) {
869 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
870 "reduced reservation to 1 thread\n",
871 master_tid));
872 return 1;
873 }
874 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
875 "reservation to %d threads\n",
876 master_tid, tl_nthreads));
877 new_nthreads = tl_nthreads;
878 }
879
880 // Respect OMP_THREAD_LIMIT
881 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
882 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
883 if (cg_nthreads + new_nthreads -
884 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
885 max_cg_threads) {
886 int tl_nthreads = max_cg_threads - cg_nthreads +
887 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
888 if (tl_nthreads <= 0) {
889 tl_nthreads = 1;
890 }
891
892 // If dyn-var is false, emit a 1-time warning.
893 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
894 __kmp_reserve_warn = 1;
895 __kmp_msg(kmp_ms_warning,
896 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
897 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
898 }
899 if (tl_nthreads == 1) {
900 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
901 "reduced reservation to 1 thread\n",
902 master_tid));
903 return 1;
904 }
905 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
906 "reservation to %d threads\n",
907 master_tid, tl_nthreads));
908 new_nthreads = tl_nthreads;
909 }
910
911 // Check if the threads array is large enough, or needs expanding.
912 // See comment in __kmp_register_root() about the adjustment if
913 // __kmp_threads[0] == NULL.
914 capacity = __kmp_threads_capacity;
915 if (TCR_PTR(__kmp_threads[0]) == NULL) {
916 --capacity;
917 }
918 if (__kmp_nth + new_nthreads -
919 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
920 capacity) {
921 // Expand the threads array.
922 int slotsRequired = __kmp_nth + new_nthreads -
923 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
924 capacity;
925 int slotsAdded = __kmp_expand_threads(slotsRequired);
926 if (slotsAdded < slotsRequired) {
927 // The threads array was not expanded enough.
928 new_nthreads -= (slotsRequired - slotsAdded);
929 KMP_ASSERT(new_nthreads >= 1);
930
931 // If dyn-var is false, emit a 1-time warning.
932 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
933 __kmp_reserve_warn = 1;
934 if (__kmp_tp_cached) {
935 __kmp_msg(kmp_ms_warning,
936 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
937 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
938 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
939 } else {
940 __kmp_msg(kmp_ms_warning,
941 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
942 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
943 }
944 }
945 }
946 }
947
948 #ifdef KMP_DEBUG
949 if (new_nthreads == 1) {
950 KC_TRACE(10,
951 ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
952 "dead roots and rechecking; requested %d threads\n",
953 __kmp_get_gtid(), set_nthreads));
954 } else {
955 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
956 " %d threads\n",
957 __kmp_get_gtid(), new_nthreads, set_nthreads));
958 }
959 #endif // KMP_DEBUG
960 return new_nthreads;
961 }
962
963 /* Allocate threads from the thread pool and assign them to the new team. We are
964 assured that there are enough threads available, because we checked earlier
965 while holding the forkjoin lock. */
966 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
967 kmp_info_t *master_th, int master_gtid) {
968 int i;
969 int use_hot_team;
970
971 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
972 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
973 KMP_MB();
974
975 /* first, let's setup the master thread */
976 master_th->th.th_info.ds.ds_tid = 0;
977 master_th->th.th_team = team;
978 master_th->th.th_team_nproc = team->t.t_nproc;
979 master_th->th.th_team_master = master_th;
980 master_th->th.th_team_serialized = FALSE;
981 master_th->th.th_dispatch = &team->t.t_dispatch[0];
982
983 /* make sure we are not the optimized hot team */
984 #if KMP_NESTED_HOT_TEAMS
985 use_hot_team = 0;
986 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
987 if (hot_teams) { // hot teams array is not allocated if
988 // KMP_HOT_TEAMS_MAX_LEVEL=0
989 int level = team->t.t_active_level - 1; // index in array of hot teams
990 if (master_th->th.th_teams_microtask) { // are we inside the teams?
991 if (master_th->th.th_teams_size.nteams > 1) {
992 ++level; // level was not increased in teams construct for
993 // team_of_masters
994 }
995 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
996 master_th->th.th_teams_level == team->t.t_level) {
997 ++level; // level was not increased in teams construct for
998 // team_of_workers before the parallel
999 } // team->t.t_level will be increased inside parallel
1000 }
1001 if (level < __kmp_hot_teams_max_level) {
1002 if (hot_teams[level].hot_team) {
1003 // hot team has already been allocated for given level
1004 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1005 use_hot_team = 1; // the team is ready to use
1006 } else {
1007 use_hot_team = 0; // AC: threads are not allocated yet
1008 hot_teams[level].hot_team = team; // remember new hot team
1009 hot_teams[level].hot_team_nth = team->t.t_nproc;
1010 }
1011 } else {
1012 use_hot_team = 0;
1013 }
1014 }
1015 #else
1016 use_hot_team = team == root->r.r_hot_team;
1017 #endif
1018 if (!use_hot_team) {
1019
1020 /* install the master thread */
1021 team->t.t_threads[0] = master_th;
1022 __kmp_initialize_info(master_th, team, 0, master_gtid);
1023
1024 /* now, install the worker threads */
1025 for (i = 1; i < team->t.t_nproc; i++) {
1026
1027 /* fork or reallocate a new thread and install it in team */
1028 kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
1029 team->t.t_threads[i] = thr;
1030 KMP_DEBUG_ASSERT(thr);
1031 KMP_DEBUG_ASSERT(thr->th.th_team == team);
1032 /* align team and thread arrived states */
1033 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
1034 "T#%d(%d:%d) join =%llu, plain=%llu\n",
1035 __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
1036 __kmp_gtid_from_tid(i, team), team->t.t_id, i,
1037 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
1038 team->t.t_bar[bs_plain_barrier].b_arrived));
1039 thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1040 thr->th.th_teams_level = master_th->th.th_teams_level;
1041 thr->th.th_teams_size = master_th->th.th_teams_size;
1042 { // Initialize threads' barrier data.
1043 int b;
1044 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
1045 for (b = 0; b < bs_last_barrier; ++b) {
1046 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
1047 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1048 #if USE_DEBUGGER
1049 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
1050 #endif
1051 }
1052 }
1053 }
1054
1055 #if KMP_AFFINITY_SUPPORTED
1056 __kmp_partition_places(team);
1057 #endif
1058 }
1059
1060 if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
1061 for (i = 0; i < team->t.t_nproc; i++) {
1062 kmp_info_t *thr = team->t.t_threads[i];
1063 if (thr->th.th_prev_num_threads != team->t.t_nproc ||
1064 thr->th.th_prev_level != team->t.t_level) {
1065 team->t.t_display_affinity = 1;
1066 break;
1067 }
1068 }
1069 }
1070
1071 KMP_MB();
1072 }
1073
1074 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1075 // Propagate any changes to the floating point control registers out to the team
1076 // We try to avoid unnecessary writes to the relevant cache line in the team
1077 // structure, so we don't make changes unless they are needed.
1078 inline static void propagateFPControl(kmp_team_t *team) {
1079 if (__kmp_inherit_fp_control) {
1080 kmp_int16 x87_fpu_control_word;
1081 kmp_uint32 mxcsr;
1082
1083 // Get master values of FPU control flags (both X87 and vector)
1084 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1085 __kmp_store_mxcsr(&mxcsr);
1086 mxcsr &= KMP_X86_MXCSR_MASK;
1087
1088 // There is no point looking at t_fp_control_saved here.
1089 // If it is TRUE, we still have to update the values if they are different
1090 // from those we now have. If it is FALSE we didn't save anything yet, but
1091 // our objective is the same. We have to ensure that the values in the team
1092 // are the same as those we have.
1093 // So, this code achieves what we need whether or not t_fp_control_saved is
1094 // true. By checking whether the value needs updating we avoid unnecessary
1095 // writes that would put the cache-line into a written state, causing all
1096 // threads in the team to have to read it again.
1097 KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1098 KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1099 // Although we don't use this value, other code in the runtime wants to know
1100 // whether it should restore them. So we must ensure it is correct.
1101 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1102 } else {
1103 // Similarly here. Don't write to this cache-line in the team structure
1104 // unless we have to.
1105 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1106 }
1107 }
1108
1109 // Do the opposite, setting the hardware registers to the updated values from
1110 // the team.
1111 inline static void updateHWFPControl(kmp_team_t *team) {
1112 if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
1113 // Only reset the fp control regs if they have been changed in the team by
1114 // the parallel region that we are exiting.
1115 kmp_int16 x87_fpu_control_word;
1116 kmp_uint32 mxcsr;
1117 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
1118 __kmp_store_mxcsr(&mxcsr);
1119 mxcsr &= KMP_X86_MXCSR_MASK;
1120
1121 if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
1122 __kmp_clear_x87_fpu_status_word();
1123 __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
1124 }
1125
1126 if (team->t.t_mxcsr != mxcsr) {
1127 __kmp_load_mxcsr(&team->t.t_mxcsr);
1128 }
1129 }
1130 }
1131 #else
1132 #define propagateFPControl(x) ((void)0)
1133 #define updateHWFPControl(x) ((void)0)
1134 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1135
1136 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
1137 int realloc); // forward declaration
1138
1139 /* Run a parallel region that has been serialized, so runs only in a team of the
1140 single master thread. */
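/* Roughly: reuse (or allocate) this thread's cached serial team, push one more
   nesting level onto it, and point the thread's team, dispatch and task state
   at it, so enquiry functions see a one-thread team. */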
1141 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
1142 kmp_info_t *this_thr;
1143 kmp_team_t *serial_team;
1144
1145 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));
1146
1147 /* Skip all this code for autopar serialized loops since it results in
1148 unacceptable overhead */
1149 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
1150 return;
1151
1152 if (!TCR_4(__kmp_init_parallel))
1153 __kmp_parallel_initialize();
1154 __kmp_resume_if_soft_paused();
1155
1156 this_thr = __kmp_threads[global_tid];
1157 serial_team = this_thr->th.th_serial_team;
1158
1159 /* utilize the serialized team held by this thread */
1160 KMP_DEBUG_ASSERT(serial_team);
1161 KMP_MB();
1162
1163 if (__kmp_tasking_mode != tskm_immediate_exec) {
1164 KMP_DEBUG_ASSERT(
1165 this_thr->th.th_task_team ==
1166 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1167 KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
1168 NULL);
1169 KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
1170 "team %p, new task_team = NULL\n",
1171 global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
1172 this_thr->th.th_task_team = NULL;
1173 }
1174
1175 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1176 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1177 proc_bind = proc_bind_false;
1178 } else if (proc_bind == proc_bind_default) {
1179 // No proc_bind clause was specified, so use the current value
1180 // of proc-bind-var for this parallel region.
1181 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1182 }
1183 // Reset for next parallel region
1184 this_thr->th.th_set_proc_bind = proc_bind_default;
1185
1186 #if OMPT_SUPPORT
1187 ompt_data_t ompt_parallel_data = ompt_data_none;
1188 ompt_data_t *implicit_task_data;
1189 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
1190 if (ompt_enabled.enabled &&
1191 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1192
1193 ompt_task_info_t *parent_task_info;
1194 parent_task_info = OMPT_CUR_TASK_INFO(this_thr);
1195
1196 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1197 if (ompt_enabled.ompt_callback_parallel_begin) {
1198 int team_size = 1;
1199
1200 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1201 &(parent_task_info->task_data), &(parent_task_info->frame),
1202 &ompt_parallel_data, team_size,
1203 ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
1204 }
1205 }
1206 #endif // OMPT_SUPPORT
1207
1208 if (this_thr->th.th_team != serial_team) {
1209 // Nested level will be an index in the nested nthreads array
1210 int level = this_thr->th.th_team->t.t_level;
1211
1212 if (serial_team->t.t_serialized) {
1213 /* this serial team was already used
1214 TODO: increase performance by making these locks more specific */
1215 kmp_team_t *new_team;
1216
1217 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1218
1219 new_team =
1220 __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1221 #if OMPT_SUPPORT
1222 ompt_parallel_data,
1223 #endif
1224 proc_bind, &this_thr->th.th_current_task->td_icvs,
1225 0 USE_NESTED_HOT_ARG(NULL));
1226 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1227 KMP_ASSERT(new_team);
1228
1229 /* setup new serialized team and install it */
1230 new_team->t.t_threads[0] = this_thr;
1231 new_team->t.t_parent = this_thr->th.th_team;
1232 serial_team = new_team;
1233 this_thr->th.th_serial_team = serial_team;
1234
1235 KF_TRACE(
1236 10,
1237 ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1238 global_tid, serial_team));
1239
1240 /* TODO the above breaks the requirement that if we run out of resources,
1241 then we can still guarantee that serialized teams are ok, since we may
1242 need to allocate a new one */
1243 } else {
1244 KF_TRACE(
1245 10,
1246 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1247 global_tid, serial_team));
1248 }
1249
1250 /* we have to initialize this serial team */
1251 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1252 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1253 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
1254 serial_team->t.t_ident = loc;
1255 serial_team->t.t_serialized = 1;
1256 serial_team->t.t_nproc = 1;
1257 serial_team->t.t_parent = this_thr->th.th_team;
1258 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
1259 this_thr->th.th_team = serial_team;
1260 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;
1261
1262 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
1263 this_thr->th.th_current_task));
1264 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
1265 this_thr->th.th_current_task->td_flags.executing = 0;
1266
1267 __kmp_push_current_task_to_thread(this_thr, serial_team, 0);
1268
1269 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
1270 implicit task for each serialized task represented by
1271 team->t.t_serialized? */
1272 copy_icvs(&this_thr->th.th_current_task->td_icvs,
1273 &this_thr->th.th_current_task->td_parent->td_icvs);
1274
1275 // Thread value exists in the nested nthreads array for the next nested
1276 // level
1277 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1278 this_thr->th.th_current_task->td_icvs.nproc =
1279 __kmp_nested_nth.nth[level + 1];
1280 }
1281
1282 if (__kmp_nested_proc_bind.used &&
1283 (level + 1 < __kmp_nested_proc_bind.used)) {
1284 this_thr->th.th_current_task->td_icvs.proc_bind =
1285 __kmp_nested_proc_bind.bind_types[level + 1];
1286 }
1287
1288 #if USE_DEBUGGER
1289 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1290 #endif
1291 this_thr->th.th_info.ds.ds_tid = 0;
1292
1293 /* set thread cache values */
1294 this_thr->th.th_team_nproc = 1;
1295 this_thr->th.th_team_master = this_thr;
1296 this_thr->th.th_team_serialized = 1;
1297
1298 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
1299 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1300 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save
1301
1302 propagateFPControl(serial_team);
1303
1304 /* check if we need to allocate dispatch buffers stack */
1305 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1306 if (!serial_team->t.t_dispatch->th_disp_buffer) {
1307 serial_team->t.t_dispatch->th_disp_buffer =
1308 (dispatch_private_info_t *)__kmp_allocate(
1309 sizeof(dispatch_private_info_t));
1310 }
1311 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1312
1313 KMP_MB();
1314
1315 } else {
1316 /* this serialized team is already being used,
1317 * that's fine, just add another nested level */
1318 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
1319 KMP_DEBUG_ASSERT(serial_team->t.t_threads);
1320 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
1321 ++serial_team->t.t_serialized;
1322 this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1323
1324 // Nested level will be an index in the nested nthreads array
1325 int level = this_thr->th.th_team->t.t_level;
1326 // Thread value exists in the nested nthreads array for the next nested
1327 // level
1328 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
1329 this_thr->th.th_current_task->td_icvs.nproc =
1330 __kmp_nested_nth.nth[level + 1];
1331 }
1332 serial_team->t.t_level++;
1333 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
1334 "of serial team %p to %d\n",
1335 global_tid, serial_team, serial_team->t.t_level));
1336
1337 /* allocate/push dispatch buffers stack */
1338 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1339 {
1340 dispatch_private_info_t *disp_buffer =
1341 (dispatch_private_info_t *)__kmp_allocate(
1342 sizeof(dispatch_private_info_t));
1343 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1344 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1345 }
1346 this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1347
1348 KMP_MB();
1349 }
1350 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1351
1352 // Perform the display affinity functionality for
1353 // serialized parallel regions
1354 if (__kmp_display_affinity) {
1355 if (this_thr->th.th_prev_level != serial_team->t.t_level ||
1356 this_thr->th.th_prev_num_threads != 1) {
1357 // NULL means use the affinity-format-var ICV
1358 __kmp_aux_display_affinity(global_tid, NULL);
1359 this_thr->th.th_prev_level = serial_team->t.t_level;
1360 this_thr->th.th_prev_num_threads = 1;
1361 }
1362 }
1363
1364 if (__kmp_env_consistency_check)
1365 __kmp_push_parallel(global_tid, NULL);
1366 #if OMPT_SUPPORT
1367 serial_team->t.ompt_team_info.master_return_address = codeptr;
1368 if (ompt_enabled.enabled &&
1369 this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
1370 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1371
1372 ompt_lw_taskteam_t lw_taskteam;
1373 __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
1374 &ompt_parallel_data, codeptr);
1375
1376 __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
1377 // don't use lw_taskteam after linking. Content was swapped.
1378
1379 /* OMPT implicit task begin */
1380 implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
1381 if (ompt_enabled.ompt_callback_implicit_task) {
1382 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1383 ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
1384 OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial?
1385 OMPT_CUR_TASK_INFO(this_thr)
1386 ->thread_num = __kmp_tid_from_gtid(global_tid);
1387 }
1388
1389 /* OMPT state */
1390 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1391 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
1392 }
1393 #endif
1394 }
1395
1396 /* most of the work for a fork */
1397 /* return true if we really went parallel, false if serialized */
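/* Sketch of the flow below: make sure the runtime is initialized, capture the
   master thread's state, decide how many threads the region may use, then
   either serialize or allocate a team and release its workers from the fork
   barrier. */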
1398 int __kmp_fork_call(ident_t *loc, int gtid,
1399 enum fork_context_e call_context, // Intel, GNU, ...
1400 kmp_int32 argc, microtask_t microtask, launch_t invoker,
1401 kmp_va_list ap) {
1402 void **argv;
1403 int i;
1404 int master_tid;
1405 int master_this_cons;
1406 kmp_team_t *team;
1407 kmp_team_t *parent_team;
1408 kmp_info_t *master_th;
1409 kmp_root_t *root;
1410 int nthreads;
1411 int master_active;
1412 int master_set_numthreads;
1413 int level;
1414 int active_level;
1415 int teams_level;
1416 #if KMP_NESTED_HOT_TEAMS
1417 kmp_hot_team_ptr_t **p_hot_teams;
1418 #endif
1419 { // KMP_TIME_BLOCK
1420 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1421 KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1422
1423 KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
1424 if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
1425 /* Some systems prefer the stack for the root thread(s) to start with */
1426 /* some gap from the parent stack to prevent false sharing. */
1427 void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1428 /* These 2 lines below are so this does not get optimized out */
1429 if (__kmp_stkpadding > KMP_MAX_STKPADDING)
1430 __kmp_stkpadding += (short)((kmp_int64)dummy);
1431 }
1432
1433 /* initialize if needed */
1434 KMP_DEBUG_ASSERT(
1435 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
1436 if (!TCR_4(__kmp_init_parallel))
1437 __kmp_parallel_initialize();
1438 __kmp_resume_if_soft_paused();
1439
1440 /* setup current data */
1441 master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
1442 // shutdown
1443 parent_team = master_th->th.th_team;
1444 master_tid = master_th->th.th_info.ds.ds_tid;
1445 master_this_cons = master_th->th.th_local.this_construct;
1446 root = master_th->th.th_root;
1447 master_active = root->r.r_active;
1448 master_set_numthreads = master_th->th.th_set_nproc;
1449
1450 #if OMPT_SUPPORT
1451 ompt_data_t ompt_parallel_data = ompt_data_none;
1452 ompt_data_t *parent_task_data;
1453 ompt_frame_t *ompt_frame;
1454 ompt_data_t *implicit_task_data;
1455 void *return_address = NULL;
1456
1457 if (ompt_enabled.enabled) {
1458 __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
1459 NULL, NULL);
1460 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1461 }
1462 #endif
1463
1464 // Nested level will be an index in the nested nthreads array
1465 level = parent_team->t.t_level;
1466 // used to launch non-serial teams even if nested is not allowed
1467 active_level = parent_team->t.t_active_level;
1468 // needed to check nesting inside the teams
1469 teams_level = master_th->th.th_teams_level;
1470 #if KMP_NESTED_HOT_TEAMS
1471 p_hot_teams = &master_th->th.th_hot_teams;
1472 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
1473 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
1474 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1475 (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1476 // it is either actual or not needed (when active_level > 0)
1477 (*p_hot_teams)[0].hot_team_nth = 1;
1478 }
1479 #endif
1480
1481 #if OMPT_SUPPORT
1482 if (ompt_enabled.enabled) {
1483 if (ompt_enabled.ompt_callback_parallel_begin) {
1484 int team_size = master_set_numthreads
1485 ? master_set_numthreads
1486 : get__nproc_2(parent_team, master_tid);
1487 int flags = OMPT_INVOKER(call_context) |
1488 ((microtask == (microtask_t)__kmp_teams_master)
1489 ? ompt_parallel_league
1490 : ompt_parallel_team);
1491 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
1492 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
1493 return_address);
1494 }
1495 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1496 }
1497 #endif
1498
1499 master_th->th.th_ident = loc;
1500
1501 if (master_th->th.th_teams_microtask && ap &&
1502 microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
1503 // AC: This is start of parallel that is nested inside teams construct.
1504 // The team is actual (hot), all workers are ready at the fork barrier.
1505 // No lock needed to initialize the team a bit, then free workers.
1506 parent_team->t.t_ident = loc;
1507 __kmp_alloc_argv_entries(argc, parent_team, TRUE);
1508 parent_team->t.t_argc = argc;
1509 argv = (void **)parent_team->t.t_argv;
1510 for (i = argc - 1; i >= 0; --i)
1511 *argv++ = va_arg(kmp_va_deref(ap), void *);
1512 // Increment our nested depth level, but do not increase the serialization
1513 if (parent_team == master_th->th.th_serial_team) {
1514 // AC: we are in serialized parallel
1515 __kmpc_serialized_parallel(loc, gtid);
1516 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
1517
1518 if (call_context == fork_context_gnu) {
1519 // AC: need to decrement t_serialized for enquiry functions to work
1520 // correctly, will restore at join time
1521 parent_team->t.t_serialized--;
1522 return TRUE;
1523 }
1524
1525 #if OMPT_SUPPORT
1526 void *dummy;
1527 void **exit_frame_p;
1528
1529 ompt_lw_taskteam_t lw_taskteam;
1530
1531 if (ompt_enabled.enabled) {
1532 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1533 &ompt_parallel_data, return_address);
1534 exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);
1535
1536 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1537 // don't use lw_taskteam after linking. Content was swapped.
1538
1539 /* OMPT implicit task begin */
1540 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1541 if (ompt_enabled.ompt_callback_implicit_task) {
1542 OMPT_CUR_TASK_INFO(master_th)
1543 ->thread_num = __kmp_tid_from_gtid(gtid);
1544 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1545 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1546 implicit_task_data, 1,
1547 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1548 }
1549
1550 /* OMPT state */
1551 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1552 } else {
1553 exit_frame_p = &dummy;
1554 }
1555 #endif
1556 // AC: need to decrement t_serialized for enquiry functions to work
1557 // correctly, will restore at join time
1558 parent_team->t.t_serialized--;
1559
1560 {
1561 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1562 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1563 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
1564 #if OMPT_SUPPORT
1565 ,
1566 exit_frame_p
1567 #endif
1568 );
1569 }
1570
1571 #if OMPT_SUPPORT
1572 if (ompt_enabled.enabled) {
1573 *exit_frame_p = NULL;
1574 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
1575 if (ompt_enabled.ompt_callback_implicit_task) {
1576 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1577 ompt_scope_end, NULL, implicit_task_data, 1,
1578 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
1579 }
1580 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1581 __ompt_lw_taskteam_unlink(master_th);
1582 if (ompt_enabled.ompt_callback_parallel_end) {
1583 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1584 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
1585 OMPT_INVOKER(call_context) | ompt_parallel_team,
1586 return_address);
1587 }
1588 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1589 }
1590 #endif
1591 return TRUE;
1592 }
1593
1594 parent_team->t.t_pkfn = microtask;
1595 parent_team->t.t_invoke = invoker;
1596 KMP_ATOMIC_INC(&root->r.r_in_parallel);
1597 parent_team->t.t_active_level++;
1598 parent_team->t.t_level++;
1599 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save
1600
1601 #if OMPT_SUPPORT
1602 if (ompt_enabled.enabled) {
1603 ompt_lw_taskteam_t lw_taskteam;
1604 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1605 &ompt_parallel_data, return_address);
1606 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
1607 }
1608 #endif
1609
1610 /* Change number of threads in the team if requested */
1611 if (master_set_numthreads) { // The parallel has num_threads clause
1612 if (master_set_numthreads < master_th->th.th_teams_size.nth) {
1613 // AC: we can only reduce the number of threads dynamically, not increase it
1614 kmp_info_t **other_threads = parent_team->t.t_threads;
1615 parent_team->t.t_nproc = master_set_numthreads;
1616 for (i = 0; i < master_set_numthreads; ++i) {
1617 other_threads[i]->th.th_team_nproc = master_set_numthreads;
1618 }
1619 // Keep extra threads hot in the team for possible next parallels
1620 }
1621 master_th->th.th_set_nproc = 0;
1622 }
1623
1624 #if USE_DEBUGGER
1625 if (__kmp_debugging) { // Let debugger override number of threads.
1626 int nth = __kmp_omp_num_threads(loc);
1627 if (nth > 0) { // 0 means debugger doesn't want to change num threads
1628 master_set_numthreads = nth;
1629 }
1630 }
1631 #endif
1632
1633 #if USE_ITT_BUILD
1634 if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
1635 KMP_ITT_DEBUG) &&
1636 __kmp_forkjoin_frames_mode == 3 &&
1637 parent_team->t.t_active_level == 1 // only report frames at level 1
1638 && master_th->th.th_teams_size.nteams == 1) {
1639 kmp_uint64 tmp_time = __itt_get_timestamp();
1640 master_th->th.th_frame_time = tmp_time;
1641 parent_team->t.t_region_time = tmp_time;
1642 }
1643 if (__itt_stack_caller_create_ptr) {
1644 // create new stack stitching id before entering fork barrier
1645 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
1646 }
1647 #endif /* USE_ITT_BUILD */
1648
1649 KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
1650 "master_th=%p, gtid=%d\n",
1651 root, parent_team, master_th, gtid));
1652 __kmp_internal_fork(loc, gtid, parent_team);
1653 KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
1654 "master_th=%p, gtid=%d\n",
1655 root, parent_team, master_th, gtid));
1656
1657 if (call_context == fork_context_gnu)
1658 return TRUE;
1659
1660 /* Invoke microtask for MASTER thread */
1661 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
1662 parent_team->t.t_id, parent_team->t.t_pkfn));
1663
1664 if (!parent_team->t.t_invoke(gtid)) {
1665 KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
1666 }
1667 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
1668 parent_team->t.t_id, parent_team->t.t_pkfn));
1669 KMP_MB(); /* Flush all pending memory write invalidates. */
1670
1671 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
1672
1673 return TRUE;
1674 } // Parallel closely nested in teams construct
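// For reference, a minimal user-level sketch of the construct handled by the
// branch above (a parallel region closely nested inside a teams construct),
// assuming host teams support and the usual lowering of the pragmas onto this
// runtime:
//
//   #include <omp.h>
//   #include <stdio.h>
//   int main(void) {
//     #pragma omp teams num_teams(2)
//     #pragma omp parallel num_threads(3)
//     printf("team %d, thread %d\n", omp_get_team_num(), omp_get_thread_num());
//     return 0;
//   }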
1675
1676 #if KMP_DEBUG
1677 if (__kmp_tasking_mode != tskm_immediate_exec) {
1678 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
1679 parent_team->t.t_task_team[master_th->th.th_task_state]);
1680 }
1681 #endif
1682
1683 if (parent_team->t.t_active_level >=
1684 master_th->th.th_current_task->td_icvs.max_active_levels) {
1685 nthreads = 1;
1686 } else {
1687 int enter_teams = ((ap == NULL && active_level == 0) ||
1688 (ap && teams_level > 0 && teams_level == level));
1689 nthreads =
1690 master_set_numthreads
1691 ? master_set_numthreads
1692 : get__nproc_2(
1693 parent_team,
1694 master_tid); // TODO: get nproc directly from current task
1695
1696 // Check if we need to take the forkjoin lock (not needed for a serialized
1697 // parallel outside of a teams construct). This code was moved here from
1698 // __kmp_reserve_threads() to speed up nested serialized parallels.
1699 if (nthreads > 1) {
1700 if ((get__max_active_levels(master_th) == 1 &&
1701 (root->r.r_in_parallel && !enter_teams)) ||
1702 (__kmp_library == library_serial)) {
1703 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
1704 " threads\n",
1705 gtid, nthreads));
1706 nthreads = 1;
1707 }
1708 }
1709 if (nthreads > 1) {
1710 /* determine how many new threads we can use */
1711 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
1712 /* AC: If we execute teams from parallel region (on host), then teams
1713 should be created but each can only have 1 thread if nesting is
1714 disabled. If teams called from serial region, then teams and their
1715 threads should be created regardless of the nesting setting. */
1716 nthreads = __kmp_reserve_threads(root, parent_team, master_tid,
1717 nthreads, enter_teams);
1718 if (nthreads == 1) {
1719 // Free lock for single thread execution here; for multi-thread
1720 // execution it will be freed later after team of threads created
1721 // and initialized
1722 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
1723 }
1724 }
1725 }
1726 KMP_DEBUG_ASSERT(nthreads > 0);
1727
1728 // If we temporarily changed the set number of threads then restore it now
1729 master_th->th.th_set_nproc = 0;
1730
1731 /* create a serialized parallel region? */
1732 if (nthreads == 1) {
1733 /* josh todo: hypothetical question: what do we do for OS X*? */
1734 #if KMP_OS_LINUX && \
1735 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1736 void *args[argc];
1737 #else
1738 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *));
1739 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \
1740 KMP_ARCH_AARCH64) */
1741
1742 KA_TRACE(20,
1743 ("__kmp_fork_call: T#%d serializing parallel region\n", gtid));
1744
1745 __kmpc_serialized_parallel(loc, gtid);
1746
1747 if (call_context == fork_context_intel) {
1748 /* TODO: this is clumsy; have the compiler itself pass the args. */
1749 master_th->th.th_serial_team->t.t_ident = loc;
1750 if (!ap) {
1751 // revert change made in __kmpc_serialized_parallel()
1752 master_th->th.th_serial_team->t.t_level--;
1753 // Get args from parent team for teams construct
1754
1755 #if OMPT_SUPPORT
1756 void *dummy;
1757 void **exit_frame_p;
1758 ompt_task_info_t *task_info;
1759
1760 ompt_lw_taskteam_t lw_taskteam;
1761
1762 if (ompt_enabled.enabled) {
1763 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1764 &ompt_parallel_data, return_address);
1765
1766 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1767 // don't use lw_taskteam after linking; its content was swapped
1768
1769 task_info = OMPT_CUR_TASK_INFO(master_th);
1770 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1771 if (ompt_enabled.ompt_callback_implicit_task) {
1772 OMPT_CUR_TASK_INFO(master_th)
1773 ->thread_num = __kmp_tid_from_gtid(gtid);
1774 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1775 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1776 &(task_info->task_data), 1,
1777 OMPT_CUR_TASK_INFO(master_th)->thread_num,
1778 ompt_task_implicit);
1779 }
1780
1781 /* OMPT state */
1782 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1783 } else {
1784 exit_frame_p = &dummy;
1785 }
1786 #endif
1787
1788 {
1789 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1790 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1791 __kmp_invoke_microtask(microtask, gtid, 0, argc,
1792 parent_team->t.t_argv
1793 #if OMPT_SUPPORT
1794 ,
1795 exit_frame_p
1796 #endif
1797 );
1798 }
1799
1800 #if OMPT_SUPPORT
1801 if (ompt_enabled.enabled) {
1802 *exit_frame_p = NULL;
1803 if (ompt_enabled.ompt_callback_implicit_task) {
1804 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1805 ompt_scope_end, NULL, &(task_info->task_data), 1,
1806 OMPT_CUR_TASK_INFO(master_th)->thread_num,
1807 ompt_task_implicit);
1808 }
1809 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1810 __ompt_lw_taskteam_unlink(master_th);
1811 if (ompt_enabled.ompt_callback_parallel_end) {
1812 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1813 &ompt_parallel_data, parent_task_data,
1814 OMPT_INVOKER(call_context) | ompt_parallel_team,
1815 return_address);
1816 }
1817 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1818 }
1819 #endif
1820 } else if (microtask == (microtask_t)__kmp_teams_master) {
1821 KMP_DEBUG_ASSERT(master_th->th.th_team ==
1822 master_th->th.th_serial_team);
1823 team = master_th->th.th_team;
1824 // team->t.t_pkfn = microtask;
1825 team->t.t_invoke = invoker;
1826 __kmp_alloc_argv_entries(argc, team, TRUE);
1827 team->t.t_argc = argc;
1828 argv = (void **)team->t.t_argv;
1829 if (ap) {
1830 for (i = argc - 1; i >= 0; --i)
1831 *argv++ = va_arg(kmp_va_deref(ap), void *);
1832 } else {
1833 for (i = 0; i < argc; ++i)
1834 // Get args from parent team for teams construct
1835 argv[i] = parent_team->t.t_argv[i];
1836 }
1837 // AC: revert change made in __kmpc_serialized_parallel()
1838 // because initial code in teams should have level=0
1839 team->t.t_level--;
1840 // AC: call special invoker for outer "parallel" of teams construct
1841 invoker(gtid);
1842 #if OMPT_SUPPORT
1843 if (ompt_enabled.enabled) {
1844 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th);
1845 if (ompt_enabled.ompt_callback_implicit_task) {
1846 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1847 ompt_scope_end, NULL, &(task_info->task_data), 0,
1848 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial);
1849 }
1850 if (ompt_enabled.ompt_callback_parallel_end) {
1851 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1852 &ompt_parallel_data, parent_task_data,
1853 OMPT_INVOKER(call_context) | ompt_parallel_league,
1854 return_address);
1855 }
1856 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1857 }
1858 #endif
1859 } else {
1860 argv = args;
1861 for (i = argc - 1; i >= 0; --i)
1862 *argv++ = va_arg(kmp_va_deref(ap), void *);
1863 KMP_MB();
1864
1865 #if OMPT_SUPPORT
1866 void *dummy;
1867 void **exit_frame_p;
1868 ompt_task_info_t *task_info;
1869
1870 ompt_lw_taskteam_t lw_taskteam;
1871
1872 if (ompt_enabled.enabled) {
1873 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1874 &ompt_parallel_data, return_address);
1875 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1876 // don't use lw_taskteam after linking; its content was swapped
1877 task_info = OMPT_CUR_TASK_INFO(master_th);
1878 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1879
1880 /* OMPT implicit task begin */
1881 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1882 if (ompt_enabled.ompt_callback_implicit_task) {
1883 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1884 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1885 implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1886 ompt_task_implicit);
1887 OMPT_CUR_TASK_INFO(master_th)
1888 ->thread_num = __kmp_tid_from_gtid(gtid);
1889 }
1890
1891 /* OMPT state */
1892 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1893 } else {
1894 exit_frame_p = &dummy;
1895 }
1896 #endif
1897
1898 {
1899 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1900 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1901 __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1902 #if OMPT_SUPPORT
1903 ,
1904 exit_frame_p
1905 #endif
1906 );
1907 }
1908
1909 #if OMPT_SUPPORT
1910 if (ompt_enabled.enabled) {
1911 *exit_frame_p = NULL;
1912 if (ompt_enabled.ompt_callback_implicit_task) {
1913 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1914 ompt_scope_end, NULL, &(task_info->task_data), 1,
1915 OMPT_CUR_TASK_INFO(master_th)->thread_num,
1916 ompt_task_implicit);
1917 }
1918
1919 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1920 __ompt_lw_taskteam_unlink(master_th);
1921 if (ompt_enabled.ompt_callback_parallel_end) {
1922 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1923 &ompt_parallel_data, parent_task_data,
1924 OMPT_INVOKER(call_context) | ompt_parallel_team,
1925 return_address);
1926 }
1927 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1928 }
1929 #endif
1930 }
1931 } else if (call_context == fork_context_gnu) {
1932 #if OMPT_SUPPORT
1933 ompt_lw_taskteam_t lwt;
1934 __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1935 return_address);
1936
1937 lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1938 __ompt_lw_taskteam_link(&lwt, master_th, 1);
1939 // don't use lw_taskteam after linking; its content was swapped
1940 #endif
1941
1942 // we were called from GNU native code
1943 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1944 return FALSE;
1945 } else {
1946 KMP_ASSERT2(call_context < fork_context_last,
1947 "__kmp_fork_call: unknown fork_context parameter");
1948 }
1949
1950 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1951 KMP_MB();
1952 return FALSE;
1953 } // if (nthreads == 1)
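// A hedged user-level sketch of a request that ends up in the (nthreads == 1)
// branch above: with max-active-levels limited to 1, the inner region below is
// serialized and runs on the encountering thread alone.
//
//   #include <omp.h>
//   #include <stdio.h>
//   int main(void) {
//     omp_set_max_active_levels(1);           // make the nesting limit explicit
//     #pragma omp parallel num_threads(4)
//     {
//       #pragma omp parallel num_threads(4)   // serialized: team size is 1
//       if (omp_get_thread_num() == 0)
//         printf("inner team size = %d\n", omp_get_num_threads());
//     }
//     return 0;
//   }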
1954
1955 // GEH: only modify the executing flag in the case when not serialized
1956 // serialized case is handled in kmpc_serialized_parallel
1957 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1958 "curtask=%p, curtask_max_aclevel=%d\n",
1959 parent_team->t.t_active_level, master_th,
1960 master_th->th.th_current_task,
1961 master_th->th.th_current_task->td_icvs.max_active_levels));
1962 // TODO: GEH - cannot do this assertion because root thread not set up as
1963 // executing
1964 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1965 master_th->th.th_current_task->td_flags.executing = 0;
1966
1967 if (!master_th->th.th_teams_microtask || level > teams_level) {
1968 /* Increment our nested depth level */
1969 KMP_ATOMIC_INC(&root->r.r_in_parallel);
1970 }
1971
1972 // See if we need to make a copy of the ICVs.
1973 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1974 if ((level + 1 < __kmp_nested_nth.used) &&
1975 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1976 nthreads_icv = __kmp_nested_nth.nth[level + 1];
1977 } else {
1978 nthreads_icv = 0; // don't update
1979 }
1980
1981 // Figure out the proc_bind_policy for the new team.
1982 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1983 kmp_proc_bind_t proc_bind_icv =
1984 proc_bind_default; // proc_bind_default means don't update
1985 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1986 proc_bind = proc_bind_false;
1987 } else {
1988 if (proc_bind == proc_bind_default) {
1989 // No proc_bind clause specified; use current proc-bind-var for this
1990 // parallel region
1991 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1992 }
1993 /* else: The proc_bind policy was specified explicitly on parallel clause.
1994 This overrides proc-bind-var for this parallel region, but does not
1995 change proc-bind-var. */
1996 // Figure the value of proc-bind-var for the child threads.
1997 if ((level + 1 < __kmp_nested_proc_bind.used) &&
1998 (__kmp_nested_proc_bind.bind_types[level + 1] !=
1999 master_th->th.th_current_task->td_icvs.proc_bind)) {
2000 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1];
2001 }
2002 }
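// Illustrative precedence, as implemented above: a proc_bind clause on the
// parallel directive overrides proc-bind-var for that region only, while the
// OMP_PROC_BIND list (e.g. "spread,close") seeds the per-level defaults read
// from __kmp_nested_proc_bind. A hedged user-level example:
//
//   #include <omp.h>
//   #include <stdio.h>
//   int main(void) {
//     #pragma omp parallel proc_bind(spread) num_threads(4)
//     printf("thread %d bound at place %d\n", omp_get_thread_num(),
//            omp_get_place_num());
//     return 0;
//   }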
2003
2004 // Reset for next parallel region
2005 master_th->th.th_set_proc_bind = proc_bind_default;
2006
2007 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) {
2008 kmp_internal_control_t new_icvs;
2009 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
2010 new_icvs.next = NULL;
2011 if (nthreads_icv > 0) {
2012 new_icvs.nproc = nthreads_icv;
2013 }
2014 if (proc_bind_icv != proc_bind_default) {
2015 new_icvs.proc_bind = proc_bind_icv;
2016 }
2017
2018 /* allocate a new parallel team */
2019 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2020 team = __kmp_allocate_team(root, nthreads, nthreads,
2021 #if OMPT_SUPPORT
2022 ompt_parallel_data,
2023 #endif
2024 proc_bind, &new_icvs,
2025 argc USE_NESTED_HOT_ARG(master_th));
2026 } else {
2027 /* allocate a new parallel team */
2028 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n"));
2029 team = __kmp_allocate_team(root, nthreads, nthreads,
2030 #if OMPT_SUPPORT
2031 ompt_parallel_data,
2032 #endif
2033 proc_bind,
2034 &master_th->th.th_current_task->td_icvs,
2035 argc USE_NESTED_HOT_ARG(master_th));
2036 }
2037 KF_TRACE(
2038 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team));
2039
2040 /* setup the new team */
2041 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2042 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2043 KMP_CHECK_UPDATE(team->t.t_ident, loc);
2044 KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2045 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2046 #if OMPT_SUPPORT
2047 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address,
2048 return_address);
2049 #endif
2050 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe
2051 // TODO: parent_team->t.t_level == INT_MAX ???
2052 if (!master_th->th.th_teams_microtask || level > teams_level) {
2053 int new_level = parent_team->t.t_level + 1;
2054 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2055 new_level = parent_team->t.t_active_level + 1;
2056 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2057 } else {
2058 // AC: Do not increase parallel level at start of the teams construct
2059 int new_level = parent_team->t.t_level;
2060 KMP_CHECK_UPDATE(team->t.t_level, new_level);
2061 new_level = parent_team->t.t_active_level;
2062 KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2063 }
2064 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2065 // set master's schedule as new run-time schedule
2066 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
2067
2068 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2069 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator);
2070
2071 // Update the floating point rounding in the team if required.
2072 propagateFPControl(team);
2073
2074 if (__kmp_tasking_mode != tskm_immediate_exec) {
2075 // Set master's task team to team's task team. Unless this is hot team, it
2076 // should be NULL.
2077 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2078 parent_team->t.t_task_team[master_th->th.th_task_state]);
2079 KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team "
2080 "%p, new task_team %p / team %p\n",
2081 __kmp_gtid_from_thread(master_th),
2082 master_th->th.th_task_team, parent_team,
2083 team->t.t_task_team[master_th->th.th_task_state], team));
2084
2085 if (active_level || master_th->th.th_task_team) {
2086 // Take a memo of master's task_state
2087 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2088 if (master_th->th.th_task_state_top >=
2089 master_th->th.th_task_state_stack_sz) { // increase size
2090 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz;
2091 kmp_uint8 *old_stack, *new_stack;
2092 kmp_uint32 i;
2093 new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2094 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) {
2095 new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2096 }
2097 for (i = master_th->th.th_task_state_stack_sz; i < new_size;
2098 ++i) { // zero-init rest of stack
2099 new_stack[i] = 0;
2100 }
2101 old_stack = master_th->th.th_task_state_memo_stack;
2102 master_th->th.th_task_state_memo_stack = new_stack;
2103 master_th->th.th_task_state_stack_sz = new_size;
2104 __kmp_free(old_stack);
2105 }
2106 // Store master's task_state on stack
2107 master_th->th
2108 .th_task_state_memo_stack[master_th->th.th_task_state_top] =
2109 master_th->th.th_task_state;
2110 master_th->th.th_task_state_top++;
2111 #if KMP_NESTED_HOT_TEAMS
2112 if (master_th->th.th_hot_teams &&
2113 active_level < __kmp_hot_teams_max_level &&
2114 team == master_th->th.th_hot_teams[active_level].hot_team) {
2115 // Restore master's nested state if nested hot team
2116 master_th->th.th_task_state =
2117 master_th->th
2118 .th_task_state_memo_stack[master_th->th.th_task_state_top];
2119 } else {
2120 #endif
2121 master_th->th.th_task_state = 0;
2122 #if KMP_NESTED_HOT_TEAMS
2123 }
2124 #endif
2125 }
2126 #if !KMP_NESTED_HOT_TEAMS
2127 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) ||
2128 (team == root->r.r_hot_team));
2129 #endif
2130 }
2131
2132 KA_TRACE(
2133 20,
2134 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2135 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id,
2136 team->t.t_nproc));
2137 KMP_DEBUG_ASSERT(team != root->r.r_hot_team ||
2138 (team->t.t_master_tid == 0 &&
2139 (team->t.t_parent == root->r.r_root_team ||
2140 team->t.t_parent->t.t_serialized)));
2141 KMP_MB();
2142
2143 /* now, setup the arguments */
2144 argv = (void **)team->t.t_argv;
2145 if (ap) {
2146 for (i = argc - 1; i >= 0; --i) {
2147 void *new_argv = va_arg(kmp_va_deref(ap), void *);
2148 KMP_CHECK_UPDATE(*argv, new_argv);
2149 argv++;
2150 }
2151 } else {
2152 for (i = 0; i < argc; ++i) {
2153 // Get args from parent team for teams construct
2154 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2155 }
2156 }
2157
2158 /* now actually fork the threads */
2159 KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2160 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2161 root->r.r_active = TRUE;
2162
2163 __kmp_fork_team_threads(root, team, master_th, gtid);
2164 __kmp_setup_icv_copy(team, nthreads,
2165 &master_th->th.th_current_task->td_icvs, loc);
2166
2167 #if OMPT_SUPPORT
2168 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2169 #endif
2170
2171 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2172
2173 #if USE_ITT_BUILD
2174 if (team->t.t_active_level == 1 // only report frames at level 1
2175 && !master_th->th.th_teams_microtask) { // not in teams construct
2176 #if USE_ITT_NOTIFY
2177 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2178 (__kmp_forkjoin_frames_mode == 3 ||
2179 __kmp_forkjoin_frames_mode == 1)) {
2180 kmp_uint64 tmp_time = 0;
2181 if (__itt_get_timestamp_ptr)
2182 tmp_time = __itt_get_timestamp();
2183 // Internal fork - report frame begin
2184 master_th->th.th_frame_time = tmp_time;
2185 if (__kmp_forkjoin_frames_mode == 3)
2186 team->t.t_region_time = tmp_time;
2187 } else
2188 // only one notification scheme (either "submit" or "forking/joined", not both)
2189 #endif /* USE_ITT_NOTIFY */
2190 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) &&
2191 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) {
2192 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer.
2193 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2194 }
2195 }
2196 #endif /* USE_ITT_BUILD */
2197
2198 /* now go on and do the work */
2199 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team);
2200 KMP_MB();
2201 KF_TRACE(10,
2202 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2203 root, team, master_th, gtid));
2204
2205 #if USE_ITT_BUILD
2206 if (__itt_stack_caller_create_ptr) {
2207 team->t.t_stack_id =
2208 __kmp_itt_stack_caller_create(); // create new stack stitching id
2209 // before entering fork barrier
2210 }
2211 #endif /* USE_ITT_BUILD */
2212
2213 // AC: skip __kmp_internal_fork at teams construct, let only master
2214 // threads execute
2215 if (ap) {
2216 __kmp_internal_fork(loc, gtid, team);
2217 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, "
2218 "master_th=%p, gtid=%d\n",
2219 root, team, master_th, gtid));
2220 }
2221
2222 if (call_context == fork_context_gnu) {
2223 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2224 return TRUE;
2225 }
2226
2227 /* Invoke microtask for MASTER thread */
2228 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
2229 team->t.t_id, team->t.t_pkfn));
2230 } // END of timer KMP_fork_call block
2231
2232 #if KMP_STATS_ENABLED
2233 // If beginning a teams construct, then change thread state
2234 stats_state_e previous_state = KMP_GET_THREAD_STATE();
2235 if (!ap) {
2236 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION);
2237 }
2238 #endif
2239
2240 if (!team->t.t_invoke(gtid)) {
2241 KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
2242 }
2243
2244 #if KMP_STATS_ENABLED
2245 // If was beginning of a teams construct, then reset thread state
2246 if (!ap) {
2247 KMP_SET_THREAD_STATE(previous_state);
2248 }
2249 #endif
2250
2251 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
2252 team->t.t_id, team->t.t_pkfn));
2253 KMP_MB(); /* Flush all pending memory write invalidates. */
2254
2255 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));
2256
2257 #if OMPT_SUPPORT
2258 if (ompt_enabled.enabled) {
2259 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2260 }
2261 #endif
2262
2263 return TRUE;
2264 }
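// For orientation, a hedged sketch of how a plain parallel construct reaches
// the routine above: the compiler outlines the region body into a microtask
// and calls the runtime entry point (__kmpc_fork_call in the Intel/LLVM
// interface), which forks the team through __kmp_fork_call() and later joins
// it through __kmp_join_call() below.
//
//   #include <omp.h>
//   #include <stdio.h>
//   int main(void) {
//     #pragma omp parallel num_threads(4)     // fork here, join at region end
//     printf("hello from %d of %d\n", omp_get_thread_num(),
//            omp_get_num_threads());
//     return 0;
//   }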
2265
2266 #if OMPT_SUPPORT
2267 static inline void __kmp_join_restore_state(kmp_info_t *thread,
2268 kmp_team_t *team) {
2269 // restore state outside the region
2270 thread->th.ompt_thread_info.state =
2271 ((team->t.t_serialized) ? ompt_state_work_serial
2272 : ompt_state_work_parallel);
2273 }
2274
2275 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread,
2276 kmp_team_t *team, ompt_data_t *parallel_data,
2277 int flags, void *codeptr) {
2278 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2279 if (ompt_enabled.ompt_callback_parallel_end) {
2280 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
2281 parallel_data, &(task_info->task_data), flags, codeptr);
2282 }
2283
2284 task_info->frame.enter_frame = ompt_data_none;
2285 __kmp_join_restore_state(thread, team);
2286 }
2287 #endif
2288
2289 void __kmp_join_call(ident_t *loc, int gtid
2290 #if OMPT_SUPPORT
2291 ,
2292 enum fork_context_e fork_context
2293 #endif
2294 ,
2295 int exit_teams) {
2296 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2297 kmp_team_t *team;
2298 kmp_team_t *parent_team;
2299 kmp_info_t *master_th;
2300 kmp_root_t *root;
2301 int master_active;
2302
2303 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid));
2304
2305 /* setup current data */
2306 master_th = __kmp_threads[gtid];
2307 root = master_th->th.th_root;
2308 team = master_th->th.th_team;
2309 parent_team = team->t.t_parent;
2310
2311 master_th->th.th_ident = loc;
2312
2313 #if OMPT_SUPPORT
2314 void *team_microtask = (void *)team->t.t_pkfn;
2315 // For GOMP interface with serialized parallel, need the
2316 // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task
2317 // and end-parallel events.
2318 if (ompt_enabled.enabled &&
2319 !(team->t.t_serialized && fork_context == fork_context_gnu)) {
2320 master_th->th.ompt_thread_info.state = ompt_state_overhead;
2321 }
2322 #endif
2323
2324 #if KMP_DEBUG
2325 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2326 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, "
2327 "th_task_team = %p\n",
2328 __kmp_gtid_from_thread(master_th), team,
2329 team->t.t_task_team[master_th->th.th_task_state],
2330 master_th->th.th_task_team));
2331 KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
2332 team->t.t_task_team[master_th->th.th_task_state]);
2333 }
2334 #endif
2335
2336 if (team->t.t_serialized) {
2337 if (master_th->th.th_teams_microtask) {
2338 // We are in teams construct
2339 int level = team->t.t_level;
2340 int tlevel = master_th->th.th_teams_level;
2341 if (level == tlevel) {
2342 // AC: we haven't incremented it earlier at start of teams construct,
2343 // so do it here - at the end of teams construct
2344 team->t.t_level++;
2345 } else if (level == tlevel + 1) {
2346 // AC: we are exiting parallel inside teams, need to increment
2347 // serialization in order to restore it in the next call to
2348 // __kmpc_end_serialized_parallel
2349 team->t.t_serialized++;
2350 }
2351 }
2352 __kmpc_end_serialized_parallel(loc, gtid);
2353
2354 #if OMPT_SUPPORT
2355 if (ompt_enabled.enabled) {
2356 __kmp_join_restore_state(master_th, parent_team);
2357 }
2358 #endif
2359
2360 return;
2361 }
2362
2363 master_active = team->t.t_master_active;
2364
2365 if (!exit_teams) {
2366 // AC: No barrier for internal teams at exit from teams construct.
2367 // But there is barrier for external team (league).
2368 __kmp_internal_join(loc, gtid, team);
2369 } else {
2370 master_th->th.th_task_state =
2371 0; // AC: no tasking in teams (out of any parallel)
2372 }
2373
2374 KMP_MB();
2375
2376 #if OMPT_SUPPORT
2377 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data);
2378 void *codeptr = team->t.ompt_team_info.master_return_address;
2379 #endif
2380
2381 #if USE_ITT_BUILD
2382 if (__itt_stack_caller_create_ptr) {
2383 // destroy the stack stitching id after join barrier
2384 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id);
2385 }
2386 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer.
2387 if (team->t.t_active_level == 1 &&
2388 (!master_th->th.th_teams_microtask || /* not in teams construct */
2389 master_th->th.th_teams_size.nteams == 1)) {
2390 master_th->th.th_ident = loc;
2391 // only one notification scheme (either "submit" or "forking/joined", not
2392 // both)
2393 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
2394 __kmp_forkjoin_frames_mode == 3)
2395 __kmp_itt_frame_submit(gtid, team->t.t_region_time,
2396 master_th->th.th_frame_time, 0, loc,
2397 master_th->th.th_team_nproc, 1);
2398 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) &&
2399 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames)
2400 __kmp_itt_region_joined(gtid);
2401 } // active_level == 1
2402 #endif /* USE_ITT_BUILD */
2403
2404 if (master_th->th.th_teams_microtask && !exit_teams &&
2405 team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2406 team->t.t_level == master_th->th.th_teams_level + 1) {
2407 // AC: We need to leave the team structure intact at the end of parallel
2408 // inside the teams construct, so that at the next parallel same (hot) team
2409 // works, only adjust nesting levels
2410 #if OMPT_SUPPORT
2411 ompt_data_t ompt_parallel_data = ompt_data_none;
2412 if (ompt_enabled.enabled) {
2413 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2414 if (ompt_enabled.ompt_callback_implicit_task) {
2415 int ompt_team_size = team->t.t_nproc;
2416 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2417 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2418 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
2419 }
2420 task_info->frame.exit_frame = ompt_data_none;
2421 task_info->task_data = ompt_data_none;
2422 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
2423 __ompt_lw_taskteam_unlink(master_th);
2424 }
2425 #endif
2426 /* Decrement our nested depth level */
2427 team->t.t_level--;
2428 team->t.t_active_level--;
2429 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2430
2431 // Restore number of threads in the team if needed. This code relies on
2432 // the proper adjustment of th_teams_size.nth after the fork in
2433 // __kmp_teams_master on each teams master in the case that
2434 // __kmp_reserve_threads reduced it.
2435 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) {
2436 int old_num = master_th->th.th_team_nproc;
2437 int new_num = master_th->th.th_teams_size.nth;
2438 kmp_info_t **other_threads = team->t.t_threads;
2439 team->t.t_nproc = new_num;
2440 for (int i = 0; i < old_num; ++i) {
2441 other_threads[i]->th.th_team_nproc = new_num;
2442 }
2443 // Adjust states of non-used threads of the team
2444 for (int i = old_num; i < new_num; ++i) {
2445 // Re-initialize thread's barrier data.
2446 KMP_DEBUG_ASSERT(other_threads[i]);
2447 kmp_balign_t *balign = other_threads[i]->th.th_bar;
2448 for (int b = 0; b < bs_last_barrier; ++b) {
2449 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
2450 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2451 #if USE_DEBUGGER
2452 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
2453 #endif
2454 }
2455 if (__kmp_tasking_mode != tskm_immediate_exec) {
2456 // Synchronize thread's task state
2457 other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2458 }
2459 }
2460 }
2461
2462 #if OMPT_SUPPORT
2463 if (ompt_enabled.enabled) {
2464 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data,
2465 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr);
2466 }
2467 #endif
2468
2469 return;
2470 }
2471
2472 /* do cleanup and restore the parent team */
2473 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2474 master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2475
2476 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid];
2477
2478 /* jc: The following lock has instructions with REL and ACQ semantics,
2479 separating the parallel user code called in this parallel region
2480 from the serial user code called after this function returns. */
2481 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2482
2483 if (!master_th->th.th_teams_microtask ||
2484 team->t.t_level > master_th->th.th_teams_level) {
2485 /* Decrement our nested depth level */
2486 KMP_ATOMIC_DEC(&root->r.r_in_parallel);
2487 }
2488 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0);
2489
2490 #if OMPT_SUPPORT
2491 if (ompt_enabled.enabled) {
2492 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
2493 if (ompt_enabled.ompt_callback_implicit_task) {
2494 int flags = (team_microtask == (void *)__kmp_teams_master)
2495 ? ompt_task_initial
2496 : ompt_task_implicit;
2497 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc;
2498 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2499 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size,
2500 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags);
2501 }
2502 task_info->frame.exit_frame = ompt_data_none;
2503 task_info->task_data = ompt_data_none;
2504 }
2505 #endif
2506
2507 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0,
2508 master_th, team));
2509 __kmp_pop_current_task_from_thread(master_th);
2510
2511 #if KMP_AFFINITY_SUPPORTED
2512 // Restore master thread's partition.
2513 master_th->th.th_first_place = team->t.t_first_place;
2514 master_th->th.th_last_place = team->t.t_last_place;
2515 #endif // KMP_AFFINITY_SUPPORTED
2516 master_th->th.th_def_allocator = team->t.t_def_allocator;
2517
2518 updateHWFPControl(team);
2519
2520 if (root->r.r_active != master_active)
2521 root->r.r_active = master_active;
2522
2523 __kmp_free_team(root, team USE_NESTED_HOT_ARG(
2524 master_th)); // this will free worker threads
2525
2526 /* This race was fun to find. Make sure the following is inside the critical
2527 region; otherwise assertions may fail occasionally, since the old team may be
2528 reallocated and the hierarchy appears inconsistent. It is actually safe to
2529 run and won't cause any bugs, but it will trigger those assertion failures.
2530 It's only one deref & assign, so we might as well keep it in the critical region. */
2531 master_th->th.th_team = parent_team;
2532 master_th->th.th_team_nproc = parent_team->t.t_nproc;
2533 master_th->th.th_team_master = parent_team->t.t_threads[0];
2534 master_th->th.th_team_serialized = parent_team->t.t_serialized;
2535
2536 /* restore serialized team, if need be */
2537 if (parent_team->t.t_serialized &&
2538 parent_team != master_th->th.th_serial_team &&
2539 parent_team != root->r.r_root_team) {
2540 __kmp_free_team(root,
2541 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL));
2542 master_th->th.th_serial_team = parent_team;
2543 }
2544
2545 if (__kmp_tasking_mode != tskm_immediate_exec) {
2546 if (master_th->th.th_task_state_top >
2547 0) { // Restore task state from memo stack
2548 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2549 // Remember master's state if we re-use this nested hot team
2550 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] =
2551 master_th->th.th_task_state;
2552 --master_th->th.th_task_state_top; // pop
2553 // Now restore state at this level
2554 master_th->th.th_task_state =
2555 master_th->th
2556 .th_task_state_memo_stack[master_th->th.th_task_state_top];
2557 }
2558 // Copy the task team from the parent team to the master thread
2559 master_th->th.th_task_team =
2560 parent_team->t.t_task_team[master_th->th.th_task_state];
2561 KA_TRACE(20,
2562 ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2563 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team,
2564 parent_team));
2565 }
2566
2567 // TODO: GEH - cannot do this assertion because root thread not set up as
2568 // executing
2569 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
2570 master_th->th.th_current_task->td_flags.executing = 1;
2571
2572 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2573
2574 #if OMPT_SUPPORT
2575 int flags =
2576 OMPT_INVOKER(fork_context) |
2577 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league
2578 : ompt_parallel_team);
2579 if (ompt_enabled.enabled) {
2580 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags,
2581 codeptr);
2582 }
2583 #endif
2584
2585 KMP_MB();
2586 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid));
2587 }
2588
2589 /* Check whether we should push an internal control record onto the
2590 serial team stack. If so, do it. */
2591 void __kmp_save_internal_controls(kmp_info_t *thread) {
2592
2593 if (thread->th.th_team != thread->th.th_serial_team) {
2594 return;
2595 }
2596 if (thread->th.th_team->t.t_serialized > 1) {
2597 int push = 0;
2598
2599 if (thread->th.th_team->t.t_control_stack_top == NULL) {
2600 push = 1;
2601 } else {
2602 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2603 thread->th.th_team->t.t_serialized) {
2604 push = 1;
2605 }
2606 }
2607 if (push) { /* push a record on the serial team's stack */
2608 kmp_internal_control_t *control =
2609 (kmp_internal_control_t *)__kmp_allocate(
2610 sizeof(kmp_internal_control_t));
2611
2612 copy_icvs(control, &thread->th.th_current_task->td_icvs);
2613
2614 control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2615
2616 control->next = thread->th.th_team->t.t_control_stack_top;
2617 thread->th.th_team->t.t_control_stack_top = control;
2618 }
2619 }
2620 }
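// A hedged sketch of the user-visible behavior this control stack supports:
// an ICV changed inside a nested (serialized) region is restored when that
// region ends, so outer levels keep their original values.
//
//   #include <omp.h>
//   #include <stdio.h>
//   int main(void) {
//     omp_set_num_threads(4);
//     #pragma omp parallel num_threads(1)     // serialized level 1
//     #pragma omp parallel num_threads(1)     // serialized level 2
//     omp_set_num_threads(2);                 // change is scoped to this level
//     printf("nthreads-var is %d again\n", omp_get_max_threads());  // prints 4
//     return 0;
//   }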
2621
2622 /* Changes set_nproc */
2623 void __kmp_set_num_threads(int new_nth, int gtid) {
2624 kmp_info_t *thread;
2625 kmp_root_t *root;
2626
2627 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth));
2628 KMP_DEBUG_ASSERT(__kmp_init_serial);
2629
2630 if (new_nth < 1)
2631 new_nth = 1;
2632 else if (new_nth > __kmp_max_nth)
2633 new_nth = __kmp_max_nth;
2634
2635 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2636 thread = __kmp_threads[gtid];
2637 if (thread->th.th_current_task->td_icvs.nproc == new_nth)
2638 return; // nothing to do
2639
2640 __kmp_save_internal_controls(thread);
2641
2642 set__nproc(thread, new_nth);
2643
2644 // If this omp_set_num_threads() call will cause the hot team size to be
2645 // reduced (in the absence of a num_threads clause), then reduce it now,
2646 // rather than waiting for the next parallel region.
2647 root = thread->th.th_root;
2648 if (__kmp_init_parallel && (!root->r.r_active) &&
2649 (root->r.r_hot_team->t.t_nproc > new_nth)
2650 #if KMP_NESTED_HOT_TEAMS
2651 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2652 #endif
2653 ) {
2654 kmp_team_t *hot_team = root->r.r_hot_team;
2655 int f;
2656
2657 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
2658
2659 // Release the extra threads we don't need any more.
2660 for (f = new_nth; f < hot_team->t.t_nproc; f++) {
2661 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2662 if (__kmp_tasking_mode != tskm_immediate_exec) {
2663 // When decreasing team size, threads no longer in the team should unref
2664 // task team.
2665 hot_team->t.t_threads[f]->th.th_task_team = NULL;
2666 }
2667 __kmp_free_thread(hot_team->t.t_threads[f]);
2668 hot_team->t.t_threads[f] = NULL;
2669 }
2670 hot_team->t.t_nproc = new_nth;
2671 #if KMP_NESTED_HOT_TEAMS
2672 if (thread->th.th_hot_teams) {
2673 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team);
2674 thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2675 }
2676 #endif
2677
2678 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
2679
2680 // Update the t_nproc field in the threads that are still active.
2681 for (f = 0; f < new_nth; f++) {
2682 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL);
2683 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2684 }
2685 // Special flag in case omp_set_num_threads() call
2686 hot_team->t.t_size_changed = -1;
2687 }
2688 }
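// Typical entry into the routine above is the standard API call, which the
// C/Fortran entry layer forwards here. A hedged usage sketch:
//
//   #include <omp.h>
//   #include <stdio.h>
//   int main(void) {
//     omp_set_num_threads(8);
//     #pragma omp parallel
//     #pragma omp single
//     printf("running with %d threads\n", omp_get_num_threads());
//     return 0;
//   }
//
// If the hot team was previously larger, __kmp_set_num_threads also trims it
// right away rather than waiting for the next parallel region, as noted in
// the function body.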
2689
2690 /* Changes max_active_levels */
2691 void __kmp_set_max_active_levels(int gtid, int max_active_levels) {
2692 kmp_info_t *thread;
2693
2694 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread "
2695 "%d = (%d)\n",
2696 gtid, max_active_levels));
2697 KMP_DEBUG_ASSERT(__kmp_init_serial);
2698
2699 // validate max_active_levels
2700 if (max_active_levels < 0) {
2701 KMP_WARNING(ActiveLevelsNegative, max_active_levels);
2702 // We ignore this call if the user has specified a negative value.
2703 // The current setting won't be changed. The last valid setting will be
2704 // used. A warning will be issued (if warnings are allowed as controlled by
2705 // the KMP_WARNINGS env var).
2706 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new "
2707 "max_active_levels for thread %d = (%d)\n",
2708 gtid, max_active_levels));
2709 return;
2710 }
2711 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) {
2712 // it's OK, the max_active_levels is within the valid range: [ 0;
2713 // KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2714 // We allow a zero value. (implementation defined behavior)
2715 } else {
2716 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels,
2717 KMP_MAX_ACTIVE_LEVELS_LIMIT);
2718 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2719 // Current upper limit is MAX_INT. (implementation defined behavior)
2720 // If the input exceeds the upper limit, we correct the input to be the
2721 // upper limit. (implementation defined behavior)
2722 // In practice the flow should never get here while the limit is MAX_INT.
2723 }
2724 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new "
2725 "max_active_levels for thread %d = (%d)\n",
2726 gtid, max_active_levels));
2727
2728 thread = __kmp_threads[gtid];
2729
2730 __kmp_save_internal_controls(thread);
2731
2732 set__max_active_levels(thread, max_active_levels);
2733 }
2734
2735 /* Gets max_active_levels */
2736 int __kmp_get_max_active_levels(int gtid) {
2737 kmp_info_t *thread;
2738
2739 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid));
2740 KMP_DEBUG_ASSERT(__kmp_init_serial);
2741
2742 thread = __kmp_threads[gtid];
2743 KMP_DEBUG_ASSERT(thread->th.th_current_task);
2744 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, "
2745 "curtask_maxaclevel=%d\n",
2746 gtid, thread->th.th_current_task,
2747 thread->th.th_current_task->td_icvs.max_active_levels));
2748 return thread->th.th_current_task->td_icvs.max_active_levels;
2749 }
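// Usage sketch for the max-active-levels setter/getter pair above, via the
// standard API (values shown assume enough threads are available):
//
//   #include <omp.h>
//   #include <stdio.h>
//   int main(void) {
//     omp_set_max_active_levels(2);           // allow two active nesting levels
//     printf("max-active-levels = %d\n", omp_get_max_active_levels());
//     #pragma omp parallel num_threads(2)
//     #pragma omp parallel num_threads(2)     // second level can still be active
//     if (omp_get_thread_num() == 0)
//       printf("level %d has %d threads\n", omp_get_level(),
//              omp_get_num_threads());
//     return 0;
//   }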
2750
2751 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int));
2752 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int));
2753
2754 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2755 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) {
2756 kmp_info_t *thread;
2757 kmp_sched_t orig_kind;
2758 // kmp_team_t *team;
2759
2760 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n",
2761 gtid, (int)kind, chunk));
2762 KMP_DEBUG_ASSERT(__kmp_init_serial);
2763
2764 // Check if the kind parameter is valid, correct if needed.
2765 // Valid parameters should fit in one of two intervals - standard or extended:
2766 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2767 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103
2768 orig_kind = kind;
2769 kind = __kmp_sched_without_mods(kind);
2770
2771 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2772 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) {
2773 // TODO: Hint needs attention in case we change the default schedule.
2774 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind),
2775 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"),
2776 __kmp_msg_null);
2777 kind = kmp_sched_default;
2778 chunk = 0; // ignore chunk value in case of bad kind
2779 }
2780
2781 thread = __kmp_threads[gtid];
2782
2783 __kmp_save_internal_controls(thread);
2784
2785 if (kind < kmp_sched_upper_std) {
2786 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) {
2787 // distinguish static chunked vs. unchunked: the chunk should be invalid to
2788 // indicate an unchunked schedule (which is the default)
2789 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2790 } else {
2791 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2792 __kmp_sch_map[kind - kmp_sched_lower - 1];
2793 }
2794 } else {
2795 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2796 // kmp_sched_lower - 2 ];
2797 thread->th.th_current_task->td_icvs.sched.r_sched_type =
2798 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std -
2799 kmp_sched_lower - 2];
2800 }
2801 __kmp_sched_apply_mods_intkind(
2802 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type));
2803 if (kind == kmp_sched_auto || chunk < 1) {
2804 // ignore parameter chunk for schedule auto
2805 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2806 } else {
2807 thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2808 }
2809 }
2810
2811 /* Gets def_sched_var ICV values */
2812 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) {
2813 kmp_info_t *thread;
2814 enum sched_type th_type;
2815
2816 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid));
2817 KMP_DEBUG_ASSERT(__kmp_init_serial);
2818
2819 thread = __kmp_threads[gtid];
2820
2821 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2822 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) {
2823 case kmp_sch_static:
2824 case kmp_sch_static_greedy:
2825 case kmp_sch_static_balanced:
2826 *kind = kmp_sched_static;
2827 __kmp_sched_apply_mods_stdkind(kind, th_type);
2828 *chunk = 0; // chunk was not set, try to show this fact via zero value
2829 return;
2830 case kmp_sch_static_chunked:
2831 *kind = kmp_sched_static;
2832 break;
2833 case kmp_sch_dynamic_chunked:
2834 *kind = kmp_sched_dynamic;
2835 break;
2836 case kmp_sch_guided_chunked:
2837 case kmp_sch_guided_iterative_chunked:
2838 case kmp_sch_guided_analytical_chunked:
2839 *kind = kmp_sched_guided;
2840 break;
2841 case kmp_sch_auto:
2842 *kind = kmp_sched_auto;
2843 break;
2844 case kmp_sch_trapezoidal:
2845 *kind = kmp_sched_trapezoidal;
2846 break;
2847 #if KMP_STATIC_STEAL_ENABLED
2848 case kmp_sch_static_steal:
2849 *kind = kmp_sched_static_steal;
2850 break;
2851 #endif
2852 default:
2853 KMP_FATAL(UnknownSchedulingType, th_type);
2854 }
2855
2856 __kmp_sched_apply_mods_stdkind(kind, th_type);
2857 *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2858 }
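// Usage sketch for the schedule setter/getter above; schedule(runtime) loops
// then pick up the stored {kind, chunk} pair:
//
//   #include <omp.h>
//   #include <stdio.h>
//   int main(void) {
//     omp_sched_t kind;
//     int chunk;
//     omp_set_schedule(omp_sched_dynamic, 4);
//     omp_get_schedule(&kind, &chunk);        // omp_sched_dynamic, chunk == 4
//     printf("kind=%d chunk=%d\n", (int)kind, chunk);
//     #pragma omp parallel for schedule(runtime)
//     for (int i = 0; i < 100; ++i) {
//       /* iterations dealt out dynamically in chunks of 4 */
//     }
//     return 0;
//   }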
2859
2860 int __kmp_get_ancestor_thread_num(int gtid, int level) {
2861
2862 int ii, dd;
2863 kmp_team_t *team;
2864 kmp_info_t *thr;
2865
2866 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level));
2867 KMP_DEBUG_ASSERT(__kmp_init_serial);
2868
2869 // validate level
2870 if (level == 0)
2871 return 0;
2872 if (level < 0)
2873 return -1;
2874 thr = __kmp_threads[gtid];
2875 team = thr->th.th_team;
2876 ii = team->t.t_level;
2877 if (level > ii)
2878 return -1;
2879
2880 if (thr->th.th_teams_microtask) {
2881 // AC: we are in teams region where multiple nested teams have same level
2882 int tlevel = thr->th.th_teams_level; // the level of the teams construct
2883 if (level <=
2884 tlevel) { // otherwise usual algorithm works (will not touch the teams)
2885 KMP_DEBUG_ASSERT(ii >= tlevel);
2886 // AC: since we have to step over the teams league, artificially
2887 // increase ii
2888 if (ii == tlevel) {
2889 ii += 2; // three teams have same level
2890 } else {
2891 ii++; // two teams have same level
2892 }
2893 }
2894 }
2895
2896 if (ii == level)
2897 return __kmp_tid_from_gtid(gtid);
2898
2899 dd = team->t.t_serialized;
2900 level++;
2901 while (ii > level) {
2902 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2903 }
2904 if ((team->t.t_serialized) && (!dd)) {
2905 team = team->t.t_parent;
2906 continue;
2907 }
2908 if (ii > level) {
2909 team = team->t.t_parent;
2910 dd = team->t.t_serialized;
2911 ii--;
2912 }
2913 }
2914
2915 return (dd > 1) ? (0) : (team->t.t_master_tid);
2916 }
2917
2918 int __kmp_get_team_size(int gtid, int level) {
2919
2920 int ii, dd;
2921 kmp_team_t *team;
2922 kmp_info_t *thr;
2923
2924 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level));
2925 KMP_DEBUG_ASSERT(__kmp_init_serial);
2926
2927 // validate level
2928 if (level == 0)
2929 return 1;
2930 if (level < 0)
2931 return -1;
2932 thr = __kmp_threads[gtid];
2933 team = thr->th.th_team;
2934 ii = team->t.t_level;
2935 if (level > ii)
2936 return -1;
2937
2938 if (thr->th.th_teams_microtask) {
2939 // AC: we are in teams region where multiple nested teams have same level
2940 int tlevel = thr->th.th_teams_level; // the level of the teams construct
2941 if (level <=
2942 tlevel) { // otherwise usual algorithm works (will not touch the teams)
2943 KMP_DEBUG_ASSERT(ii >= tlevel);
2944 // AC: since we have to step over the teams league, artificially
2945 // increase ii
2946 if (ii == tlevel) {
2947 ii += 2; // three teams have same level
2948 } else {
2949 ii++; // two teams have same level
2950 }
2951 }
2952 }
2953
2954 while (ii > level) {
2955 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) {
2956 }
2957 if (team->t.t_serialized && (!dd)) {
2958 team = team->t.t_parent;
2959 continue;
2960 }
2961 if (ii > level) {
2962 team = team->t.t_parent;
2963 ii--;
2964 }
2965 }
2966
2967 return team->t.t_nproc;
2968 }
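// Usage sketch for the two ancestor queries above (assuming nested
// parallelism is enabled so the inner team really gets 3 threads):
//
//   #include <omp.h>
//   #include <stdio.h>
//   int main(void) {
//     omp_set_max_active_levels(2);
//     #pragma omp parallel num_threads(2)
//     #pragma omp parallel num_threads(3)
//     if (omp_get_thread_num() == 0)
//       printf("ancestor tid at level 1 = %d, team sizes: %d outer, %d inner\n",
//              omp_get_ancestor_thread_num(1), omp_get_team_size(1),
//              omp_get_team_size(2));
//     return 0;
//   }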
2969
2970 kmp_r_sched_t __kmp_get_schedule_global() {
2971 // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and
2972 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults
2973 // independently. So one can get the updated schedule here.
2974
2975 kmp_r_sched_t r_sched;
2976
2977 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static,
2978 // __kmp_guided. __kmp_sched should keep original value, so that user can set
2979 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in
2980 // different roots (even in OMP 2.5)
2981 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched);
2982 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched);
2983 if (s == kmp_sch_static) {
2984 // replace STATIC with more detailed schedule (balanced or greedy)
2985 r_sched.r_sched_type = __kmp_static;
2986 } else if (s == kmp_sch_guided_chunked) {
2987 // replace GUIDED with more detailed schedule (iterative or analytical)
2988 r_sched.r_sched_type = __kmp_guided;
2989 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2990 r_sched.r_sched_type = __kmp_sched;
2991 }
2992 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers);
2993
2994 if (__kmp_chunk < KMP_DEFAULT_CHUNK) {
2995 // __kmp_chunk may be wrong here (if it was not ever set)
2996 r_sched.chunk = KMP_DEFAULT_CHUNK;
2997 } else {
2998 r_sched.chunk = __kmp_chunk;
2999 }
3000
3001 return r_sched;
3002 }
3003
3004 /* Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
3005 at least argc number of *t_argv entries for the requested team. */
3006 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) {
3007
3008 KMP_DEBUG_ASSERT(team);
3009 if (!realloc || argc > team->t.t_max_argc) {
3010
3011 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, "
3012 "current entries=%d\n",
3013 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0));
3014 /* if previously allocated heap space for args, free them */
3015 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0])
3016 __kmp_free((void *)team->t.t_argv);
3017
3018 if (argc <= KMP_INLINE_ARGV_ENTRIES) {
3019 /* use unused space in the cache line for arguments */
3020 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
3021 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d "
3022 "argv entries\n",
3023 team->t.t_id, team->t.t_max_argc));
3024 team->t.t_argv = &team->t.t_inline_argv[0];
3025 if (__kmp_storage_map) {
3026 __kmp_print_storage_map_gtid(
3027 -1, &team->t.t_inline_argv[0],
3028 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
3029 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv",
3030 team->t.t_id);
3031 }
3032 } else {
3033 /* allocate space for arguments in the heap */
3034 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1))
3035 ? KMP_MIN_MALLOC_ARGV_ENTRIES
3036 : 2 * argc;
3037 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d "
3038 "argv entries\n",
3039 team->t.t_id, team->t.t_max_argc));
3040 team->t.t_argv =
3041 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc);
3042 if (__kmp_storage_map) {
3043 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0],
3044 &team->t.t_argv[team->t.t_max_argc],
3045 sizeof(void *) * team->t.t_max_argc,
3046 "team_%d.t_argv", team->t.t_id);
3047 }
3048 }
3049 }
3050 }
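// The sizing policy above, restated as a hedged standalone sketch; the
// constants here are placeholders, not the runtime's real KMP_* values:
//
//   #include <stddef.h>
//   enum { INLINE_ENTRIES = 4, MIN_MALLOC_ENTRIES = 100 };
//   static size_t argv_capacity(int argc) {
//     if (argc <= INLINE_ENTRIES)
//       return INLINE_ENTRIES;        // reuse the inline slots in the team
//     if (argc <= MIN_MALLOC_ENTRIES / 2)
//       return MIN_MALLOC_ENTRIES;    // small requests get a fixed-size block
//     return 2u * (size_t)argc;       // otherwise over-allocate to curb reallocs
//   }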
3051
3052 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) {
3053 int i;
3054 int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
3055 team->t.t_threads =
3056 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth);
3057 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate(
3058 sizeof(dispatch_shared_info_t) * num_disp_buff);
3059 team->t.t_dispatch =
3060 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth);
3061 team->t.t_implicit_task_taskdata =
3062 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth);
3063 team->t.t_max_nproc = max_nth;
3064
3065 /* setup dispatch buffers */
3066 for (i = 0; i < num_disp_buff; ++i) {
3067 team->t.t_disp_buffer[i].buffer_index = i;
3068 team->t.t_disp_buffer[i].doacross_buf_idx = i;
3069 }
3070 }
3071
3072 static void __kmp_free_team_arrays(kmp_team_t *team) {
3073 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
3074 int i;
3075 for (i = 0; i < team->t.t_max_nproc; ++i) {
3076 if (team->t.t_dispatch[i].th_disp_buffer != NULL) {
3077 __kmp_free(team->t.t_dispatch[i].th_disp_buffer);
3078 team->t.t_dispatch[i].th_disp_buffer = NULL;
3079 }
3080 }
3081 #if KMP_USE_HIER_SCHED
3082 __kmp_dispatch_free_hierarchies(team);
3083 #endif
3084 __kmp_free(team->t.t_threads);
3085 __kmp_free(team->t.t_disp_buffer);
3086 __kmp_free(team->t.t_dispatch);
3087 __kmp_free(team->t.t_implicit_task_taskdata);
3088 team->t.t_threads = NULL;
3089 team->t.t_disp_buffer = NULL;
3090 team->t.t_dispatch = NULL;
3091 team->t.t_implicit_task_taskdata = 0;
3092 }
3093
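// Grow the team arrays to max_nth entries. Only the t_threads pointers are
// preserved (copied below from the old array); the dispatch buffers and
// implicit task data are allocated fresh by __kmp_allocate_team_arrays.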
3094 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3095 kmp_info_t **oldThreads = team->t.t_threads;
3096
3097 __kmp_free(team->t.t_disp_buffer);
3098 __kmp_free(team->t.t_dispatch);
3099 __kmp_free(team->t.t_implicit_task_taskdata);
3100 __kmp_allocate_team_arrays(team, max_nth);
3101
3102 KMP_MEMCPY(team->t.t_threads, oldThreads,
3103 team->t.t_nproc * sizeof(kmp_info_t *));
3104
3105 __kmp_free(oldThreads);
3106 }
3107
3108 static kmp_internal_control_t __kmp_get_global_icvs(void) {
3109
3110 kmp_r_sched_t r_sched =
3111 __kmp_get_schedule_global(); // get current state of scheduling globals
3112
3113 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0);
3114
3115 kmp_internal_control_t g_icvs = {
3116 0, // int serial_nesting_level; //corresponds to value of th_team_serialized
3117 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic
3118 // adjustment of threads (per thread)
3119 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for
3120 // whether blocktime is explicitly set
3121 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime
3122 #if KMP_USE_MONITOR
3123 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime
3124 // intervals
3125 #endif
3126 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for
3127 // next parallel region (per thread)
3128 // (use a max ub on value if __kmp_parallel_initialize not called yet)
3129 __kmp_cg_max_nth, // int thread_limit;
3130 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control
3131 // for max_active_levels
3132 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule
3133 // {sched,chunk} pair
3134 __kmp_nested_proc_bind.bind_types[0],
3135 __kmp_default_device,
3136 NULL // struct kmp_internal_control *next;
3137 };
3138
3139 return g_icvs;
3140 }
3141
3142 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) {
3143
3144 kmp_internal_control_t gx_icvs;
3145 gx_icvs.serial_nesting_level =
3146 0; // probably =team->t.t_serial like in save_inter_controls
3147 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs);
3148 gx_icvs.next = NULL;
3149
3150 return gx_icvs;
3151 }
3152
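// Set up the two teams owned by a root: a root team of size 1, in which the
// root (uber) thread executes serial code, and a hot team, pre-sized to
// __kmp_dflt_team_nth_ub * 2 slots, which is kept alive and reused across
// parallel regions instead of being torn down at every join.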
3153 static void __kmp_initialize_root(kmp_root_t *root) {
3154 int f;
3155 kmp_team_t *root_team;
3156 kmp_team_t *hot_team;
3157 int hot_team_max_nth;
3158 kmp_r_sched_t r_sched =
3159 __kmp_get_schedule_global(); // get current state of scheduling globals
3160 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3161 KMP_DEBUG_ASSERT(root);
3162 KMP_ASSERT(!root->r.r_begin);
3163
3164 /* setup the root state structure */
3165 __kmp_init_lock(&root->r.r_begin_lock);
3166 root->r.r_begin = FALSE;
3167 root->r.r_active = FALSE;
3168 root->r.r_in_parallel = 0;
3169 root->r.r_blocktime = __kmp_dflt_blocktime;
3170
3171 /* setup the root team for this task */
3172 /* allocate the root team structure */
3173 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n"));
3174
3175 root_team =
3176 __kmp_allocate_team(root,
3177 1, // new_nproc
3178 1, // max_nproc
3179 #if OMPT_SUPPORT
3180 ompt_data_none, // root parallel id
3181 #endif
3182 __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3183 0 // argc
3184 USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3185 );
3186 #if USE_DEBUGGER
3187 // Non-NULL value should be assigned to make the debugger display the root
3188 // team.
3189 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0));
3190 #endif
3191
3192 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team));
3193
3194 root->r.r_root_team = root_team;
3195 root_team->t.t_control_stack_top = NULL;
3196
3197 /* initialize root team */
3198 root_team->t.t_threads[0] = NULL;
3199 root_team->t.t_nproc = 1;
3200 root_team->t.t_serialized = 1;
3201 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3202 root_team->t.t_sched.sched = r_sched.sched;
3203 KA_TRACE(
3204 20,
3205 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3206 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
3207
3208 /* setup the hot team for this task */
3209 /* allocate the hot team structure */
3210 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n"));
3211
3212 hot_team =
3213 __kmp_allocate_team(root,
3214 1, // new_nproc
3215 __kmp_dflt_team_nth_ub * 2, // max_nproc
3216 #if OMPT_SUPPORT
3217 ompt_data_none, // root parallel id
3218 #endif
3219 __kmp_nested_proc_bind.bind_types[0], &r_icvs,
3220 0 // argc
3221 USE_NESTED_HOT_ARG(NULL) // master thread is unknown
3222 );
3223 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team));
3224
3225 root->r.r_hot_team = hot_team;
3226 root_team->t.t_control_stack_top = NULL;
3227
3228 /* first-time initialization */
3229 hot_team->t.t_parent = root_team;
3230
3231 /* initialize hot team */
3232 hot_team_max_nth = hot_team->t.t_max_nproc;
3233 for (f = 0; f < hot_team_max_nth; ++f) {
3234 hot_team->t.t_threads[f] = NULL;
3235 }
3236 hot_team->t.t_nproc = 1;
3237 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3238 hot_team->t.t_sched.sched = r_sched.sched;
3239 hot_team->t.t_size_changed = 0;
3240 }
3241
3242 #ifdef KMP_DEBUG
3243
3244 typedef struct kmp_team_list_item {
3245 kmp_team_p const *entry;
3246 struct kmp_team_list_item *next;
3247 } kmp_team_list_item_t;
3248 typedef kmp_team_list_item_t *kmp_team_list_t;
3249
3250 static void __kmp_print_structure_team_accum( // Add team to list of teams.
3251 kmp_team_list_t list, // List of teams.
3252 kmp_team_p const *team // Team to add.
3253 ) {
3254
3255 // List must terminate with item where both entry and next are NULL.
3256 // Team is added to the list only once.
3257 // List is sorted in ascending order by team id.
3258 // Team id is *not* a key.
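  // Example (debug-only helper): given the list [team 2, team 7, sentinel]
  // (the sentinel item has entry == NULL and next == NULL), adding team 5
  // first recurses on its parent and next-pool links, then splices a new item
  // in front of team 7, yielding [2, 5, 7, sentinel]. The "*item = *l" copy
  // below is what keeps the sentinel at the tail.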
3259
3260 kmp_team_list_t l;
3261
3262 KMP_DEBUG_ASSERT(list != NULL);
3263 if (team == NULL) {
3264 return;
3265 }
3266
3267 __kmp_print_structure_team_accum(list, team->t.t_parent);
3268 __kmp_print_structure_team_accum(list, team->t.t_next_pool);
3269
3270 // Search list for the team.
3271 l = list;
3272 while (l->next != NULL && l->entry != team) {
3273 l = l->next;
3274 }
3275 if (l->next != NULL) {
3276 return; // Team has been added before, exit.
3277 }
3278
3279 // Team is not found. Search list again for insertion point.
3280 l = list;
3281 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) {
3282 l = l->next;
3283 }
3284
3285 // Insert team.
3286 {
3287 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(
3288 sizeof(kmp_team_list_item_t));
3289 *item = *l;
3290 l->entry = team;
3291 l->next = item;
3292 }
3293 }
3294
3295 static void __kmp_print_structure_team(char const *title,
3296                                        kmp_team_p const *team) {
3298 __kmp_printf("%s", title);
3299 if (team != NULL) {
3300 __kmp_printf("%2x %p\n", team->t.t_id, team);
3301 } else {
3302 __kmp_printf(" - (nil)\n");
3303 }
3304 }
3305
3306 static void __kmp_print_structure_thread(char const *title,
3307 kmp_info_p const *thread) {
3308 __kmp_printf("%s", title);
3309 if (thread != NULL) {
3310 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread);
3311 } else {
3312 __kmp_printf(" - (nil)\n");
3313 }
3314 }
3315
3316 void __kmp_print_structure(void) {
3317
3318 kmp_team_list_t list;
3319
3320 // Initialize list of teams.
3321 list =
3322 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t));
3323 list->entry = NULL;
3324 list->next = NULL;
3325
3326 __kmp_printf("\n------------------------------\nGlobal Thread "
3327 "Table\n------------------------------\n");
3328 {
3329 int gtid;
3330 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3331 __kmp_printf("%2d", gtid);
3332 if (__kmp_threads != NULL) {
3333 __kmp_printf(" %p", __kmp_threads[gtid]);
3334 }
3335 if (__kmp_root != NULL) {
3336 __kmp_printf(" %p", __kmp_root[gtid]);
3337 }
3338 __kmp_printf("\n");
3339 }
3340 }
3341
3342 // Print out __kmp_threads array.
3343 __kmp_printf("\n------------------------------\nThreads\n--------------------"
3344 "----------\n");
3345 if (__kmp_threads != NULL) {
3346 int gtid;
3347 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3348 kmp_info_t const *thread = __kmp_threads[gtid];
3349 if (thread != NULL) {
3350 __kmp_printf("GTID %2d %p:\n", gtid, thread);
3351 __kmp_printf(" Our Root: %p\n", thread->th.th_root);
3352 __kmp_print_structure_team(" Our Team: ", thread->th.th_team);
3353 __kmp_print_structure_team(" Serial Team: ",
3354 thread->th.th_serial_team);
3355 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc);
3356 __kmp_print_structure_thread(" Master: ",
3357 thread->th.th_team_master);
3358 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized);
3359 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc);
3360 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind);
3361 __kmp_print_structure_thread(" Next in pool: ",
3362 thread->th.th_next_pool);
3363 __kmp_printf("\n");
3364 __kmp_print_structure_team_accum(list, thread->th.th_team);
3365 __kmp_print_structure_team_accum(list, thread->th.th_serial_team);
3366 }
3367 }
3368 } else {
3369 __kmp_printf("Threads array is not allocated.\n");
3370 }
3371
3372 // Print out __kmp_root array.
3373 __kmp_printf("\n------------------------------\nUbers\n----------------------"
3374 "--------\n");
3375 if (__kmp_root != NULL) {
3376 int gtid;
3377 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) {
3378 kmp_root_t const *root = __kmp_root[gtid];
3379 if (root != NULL) {
3380 __kmp_printf("GTID %2d %p:\n", gtid, root);
3381 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team);
3382 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team);
3383 __kmp_print_structure_thread(" Uber Thread: ",
3384 root->r.r_uber_thread);
3385 __kmp_printf(" Active?: %2d\n", root->r.r_active);
3386 __kmp_printf(" In Parallel: %2d\n",
3387 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel));
3388 __kmp_printf("\n");
3389 __kmp_print_structure_team_accum(list, root->r.r_root_team);
3390 __kmp_print_structure_team_accum(list, root->r.r_hot_team);
3391 }
3392 }
3393 } else {
3394 __kmp_printf("Ubers array is not allocated.\n");
3395 }
3396
3397 __kmp_printf("\n------------------------------\nTeams\n----------------------"
3398 "--------\n");
3399 while (list->next != NULL) {
3400 kmp_team_p const *team = list->entry;
3401 int i;
3402 __kmp_printf("Team %2x %p:\n", team->t.t_id, team);
3403 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent);
3404 __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid);
3405 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc);
3406 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized);
3407 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc);
3408 for (i = 0; i < team->t.t_nproc; ++i) {
3409 __kmp_printf(" Thread %2d: ", i);
3410 __kmp_print_structure_thread("", team->t.t_threads[i]);
3411 }
3412 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool);
3413 __kmp_printf("\n");
3414 list = list->next;
3415 }
3416
3417 // Print out __kmp_thread_pool and __kmp_team_pool.
3418 __kmp_printf("\n------------------------------\nPools\n----------------------"
3419 "--------\n");
3420 __kmp_print_structure_thread("Thread pool: ",
3421 CCAST(kmp_info_t *, __kmp_thread_pool));
3422 __kmp_print_structure_team("Team pool: ",
3423 CCAST(kmp_team_t *, __kmp_team_pool));
3424 __kmp_printf("\n");
3425
3426 // Free team list.
3427 while (list != NULL) {
3428 kmp_team_list_item_t *item = list;
3429 list = list->next;
3430 KMP_INTERNAL_FREE(item);
3431 }
3432 }
3433
3434 #endif
3435
3436 //---------------------------------------------------------------------------
3437 // Stuff for per-thread fast random number generator
3438 // Table of primes
3439 static const unsigned __kmp_primes[] = {
3440 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877,
3441 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3442 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201,
3443 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3444 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7,
3445 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3446 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45,
3447 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3448 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363,
3449 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3450 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f};
3451
3452 //---------------------------------------------------------------------------
3453 // __kmp_get_random: Get a random number using a linear congruential method.
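// Each thread carries its own generator state: th_x is the current value and
// th_a is the multiplier picked from __kmp_primes in __kmp_init_random below.
// The update is x_{n+1} = th_a * x_n + 1, computed with unsigned wraparound,
// and only the high 16 bits are returned, since the low-order bits of such a
// generator have short periods.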
3454 unsigned short __kmp_get_random(kmp_info_t *thread) {
3455 unsigned x = thread->th.th_x;
3456 unsigned short r = x >> 16;
3457
3458 thread->th.th_x = x * thread->th.th_a + 1;
3459
3460 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3461 thread->th.th_info.ds.ds_tid, r));
3462
3463 return r;
3464 }
3465 //--------------------------------------------------------
3466 // __kmp_init_random: Initialize a random number generator
3467 void __kmp_init_random(kmp_info_t *thread) {
3468 unsigned seed = thread->th.th_info.ds.ds_tid;
3469
3470 thread->th.th_a =
3471 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))];
3472 thread->th.th_x = (seed + 1) * thread->th.th_a + 1;
3473 KA_TRACE(30,
3474 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a));
3475 }
3476
3477 #if KMP_OS_WINDOWS
3478 /* reclaim array entries for root threads that are already dead, returns number
3479 * reclaimed */
3480 static int __kmp_reclaim_dead_roots(void) {
3481 int i, r = 0;
3482
3483 for (i = 0; i < __kmp_threads_capacity; ++i) {
3484 if (KMP_UBER_GTID(i) &&
3485 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3486 !__kmp_root[i]
3487 ->r.r_active) { // AC: reclaim only roots died in non-active state
3488 r += __kmp_unregister_root_other_thread(i);
3489 }
3490 }
3491 return r;
3492 }
3493 #endif
3494
3495 /* This function attempts to create free entries in __kmp_threads and
3496 __kmp_root, and returns the number of free entries generated.
3497
3498 For Windows* OS static library, the first mechanism used is to reclaim array
3499 entries for root threads that are already dead.
3500
3501    On all platforms, expansion is attempted on the arrays __kmp_threads and
3502 __kmp_root, with appropriate update to __kmp_threads_capacity. Array
3503 capacity is increased by doubling with clipping to __kmp_tp_capacity, if
3504 threadprivate cache array has been created. Synchronization with
3505 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3506
3507 After any dead root reclamation, if the clipping value allows array expansion
3508 to result in the generation of a total of nNeed free slots, the function does
3509 that expansion. If not, nothing is done beyond the possible initial root
3510 thread reclamation.
3511
3512 If any argument is negative, the behavior is undefined. */
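/* Worked example with hypothetical numbers: if __kmp_threads_capacity is 32
   and nNeed is 40, minimumRequiredCapacity is 72 and the capacity doubles
   32 -> 64 -> 128 (clipped to __kmp_sys_max_nth if that is smaller). Both
   arrays are copied into a single new allocation and added becomes
   128 - 32 = 96 newly available slots. */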
3513 static int __kmp_expand_threads(int nNeed) {
3514 int added = 0;
3515 int minimumRequiredCapacity;
3516 int newCapacity;
3517 kmp_info_t **newThreads;
3518 kmp_root_t **newRoot;
3519
3520 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so
3521 // resizing __kmp_threads does not need additional protection if foreign
3522 // threads are present
3523
3524 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB
3525 /* only for Windows static library */
3526 /* reclaim array entries for root threads that are already dead */
3527 added = __kmp_reclaim_dead_roots();
3528
3529 if (nNeed) {
3530 nNeed -= added;
3531 if (nNeed < 0)
3532 nNeed = 0;
3533 }
3534 #endif
3535 if (nNeed <= 0)
3536 return added;
3537
3538 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If
3539 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the
3540 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become
3541 // > __kmp_max_nth in one of two ways:
3542 //
3543 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0]
3544 // may not be reused by another thread, so we may need to increase
3545 // __kmp_threads_capacity to __kmp_max_nth + 1.
3546 //
3547 // 2) New foreign root(s) are encountered. We always register new foreign
3548 // roots. This may cause a smaller # of threads to be allocated at
3549 // subsequent parallel regions, but the worker threads hang around (and
3550 // eventually go to sleep) and need slots in the __kmp_threads[] array.
3551 //
3552 // Anyway, that is the reason for moving the check to see if
3553 // __kmp_max_nth was exceeded into __kmp_reserve_threads()
3554 // instead of having it performed here. -BB
3555
3556 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity);
3557
3558 /* compute expansion headroom to check if we can expand */
3559 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) {
3560 /* possible expansion too small -- give up */
3561 return added;
3562 }
3563 minimumRequiredCapacity = __kmp_threads_capacity + nNeed;
3564
3565 newCapacity = __kmp_threads_capacity;
3566 do {
3567 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1)
3568 : __kmp_sys_max_nth;
3569 } while (newCapacity < minimumRequiredCapacity);
3570 newThreads = (kmp_info_t **)__kmp_allocate(
3571 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3572 newRoot =
3573 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3574 KMP_MEMCPY(newThreads, __kmp_threads,
3575 __kmp_threads_capacity * sizeof(kmp_info_t *));
3576 KMP_MEMCPY(newRoot, __kmp_root,
3577 __kmp_threads_capacity * sizeof(kmp_root_t *));
3578
3579 kmp_info_t **temp_threads = __kmp_threads;
3580 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3581 *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3582 __kmp_free(temp_threads);
3583 added += newCapacity - __kmp_threads_capacity;
3584 *(volatile int *)&__kmp_threads_capacity = newCapacity;
3585
3586 if (newCapacity > __kmp_tp_capacity) {
3587 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3588 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3589 __kmp_threadprivate_resize_cache(newCapacity);
3590 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3591 *(volatile int *)&__kmp_tp_capacity = newCapacity;
3592 }
3593 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3594 }
3595
3596 return added;
3597 }
3598
3599 /* Register the current thread as a root thread and obtain our gtid. We must
3600    have the __kmp_initz_lock held at this point. Argument TRUE only if we are
3601    the thread that calls from __kmp_do_serial_initialize(). */
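/* The overall sequence below: find (or create) a free gtid slot, bump the
   global thread counts, allocate the kmp_root_t and its uber thread if this
   root has not been seen before, give the uber thread a reserve serial team,
   publish it in __kmp_threads, initialize barrier and affinity state, and
   attach a fresh contention-group root that carries the thread_limit ICV. */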
3602 int __kmp_register_root(int initial_thread) {
3603 kmp_info_t *root_thread;
3604 kmp_root_t *root;
3605 int gtid;
3606 int capacity;
3607 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3608 KA_TRACE(20, ("__kmp_register_root: entered\n"));
3609 KMP_MB();
3610
3611 /* 2007-03-02:
3612      If the initial thread has not invoked the OpenMP RTL yet, and this thread
3613      is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
3614      condition does not work as expected -- it may return false (meaning there
3615      is at least one empty slot in the __kmp_threads array), but it is possible
3616      that the only free slot is #0, which is reserved for the initial thread
3617      and so cannot be used for this one. The following code works around this bug.
3618
3619      However, the right solution seems to be not to reserve slot #0 for the
3620      initial thread, because:
3621      (1) there is no magic in slot #0,
3622      (2) we cannot detect the initial thread reliably (the first thread that
3623      does serial initialization may not be the real initial thread).
3624 */
3625 capacity = __kmp_threads_capacity;
3626 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3627 --capacity;
3628 }
3629
3630 /* see if there are too many threads */
3631 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3632 if (__kmp_tp_cached) {
3633 __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3634 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3635 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3636 } else {
3637 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3638 __kmp_msg_null);
3639 }
3640 }
3641
3642 /* find an available thread slot */
3643 /* Don't reassign the zero slot since we need that to only be used by initial
3644 thread */
3645 for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
3646 gtid++)
3647 ;
3648 KA_TRACE(1,
3649 ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
3650 KMP_ASSERT(gtid < __kmp_threads_capacity);
3651
3652 /* update global accounting */
3653 __kmp_all_nth++;
3654 TCW_4(__kmp_nth, __kmp_nth + 1);
3655
3656 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
3657 // numbers of procs, and method #2 (keyed API call) for higher numbers.
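  // Method #1 resolves a gtid by matching the caller's stack address against
  // the known thread stacks (cheap while the thread count is small); method
  // #2 uses the thread-specific-data key set via __kmp_gtid_set_specific,
  // whose cost does not grow with the number of threads.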
3658 if (__kmp_adjust_gtid_mode) {
3659 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
3660 if (TCR_4(__kmp_gtid_mode) != 2) {
3661 TCW_4(__kmp_gtid_mode, 2);
3662 }
3663 } else {
3664 if (TCR_4(__kmp_gtid_mode) != 1) {
3665 TCW_4(__kmp_gtid_mode, 1);
3666 }
3667 }
3668 }
3669
3670 #ifdef KMP_ADJUST_BLOCKTIME
3671 /* Adjust blocktime to zero if necessary */
3672 /* Middle initialization might not have occurred yet */
3673 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
3674 if (__kmp_nth > __kmp_avail_proc) {
3675 __kmp_zero_bt = TRUE;
3676 }
3677 }
3678 #endif /* KMP_ADJUST_BLOCKTIME */
3679
3680 /* setup this new hierarchy */
3681 if (!(root = __kmp_root[gtid])) {
3682 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t));
3683 KMP_DEBUG_ASSERT(!root->r.r_root_team);
3684 }
3685
3686 #if KMP_STATS_ENABLED
3687 // Initialize stats as soon as possible (right after gtid assignment).
3688 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3689 __kmp_stats_thread_ptr->startLife();
3690 KMP_SET_THREAD_STATE(SERIAL_REGION);
3691 KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3692 #endif
3693 __kmp_initialize_root(root);
3694
3695 /* setup new root thread structure */
3696 if (root->r.r_uber_thread) {
3697 root_thread = root->r.r_uber_thread;
3698 } else {
3699 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
3700 if (__kmp_storage_map) {
3701 __kmp_print_thread_storage_map(root_thread, gtid);
3702 }
3703 root_thread->th.th_info.ds.ds_gtid = gtid;
3704 #if OMPT_SUPPORT
3705 root_thread->th.ompt_thread_info.thread_data = ompt_data_none;
3706 #endif
3707 root_thread->th.th_root = root;
3708 if (__kmp_env_consistency_check) {
3709 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid);
3710 }
3711 #if USE_FAST_MEMORY
3712 __kmp_initialize_fast_memory(root_thread);
3713 #endif /* USE_FAST_MEMORY */
3714
3715 #if KMP_USE_BGET
3716 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL);
3717 __kmp_initialize_bget(root_thread);
3718 #endif
3719 __kmp_init_random(root_thread); // Initialize random number generator
3720 }
3721
3722 /* setup the serial team held in reserve by the root thread */
3723 if (!root_thread->th.th_serial_team) {
3724 kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3725 KF_TRACE(10, ("__kmp_register_root: before serial_team\n"));
3726 root_thread->th.th_serial_team = __kmp_allocate_team(
3727 root, 1, 1,
3728 #if OMPT_SUPPORT
3729 ompt_data_none, // root parallel id
3730 #endif
3731 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL));
3732 }
3733 KMP_ASSERT(root_thread->th.th_serial_team);
3734 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n",
3735 root_thread->th.th_serial_team));
3736
3737 /* drop root_thread into place */
3738 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3739
3740 root->r.r_root_team->t.t_threads[0] = root_thread;
3741 root->r.r_hot_team->t.t_threads[0] = root_thread;
3742 root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3743 // AC: the team created in reserve, not for execution (it is unused for now).
3744 root_thread->th.th_serial_team->t.t_serialized = 0;
3745 root->r.r_uber_thread = root_thread;
3746
3747 /* initialize the thread, get it ready to go */
3748 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
3749 TCW_4(__kmp_init_gtid, TRUE);
3750
3751 /* prepare the master thread for get_gtid() */
3752 __kmp_gtid_set_specific(gtid);
3753
3754 #if USE_ITT_BUILD
3755 __kmp_itt_thread_name(gtid);
3756 #endif /* USE_ITT_BUILD */
3757
3758 #ifdef KMP_TDATA_GTID
3759 __kmp_gtid = gtid;
3760 #endif
3761 __kmp_create_worker(gtid, root_thread, __kmp_stksize);
3762 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);
3763
3764 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
3765 "plain=%u\n",
3766 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
3767 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3768 KMP_INIT_BARRIER_STATE));
3769 { // Initialize barrier data.
3770 int b;
3771 for (b = 0; b < bs_last_barrier; ++b) {
3772 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3773 #if USE_DEBUGGER
3774 root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3775 #endif
3776 }
3777 }
3778 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3779 KMP_INIT_BARRIER_STATE);
3780
3781 #if KMP_AFFINITY_SUPPORTED
3782 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3783 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3784 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3785 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3786 if (TCR_4(__kmp_init_middle)) {
3787 __kmp_affinity_set_init_mask(gtid, TRUE);
3788 }
3789 #endif /* KMP_AFFINITY_SUPPORTED */
3790 root_thread->th.th_def_allocator = __kmp_def_allocator;
3791 root_thread->th.th_prev_level = 0;
3792 root_thread->th.th_prev_num_threads = 1;
3793
3794 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3795 tmp->cg_root = root_thread;
3796 tmp->cg_thread_limit = __kmp_cg_max_nth;
3797 tmp->cg_nthreads = 1;
3798 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3799 " cg_nthreads init to 1\n",
3800 root_thread, tmp));
3801 tmp->up = NULL;
3802 root_thread->th.th_cg_roots = tmp;
3803
3804 __kmp_root_counter++;
3805
3806 #if OMPT_SUPPORT
3807 if (!initial_thread && ompt_enabled.enabled) {
3808
3809 kmp_info_t *root_thread = ompt_get_thread();
3810
3811 ompt_set_thread_state(root_thread, ompt_state_overhead);
3812
3813 if (ompt_enabled.ompt_callback_thread_begin) {
3814 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3815 ompt_thread_initial, __ompt_get_thread_data_internal());
3816 }
3817 ompt_data_t *task_data;
3818 ompt_data_t *parallel_data;
3819     __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3820 if (ompt_enabled.ompt_callback_implicit_task) {
3821 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3822 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3823 }
3824
3825 ompt_set_thread_state(root_thread, ompt_state_work_serial);
3826 }
3827 #endif
3828
3829 KMP_MB();
3830 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3831
3832 return gtid;
3833 }
3834
3835 #if KMP_NESTED_HOT_TEAMS
3836 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3837 const int max_level) {
3838 int i, n, nth;
3839 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3840 if (!hot_teams || !hot_teams[level].hot_team) {
3841 return 0;
3842 }
3843 KMP_DEBUG_ASSERT(level < max_level);
3844 kmp_team_t *team = hot_teams[level].hot_team;
3845 nth = hot_teams[level].hot_team_nth;
3846 n = nth - 1; // master is not freed
3847 if (level < max_level - 1) {
3848 for (i = 0; i < nth; ++i) {
3849 kmp_info_t *th = team->t.t_threads[i];
3850 n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3851 if (i > 0 && th->th.th_hot_teams) {
3852 __kmp_free(th->th.th_hot_teams);
3853 th->th.th_hot_teams = NULL;
3854 }
3855 }
3856 }
3857 __kmp_free_team(root, team, NULL);
3858 return n;
3859 }
3860 #endif
3861
3862 // Resets a root thread and clears its root and hot teams.
3863 // Returns the number of __kmp_threads entries directly and indirectly freed.
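// The count starts at the hot team's size and grows by any threads released
// from nested hot teams; the uber thread itself is reaped here rather than
// returned to the thread pool.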
3864 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3865 kmp_team_t *root_team = root->r.r_root_team;
3866 kmp_team_t *hot_team = root->r.r_hot_team;
3867 int n = hot_team->t.t_nproc;
3868 int i;
3869
3870 KMP_DEBUG_ASSERT(!root->r.r_active);
3871
3872 root->r.r_root_team = NULL;
3873 root->r.r_hot_team = NULL;
3874 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3875 // before call to __kmp_free_team().
3876 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3877 #if KMP_NESTED_HOT_TEAMS
3878 if (__kmp_hot_teams_max_level >
3879 0) { // need to free nested hot teams and their threads if any
3880 for (i = 0; i < hot_team->t.t_nproc; ++i) {
3881 kmp_info_t *th = hot_team->t.t_threads[i];
3882 if (__kmp_hot_teams_max_level > 1) {
3883 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3884 }
3885 if (th->th.th_hot_teams) {
3886 __kmp_free(th->th.th_hot_teams);
3887 th->th.th_hot_teams = NULL;
3888 }
3889 }
3890 }
3891 #endif
3892 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3893
3894 // Before we can reap the thread, we need to make certain that all other
3895 // threads in the teams that had this root as ancestor have stopped trying to
3896 // steal tasks.
3897 if (__kmp_tasking_mode != tskm_immediate_exec) {
3898 __kmp_wait_to_unref_task_teams();
3899 }
3900
3901 #if KMP_OS_WINDOWS
3902 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3903 KA_TRACE(
3904 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3905 "\n",
3906 (LPVOID) & (root->r.r_uber_thread->th),
3907 root->r.r_uber_thread->th.th_info.ds.ds_thread));
3908 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3909 #endif /* KMP_OS_WINDOWS */
3910
3911 #if OMPT_SUPPORT
3912 ompt_data_t *task_data;
3913 ompt_data_t *parallel_data;
3914   __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL);
3915 if (ompt_enabled.ompt_callback_implicit_task) {
3916 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3917 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3918 }
3919 if (ompt_enabled.ompt_callback_thread_end) {
3920 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3921 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3922 }
3923 #endif
3924
3925 TCW_4(__kmp_nth,
3926 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3927 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3928 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3929 " to %d\n",
3930 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3931 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3932 if (i == 1) {
3933 // need to free contention group structure
3934 KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3935 root->r.r_uber_thread->th.th_cg_roots->cg_root);
3936 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3937 __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3938 root->r.r_uber_thread->th.th_cg_roots = NULL;
3939 }
3940 __kmp_reap_thread(root->r.r_uber_thread, 1);
3941
3942   // We cannot put the root thread into __kmp_thread_pool, so we have to reap
3943   // it instead of freeing it.
3944 root->r.r_uber_thread = NULL;
3945 /* mark root as no longer in use */
3946 root->r.r_begin = FALSE;
3947
3948 return n;
3949 }
3950
3951 void __kmp_unregister_root_current_thread(int gtid) {
3952 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid));
3953 /* this lock should be ok, since unregister_root_current_thread is never
3954 called during an abort, only during a normal close. furthermore, if you
3955 have the forkjoin lock, you should never try to get the initz lock */
3956 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3957 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
3958 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, "
3959 "exiting T#%d\n",
3960 gtid));
3961 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3962 return;
3963 }
3964 kmp_root_t *root = __kmp_root[gtid];
3965
3966 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
3967 KMP_ASSERT(KMP_UBER_GTID(gtid));
3968 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
3969 KMP_ASSERT(root->r.r_active == FALSE);
3970
3971 KMP_MB();
3972
3973 kmp_info_t *thread = __kmp_threads[gtid];
3974 kmp_team_t *team = thread->th.th_team;
3975 kmp_task_team_t *task_team = thread->th.th_task_team;
3976
3977 // we need to wait for the proxy tasks before finishing the thread
3978 if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) {
3979 #if OMPT_SUPPORT
3980 // the runtime is shutting down so we won't report any events
3981 thread->th.ompt_thread_info.state = ompt_state_undefined;
3982 #endif
3983 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3984 }
3985
3986 __kmp_reset_root(gtid, root);
3987
3988 KMP_MB();
3989 KC_TRACE(10,
3990 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid));
3991
3992 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3993 }
3994
3995 #if KMP_OS_WINDOWS
3996 /* __kmp_forkjoin_lock must be already held
3997 Unregisters a root thread that is not the current thread. Returns the number
3998 of __kmp_threads entries freed as a result. */
3999 static int __kmp_unregister_root_other_thread(int gtid) {
4000 kmp_root_t *root = __kmp_root[gtid];
4001 int r;
4002
4003 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid));
4004 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
4005 KMP_ASSERT(KMP_UBER_GTID(gtid));
4006 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root);
4007 KMP_ASSERT(root->r.r_active == FALSE);
4008
4009 r = __kmp_reset_root(gtid, root);
4010 KC_TRACE(10,
4011 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid));
4012 return r;
4013 }
4014 #endif
4015
4016 #if KMP_DEBUG
4017 void __kmp_task_info() {
4018
4019 kmp_int32 gtid = __kmp_entry_gtid();
4020 kmp_int32 tid = __kmp_tid_from_gtid(gtid);
4021 kmp_info_t *this_thr = __kmp_threads[gtid];
4022 kmp_team_t *steam = this_thr->th.th_serial_team;
4023 kmp_team_t *team = this_thr->th.th_team;
4024
4025 __kmp_printf(
4026 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p "
4027 "ptask=%p\n",
4028 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task,
4029 team->t.t_implicit_task_taskdata[tid].td_parent);
4030 }
4031 #endif // KMP_DEBUG
4032
4033 /* TODO optimize with one big memclr, take out what isn't needed, split
4034 responsibility to workers as much as possible, and delay initialization of
4035 features as much as possible */
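/* Initialization covers: the thread's cached view of the team (nproc, master,
   serialization level), its implicit task and ICVs, its contention-group root
   (shared with the master's unless already set), the per-thread dynamic
   dispatch buffers, and the task-state memo stack used by the tasking layer. */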
4036 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
4037 int tid, int gtid) {
4038 /* this_thr->th.th_info.ds.ds_gtid is setup in
4039 kmp_allocate_thread/create_worker.
4040 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
4041 kmp_info_t *master = team->t.t_threads[0];
4042 KMP_DEBUG_ASSERT(this_thr != NULL);
4043 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team);
4044 KMP_DEBUG_ASSERT(team);
4045 KMP_DEBUG_ASSERT(team->t.t_threads);
4046 KMP_DEBUG_ASSERT(team->t.t_dispatch);
4047 KMP_DEBUG_ASSERT(master);
4048 KMP_DEBUG_ASSERT(master->th.th_root);
4049
4050 KMP_MB();
4051
4052 TCW_SYNC_PTR(this_thr->th.th_team, team);
4053
4054 this_thr->th.th_info.ds.ds_tid = tid;
4055 this_thr->th.th_set_nproc = 0;
4056 if (__kmp_tasking_mode != tskm_immediate_exec)
4057 // When tasking is possible, threads are not safe to reap until they are
4058 // done tasking; this will be set when tasking code is exited in wait
4059 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4060 else // no tasking --> always safe to reap
4061 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4062 this_thr->th.th_set_proc_bind = proc_bind_default;
4063 #if KMP_AFFINITY_SUPPORTED
4064 this_thr->th.th_new_place = this_thr->th.th_current_place;
4065 #endif
4066 this_thr->th.th_root = master->th.th_root;
4067
4068 /* setup the thread's cache of the team structure */
4069 this_thr->th.th_team_nproc = team->t.t_nproc;
4070 this_thr->th.th_team_master = master;
4071 this_thr->th.th_team_serialized = team->t.t_serialized;
4072 TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4073
4074 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata);
4075
4076 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4077 tid, gtid, this_thr, this_thr->th.th_current_task));
4078
4079 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr,
4080 team, tid, TRUE);
4081
4082 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4083 tid, gtid, this_thr, this_thr->th.th_current_task));
4084 // TODO: Initialize ICVs from parent; GEH - isn't that already done in
4085 // __kmp_initialize_team()?
4086
4087 /* TODO no worksharing in speculative threads */
4088 this_thr->th.th_dispatch = &team->t.t_dispatch[tid];
4089
4090 this_thr->th.th_local.this_construct = 0;
4091
4092 if (!this_thr->th.th_pri_common) {
4093 this_thr->th.th_pri_common =
4094 (struct common_table *)__kmp_allocate(sizeof(struct common_table));
4095 if (__kmp_storage_map) {
4096 __kmp_print_storage_map_gtid(
4097 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4098 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid);
4099 }
4100 this_thr->th.th_pri_head = NULL;
4101 }
4102
4103 if (this_thr != master && // Master's CG root is initialized elsewhere
4104 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set
4105 // Make new thread's CG root same as master's
4106 KMP_DEBUG_ASSERT(master->th.th_cg_roots);
4107 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots;
4108 if (tmp) {
4109 // worker changes CG, need to check if old CG should be freed
4110 int i = tmp->cg_nthreads--;
4111 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads"
4112 " on node %p of thread %p to %d\n",
4113 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads));
4114 if (i == 1) {
4115 __kmp_free(tmp); // last thread left CG --> free it
4116 }
4117 }
4118 this_thr->th.th_cg_roots = master->th.th_cg_roots;
4119 // Increment new thread's CG root's counter to add the new thread
4120 this_thr->th.th_cg_roots->cg_nthreads++;
4121 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on"
4122 " node %p of thread %p to %d\n",
4123 this_thr, this_thr->th.th_cg_roots,
4124 this_thr->th.th_cg_roots->cg_root,
4125 this_thr->th.th_cg_roots->cg_nthreads));
4126 this_thr->th.th_current_task->td_icvs.thread_limit =
4127 this_thr->th.th_cg_roots->cg_thread_limit;
4128 }
4129
4130 /* Initialize dynamic dispatch */
4131 {
4132 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4133 // Use team max_nproc since this will never change for the team.
4134 size_t disp_size =
4135 sizeof(dispatch_private_info_t) *
4136 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers);
4137 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid,
4138 team->t.t_max_nproc));
4139 KMP_ASSERT(dispatch);
4140 KMP_DEBUG_ASSERT(team->t.t_dispatch);
4141 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]);
4142
4143 dispatch->th_disp_index = 0;
4144 dispatch->th_doacross_buf_idx = 0;
4145 if (!dispatch->th_disp_buffer) {
4146 dispatch->th_disp_buffer =
4147 (dispatch_private_info_t *)__kmp_allocate(disp_size);
4148
4149 if (__kmp_storage_map) {
4150 __kmp_print_storage_map_gtid(
4151 gtid, &dispatch->th_disp_buffer[0],
4152 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1
4153 ? 1
4154 : __kmp_dispatch_num_buffers],
4155 disp_size, "th_%d.th_dispatch.th_disp_buffer "
4156 "(team_%d.t_dispatch[%d].th_disp_buffer)",
4157 gtid, team->t.t_id, gtid);
4158 }
4159 } else {
4160 memset(&dispatch->th_disp_buffer[0], '\0', disp_size);
4161 }
4162
4163 dispatch->th_dispatch_pr_current = 0;
4164 dispatch->th_dispatch_sh_current = 0;
4165
4166 dispatch->th_deo_fcn = 0; /* ORDERED */
4167 dispatch->th_dxo_fcn = 0; /* END ORDERED */
4168 }
4169
4170 this_thr->th.th_next_pool = NULL;
4171
4172 if (!this_thr->th.th_task_state_memo_stack) {
4173 size_t i;
4174 this_thr->th.th_task_state_memo_stack =
4175 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8));
4176 this_thr->th.th_task_state_top = 0;
4177 this_thr->th.th_task_state_stack_sz = 4;
4178 for (i = 0; i < this_thr->th.th_task_state_stack_sz;
4179 ++i) // zero init the stack
4180 this_thr->th.th_task_state_memo_stack[i] = 0;
4181 }
4182
4183 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here);
4184 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0);
4185
4186 KMP_MB();
4187 }
4188
4189 /* Allocate a new thread for the requesting team. This is only called from
4190    within a fork/join critical section. We will first try to get an available
4191    thread from the thread pool. If none is available, we will fork a new one,
4192    assuming we are able to create a new one. This should be assured, as the
4193    caller should check on this first. */
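/* Two paths below: (1) pop a parked thread from __kmp_thread_pool and
   re-initialize it for the new team; (2) otherwise pick the first free gtid
   slot, allocate a new kmp_info_t plus its reserve serial team, initialize
   barrier and affinity state, and only then fork the OS worker with
   __kmp_create_worker. */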
4194 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
4195 int new_tid) {
4196 kmp_team_t *serial_team;
4197 kmp_info_t *new_thr;
4198 int new_gtid;
4199
4200 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
4201 KMP_DEBUG_ASSERT(root && team);
4202 #if !KMP_NESTED_HOT_TEAMS
4203 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
4204 #endif
4205 KMP_MB();
4206
4207 /* first, try to get one from the thread pool */
4208 if (__kmp_thread_pool) {
4209 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
4210 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
4211 if (new_thr == __kmp_thread_pool_insert_pt) {
4212 __kmp_thread_pool_insert_pt = NULL;
4213 }
4214 TCW_4(new_thr->th.th_in_pool, FALSE);
4215 __kmp_suspend_initialize_thread(new_thr);
4216 __kmp_lock_suspend_mx(new_thr);
4217 if (new_thr->th.th_active_in_pool == TRUE) {
4218 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
4219 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
4220 new_thr->th.th_active_in_pool = FALSE;
4221 }
4222 __kmp_unlock_suspend_mx(new_thr);
4223
4224 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4225 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
4226 KMP_ASSERT(!new_thr->th.th_team);
4227 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);
4228
4229 /* setup the thread structure */
4230 __kmp_initialize_info(new_thr, team, new_tid,
4231 new_thr->th.th_info.ds.ds_gtid);
4232 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);
4233
4234 TCW_4(__kmp_nth, __kmp_nth + 1);
4235
4236 new_thr->th.th_task_state = 0;
4237 new_thr->th.th_task_state_top = 0;
4238 new_thr->th.th_task_state_stack_sz = 4;
4239
4240 #ifdef KMP_ADJUST_BLOCKTIME
4241 /* Adjust blocktime back to zero if necessary */
4242 /* Middle initialization might not have occurred yet */
4243 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4244 if (__kmp_nth > __kmp_avail_proc) {
4245 __kmp_zero_bt = TRUE;
4246 }
4247 }
4248 #endif /* KMP_ADJUST_BLOCKTIME */
4249
4250 #if KMP_DEBUG
4251     // If the thread entered the pool via __kmp_free_thread, wait_flag should
4252     // not be KMP_BARRIER_PARENT_FLAG.
4253 int b;
4254 kmp_balign_t *balign = new_thr->th.th_bar;
4255 for (b = 0; b < bs_last_barrier; ++b)
4256 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4257 #endif
4258
4259 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4260 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4261
4262 KMP_MB();
4263 return new_thr;
4264 }
4265
4266   /* no, we'll fork a new one */
4267 KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4268 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4269
4270 #if KMP_USE_MONITOR
4271 // If this is the first worker thread the RTL is creating, then also
4272 // launch the monitor thread. We try to do this as early as possible.
4273 if (!TCR_4(__kmp_init_monitor)) {
4274 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4275 if (!TCR_4(__kmp_init_monitor)) {
4276 KF_TRACE(10, ("before __kmp_create_monitor\n"));
4277 TCW_4(__kmp_init_monitor, 1);
4278 __kmp_create_monitor(&__kmp_monitor);
4279 KF_TRACE(10, ("after __kmp_create_monitor\n"));
4280 #if KMP_OS_WINDOWS
4281 // AC: wait until monitor has started. This is a fix for CQ232808.
4282 // The reason is that if the library is loaded/unloaded in a loop with
4283 // small (parallel) work in between, then there is high probability that
4284 // monitor thread started after the library shutdown. At shutdown it is
4285 // too late to cope with the problem, because when the master is in
4286 // DllMain (process detach) the monitor has no chances to start (it is
4287 // blocked), and master has no means to inform the monitor that the
4288 // library has gone, because all the memory which the monitor can access
4289 // is going to be released/reset.
4290 while (TCR_4(__kmp_init_monitor) < 2) {
4291 KMP_YIELD(TRUE);
4292 }
4293 KF_TRACE(10, ("after monitor thread has started\n"));
4294 #endif
4295 }
4296 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4297 }
4298 #endif
4299
4300 KMP_MB();
4301 for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
4302 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4303 }
4304
4305 /* allocate space for it. */
4306 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
4307
4308 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4309
4310 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
4311   // Suppress race condition detection on synchronization flags in debug mode;
4312   // this helps to analyze library internals by eliminating false positives.
4313 __itt_suppress_mark_range(
4314 __itt_suppress_range, __itt_suppress_threading_errors,
4315 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
4316 __itt_suppress_mark_range(
4317 __itt_suppress_range, __itt_suppress_threading_errors,
4318 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
4319 #if KMP_OS_WINDOWS
4320 __itt_suppress_mark_range(
4321 __itt_suppress_range, __itt_suppress_threading_errors,
4322 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
4323 #else
4324 __itt_suppress_mark_range(__itt_suppress_range,
4325 __itt_suppress_threading_errors,
4326 &new_thr->th.th_suspend_init_count,
4327 sizeof(new_thr->th.th_suspend_init_count));
4328 #endif
4329 // TODO: check if we need to also suppress b_arrived flags
4330 __itt_suppress_mark_range(__itt_suppress_range,
4331 __itt_suppress_threading_errors,
4332 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
4333 sizeof(new_thr->th.th_bar[0].bb.b_go));
4334 __itt_suppress_mark_range(__itt_suppress_range,
4335 __itt_suppress_threading_errors,
4336 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
4337 sizeof(new_thr->th.th_bar[1].bb.b_go));
4338 __itt_suppress_mark_range(__itt_suppress_range,
4339 __itt_suppress_threading_errors,
4340 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
4341 sizeof(new_thr->th.th_bar[2].bb.b_go));
4342 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
4343 if (__kmp_storage_map) {
4344 __kmp_print_thread_storage_map(new_thr, new_gtid);
4345 }
4346
4347 // add the reserve serialized team, initialized from the team's master thread
4348 {
4349 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
4350 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
4351 new_thr->th.th_serial_team = serial_team =
4352 (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
4353 #if OMPT_SUPPORT
4354 ompt_data_none, // root parallel id
4355 #endif
4356 proc_bind_default, &r_icvs,
4357 0 USE_NESTED_HOT_ARG(NULL));
4358 }
4359 KMP_ASSERT(serial_team);
4360 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
4361 // execution (it is unused for now).
4362 serial_team->t.t_threads[0] = new_thr;
4363 KF_TRACE(10,
4364 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4365 new_thr));
4366
4367 /* setup the thread structures */
4368 __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4369
4370 #if USE_FAST_MEMORY
4371 __kmp_initialize_fast_memory(new_thr);
4372 #endif /* USE_FAST_MEMORY */
4373
4374 #if KMP_USE_BGET
4375 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4376 __kmp_initialize_bget(new_thr);
4377 #endif
4378
4379 __kmp_init_random(new_thr); // Initialize random number generator
4380
4381 /* Initialize these only once when thread is grabbed for a team allocation */
4382 KA_TRACE(20,
4383 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4384 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4385
4386 int b;
4387 kmp_balign_t *balign = new_thr->th.th_bar;
4388 for (b = 0; b < bs_last_barrier; ++b) {
4389 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4390 balign[b].bb.team = NULL;
4391 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4392 balign[b].bb.use_oncore_barrier = 0;
4393 }
4394
4395 new_thr->th.th_spin_here = FALSE;
4396 new_thr->th.th_next_waiting = 0;
4397 #if KMP_OS_UNIX
4398 new_thr->th.th_blocking = false;
4399 #endif
4400
4401 #if KMP_AFFINITY_SUPPORTED
4402 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4403 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4404 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4405 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4406 #endif
4407 new_thr->th.th_def_allocator = __kmp_def_allocator;
4408 new_thr->th.th_prev_level = 0;
4409 new_thr->th.th_prev_num_threads = 1;
4410
4411 TCW_4(new_thr->th.th_in_pool, FALSE);
4412 new_thr->th.th_active_in_pool = FALSE;
4413 TCW_4(new_thr->th.th_active, TRUE);
4414
4415 /* adjust the global counters */
4416 __kmp_all_nth++;
4417 __kmp_nth++;
4418
4419 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4420 // numbers of procs, and method #2 (keyed API call) for higher numbers.
4421 if (__kmp_adjust_gtid_mode) {
4422 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4423 if (TCR_4(__kmp_gtid_mode) != 2) {
4424 TCW_4(__kmp_gtid_mode, 2);
4425 }
4426 } else {
4427 if (TCR_4(__kmp_gtid_mode) != 1) {
4428 TCW_4(__kmp_gtid_mode, 1);
4429 }
4430 }
4431 }
4432
4433 #ifdef KMP_ADJUST_BLOCKTIME
4434 /* Adjust blocktime back to zero if necessary */
4435 /* Middle initialization might not have occurred yet */
4436 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4437 if (__kmp_nth > __kmp_avail_proc) {
4438 __kmp_zero_bt = TRUE;
4439 }
4440 }
4441 #endif /* KMP_ADJUST_BLOCKTIME */
4442
4443 /* actually fork it and create the new worker thread */
4444 KF_TRACE(
4445 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4446 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4447 KF_TRACE(10,
4448 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4449
4450 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4451 new_gtid));
4452 KMP_MB();
4453 return new_thr;
4454 }
4455
4456 /* Reinitialize team for reuse.
4457    The hot team code calls this routine at every fork barrier, so the EPCC
4458    barrier tests are extremely sensitive to changes in it, esp. writes to the
4459    team struct, which cause a cache invalidation in all threads.
4460 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */
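/* To keep those writes to a minimum, the fields below are updated through
   KMP_CHECK_UPDATE, which only stores when the new value actually differs
   from the current one, so an unchanged hot team does not dirty cache lines
   that workers are reading. */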
4461 static void __kmp_reinitialize_team(kmp_team_t *team,
4462 kmp_internal_control_t *new_icvs,
4463 ident_t *loc) {
4464 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4465 team->t.t_threads[0], team));
4466 KMP_DEBUG_ASSERT(team && new_icvs);
4467 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
4468 KMP_CHECK_UPDATE(team->t.t_ident, loc);
4469
4470 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4471 // Copy ICVs to the master thread's implicit taskdata
4472 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE);
4473 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4474
4475 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4476 team->t.t_threads[0], team));
4477 }
4478
4479 /* Initialize the team data structure.
4480 This assumes the t_threads and t_max_nproc are already set.
4481 Also, we don't touch the arguments */
4482 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
4483 kmp_internal_control_t *new_icvs,
4484 ident_t *loc) {
4485 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team));
4486
4487 /* verify */
4488 KMP_DEBUG_ASSERT(team);
4489 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc);
4490 KMP_DEBUG_ASSERT(team->t.t_threads);
4491 KMP_MB();
4492
4493 team->t.t_master_tid = 0; /* not needed */
4494 /* team->t.t_master_bar; not needed */
4495 team->t.t_serialized = new_nproc > 1 ? 0 : 1;
4496 team->t.t_nproc = new_nproc;
4497
4498 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */
4499 team->t.t_next_pool = NULL;
4500 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess
4501 * up hot team */
4502
4503 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4504 team->t.t_invoke = NULL; /* not needed */
4505
4506 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4507 team->t.t_sched.sched = new_icvs->sched.sched;
4508
4509 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4510 team->t.t_fp_control_saved = FALSE; /* not needed */
4511 team->t.t_x87_fpu_control_word = 0; /* not needed */
4512 team->t.t_mxcsr = 0; /* not needed */
4513 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4514
4515 team->t.t_construct = 0;
4516
4517 team->t.t_ordered.dt.t_value = 0;
4518 team->t.t_master_active = FALSE;
4519
4520 #ifdef KMP_DEBUG
4521 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */
4522 #endif
4523 #if KMP_OS_WINDOWS
4524 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */
4525 #endif
4526
4527 team->t.t_control_stack_top = NULL;
4528
4529 __kmp_reinitialize_team(team, new_icvs, loc);
4530
4531 KMP_MB();
4532 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team));
4533 }
4534
4535 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
4536 /* Sets full mask for thread and returns old mask, no changes to structures. */
4537 static void
4538 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
4539 if (KMP_AFFINITY_CAPABLE()) {
4540 int status;
4541 if (old_mask != NULL) {
4542 status = __kmp_get_system_affinity(old_mask, TRUE);
4543 int error = errno;
4544 if (status != 0) {
4545 __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
4546 __kmp_msg_null);
4547 }
4548 }
4549 __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
4550 }
4551 }
4552 #endif
4553
4554 #if KMP_AFFINITY_SUPPORTED
4555
4556 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4557 // It calculates the worker + master thread's partition based upon the parent
4558 // thread's partition, and binds each worker to a place in its partition.
4559 // The master thread's partition should already include its current binding.
4560 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
4561 // Copy the master thread's place partition to the team struct
4562 kmp_info_t *master_th = team->t.t_threads[0];
4563 KMP_DEBUG_ASSERT(master_th != NULL);
4564 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4565 int first_place = master_th->th.th_first_place;
4566 int last_place = master_th->th.th_last_place;
4567 int masters_place = master_th->th.th_current_place;
4568 team->t.t_first_place = first_place;
4569 team->t.t_last_place = last_place;
4570
4571 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
4572 "bound to place %d partition = [%d,%d]\n",
4573 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
4574 team->t.t_id, masters_place, first_place, last_place));
4575
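// Policy summary: proc_bind_master binds every worker to the master's place;
// proc_bind_close packs threads onto the places nearest the master;
// proc_bind_spread spaces threads (or per-thread sub-partitions of places)
// as evenly as possible across the master's partition.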
4576 switch (proc_bind) {
4577
4578 case proc_bind_default:
4579 // serial teams might have the proc_bind policy set to proc_bind_default. It
4580 // doesn't matter, as we don't rebind master thread for any proc_bind policy
4581 KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
4582 break;
4583
4584 case proc_bind_master: {
4585 int f;
4586 int n_th = team->t.t_nproc;
4587 for (f = 1; f < n_th; f++) {
4588 kmp_info_t *th = team->t.t_threads[f];
4589 KMP_DEBUG_ASSERT(th != NULL);
4590 th->th.th_first_place = first_place;
4591 th->th.th_last_place = last_place;
4592 th->th.th_new_place = masters_place;
4593 if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
4594 team->t.t_display_affinity != 1) {
4595 team->t.t_display_affinity = 1;
4596 }
4597
4598 KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
4599 "partition = [%d,%d]\n",
4600 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4601 f, masters_place, first_place, last_place));
4602 }
4603 } break;
4604
4605 case proc_bind_close: {
4606 int f;
4607 int n_th = team->t.t_nproc;
4608 int n_places;
4609 if (first_place <= last_place) {
4610 n_places = last_place - first_place + 1;
4611 } else {
4612 n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4613 }
4614 if (n_th <= n_places) {
4615 int place = masters_place;
4616 for (f = 1; f < n_th; f++) {
4617 kmp_info_t *th = team->t.t_threads[f];
4618 KMP_DEBUG_ASSERT(th != NULL);
4619
4620 if (place == last_place) {
4621 place = first_place;
4622 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4623 place = 0;
4624 } else {
4625 place++;
4626 }
4627 th->th.th_first_place = first_place;
4628 th->th.th_last_place = last_place;
4629 th->th.th_new_place = place;
4630 if (__kmp_display_affinity && place != th->th.th_current_place &&
4631 team->t.t_display_affinity != 1) {
4632 team->t.t_display_affinity = 1;
4633 }
4634
4635 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4636 "partition = [%d,%d]\n",
4637 __kmp_gtid_from_thread(team->t.t_threads[f]),
4638 team->t.t_id, f, place, first_place, last_place));
4639 }
4640 } else {
4641 int S, rem, gap, s_count;
4642 S = n_th / n_places;
4643 s_count = 0;
4644 rem = n_th - (S * n_places);
4645 gap = rem > 0 ? n_places / rem : n_places;
4646 int place = masters_place;
4647 int gap_ct = gap;
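// More threads than places: each place gets S = n_th / n_places threads, and
// the rem = n_th % n_places leftover threads are handed out one extra per
// place roughly every 'gap' places, starting from the master's place.
// Illustrative example: n_th = 10, n_places = 4 gives S = 2, rem = 2, gap = 2,
// so consecutive places receive 3, 2, 3, 2 threads.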
4648 for (f = 0; f < n_th; f++) {
4649 kmp_info_t *th = team->t.t_threads[f];
4650 KMP_DEBUG_ASSERT(th != NULL);
4651
4652 th->th.th_first_place = first_place;
4653 th->th.th_last_place = last_place;
4654 th->th.th_new_place = place;
4655 if (__kmp_display_affinity && place != th->th.th_current_place &&
4656 team->t.t_display_affinity != 1) {
4657 team->t.t_display_affinity = 1;
4658 }
4659 s_count++;
4660
4661 if ((s_count == S) && rem && (gap_ct == gap)) {
4662 // do nothing, add an extra thread to place on next iteration
4663 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4664 // we added an extra thread to this place; move to next place
4665 if (place == last_place) {
4666 place = first_place;
4667 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4668 place = 0;
4669 } else {
4670 place++;
4671 }
4672 s_count = 0;
4673 gap_ct = 1;
4674 rem--;
4675 } else if (s_count == S) { // place full; don't add extra
4676 if (place == last_place) {
4677 place = first_place;
4678 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4679 place = 0;
4680 } else {
4681 place++;
4682 }
4683 gap_ct++;
4684 s_count = 0;
4685 }
4686
4687 KA_TRACE(100,
4688 ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
4689 "partition = [%d,%d]\n",
4690 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
4691 th->th.th_new_place, first_place, last_place));
4692 }
4693 KMP_DEBUG_ASSERT(place == masters_place);
4694 }
4695 } break;
4696
4697 case proc_bind_spread: {
4698 int f;
4699 int n_th = team->t.t_nproc;
4700 int n_places;
4701 int thidx;
4702 if (first_place <= last_place) {
4703 n_places = last_place - first_place + 1;
4704 } else {
4705 n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4706 }
4707 if (n_th <= n_places) {
4708 int place = -1;
4709
4710 if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
4711 int S = n_places / n_th;
4712 int s_count, rem, gap, gap_ct;
4713
4714 place = masters_place;
4715 rem = n_places - n_th * S;
4716 gap = rem ? n_th / rem : 1;
4717 gap_ct = gap;
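// Fewer threads than places: each thread gets a contiguous sub-partition of
// S = n_places / n_th places; the rem = n_places % n_th leftover places are
// granted, one extra each, to roughly every 'gap'-th thread.
// Illustrative example: n_places = 10, n_th = 4 yields sub-partitions of
// 3, 2, 3, 2 places, starting at the master's place.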
4718 thidx = n_th;
4719 if (update_master_only == 1)
4720 thidx = 1;
4721 for (f = 0; f < thidx; f++) {
4722 kmp_info_t *th = team->t.t_threads[f];
4723 KMP_DEBUG_ASSERT(th != NULL);
4724
4725 th->th.th_first_place = place;
4726 th->th.th_new_place = place;
4727 if (__kmp_display_affinity && place != th->th.th_current_place &&
4728 team->t.t_display_affinity != 1) {
4729 team->t.t_display_affinity = 1;
4730 }
4731 s_count = 1;
4732 while (s_count < S) {
4733 if (place == last_place) {
4734 place = first_place;
4735 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4736 place = 0;
4737 } else {
4738 place++;
4739 }
4740 s_count++;
4741 }
4742 if (rem && (gap_ct == gap)) {
4743 if (place == last_place) {
4744 place = first_place;
4745 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4746 place = 0;
4747 } else {
4748 place++;
4749 }
4750 rem--;
4751 gap_ct = 0;
4752 }
4753 th->th.th_last_place = place;
4754 gap_ct++;
4755
4756 if (place == last_place) {
4757 place = first_place;
4758 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4759 place = 0;
4760 } else {
4761 place++;
4762 }
4763
4764 KA_TRACE(100,
4765 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4766 "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
4767 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
4768 f, th->th.th_new_place, th->th.th_first_place,
4769 th->th.th_last_place, __kmp_affinity_num_masks));
4770 }
4771 } else {
4772 /* Having a uniform space of available computation places, we can create
4773 T partitions of round(P/T) size and put threads into the first
4774 place of each partition. */
4775 double current = static_cast<double>(masters_place);
4776 double spacing =
4777 (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
4778 int first, last;
4779 kmp_info_t *th;
4780
4781 thidx = n_th + 1;
4782 if (update_master_only == 1)
4783 thidx = 1;
4784 for (f = 0; f < thidx; f++) {
4785 first = static_cast<int>(current);
4786 last = static_cast<int>(current + spacing) - 1;
4787 KMP_DEBUG_ASSERT(last >= first);
4788 if (first >= n_places) {
4789 if (masters_place) {
4790 first -= n_places;
4791 last -= n_places;
4792 if (first == (masters_place + 1)) {
4793 KMP_DEBUG_ASSERT(f == n_th);
4794 first--;
4795 }
4796 if (last == masters_place) {
4797 KMP_DEBUG_ASSERT(f == (n_th - 1));
4798 last--;
4799 }
4800 } else {
4801 KMP_DEBUG_ASSERT(f == n_th);
4802 first = 0;
4803 last = 0;
4804 }
4805 }
4806 if (last >= n_places) {
4807 last = (n_places - 1);
4808 }
4809 place = first;
4810 current += spacing;
4811 if (f < n_th) {
4812 KMP_DEBUG_ASSERT(0 <= first);
4813 KMP_DEBUG_ASSERT(n_places > first);
4814 KMP_DEBUG_ASSERT(0 <= last);
4815 KMP_DEBUG_ASSERT(n_places > last);
4816 KMP_DEBUG_ASSERT(last_place >= first_place);
4817 th = team->t.t_threads[f];
4818 KMP_DEBUG_ASSERT(th);
4819 th->th.th_first_place = first;
4820 th->th.th_new_place = place;
4821 th->th.th_last_place = last;
4822 if (__kmp_display_affinity && place != th->th.th_current_place &&
4823 team->t.t_display_affinity != 1) {
4824 team->t.t_display_affinity = 1;
4825 }
4826 KA_TRACE(100,
4827 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4828 "partition = [%d,%d], spacing = %.4f\n",
4829 __kmp_gtid_from_thread(team->t.t_threads[f]),
4830 team->t.t_id, f, th->th.th_new_place,
4831 th->th.th_first_place, th->th.th_last_place, spacing));
4832 }
4833 }
4834 }
4835 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4836 } else {
4837 int S, rem, gap, s_count;
4838 S = n_th / n_places;
4839 s_count = 0;
4840 rem = n_th - (S * n_places);
4841 gap = rem > 0 ? n_places / rem : n_places;
4842 int place = masters_place;
4843 int gap_ct = gap;
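// More threads than places: same S/rem/gap distribution as the
// proc_bind_close case above, except that each thread's partition is
// narrowed to its single assigned place (first == last == place).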
4844 thidx = n_th;
4845 if (update_master_only == 1)
4846 thidx = 1;
4847 for (f = 0; f < thidx; f++) {
4848 kmp_info_t *th = team->t.t_threads[f];
4849 KMP_DEBUG_ASSERT(th != NULL);
4850
4851 th->th.th_first_place = place;
4852 th->th.th_last_place = place;
4853 th->th.th_new_place = place;
4854 if (__kmp_display_affinity && place != th->th.th_current_place &&
4855 team->t.t_display_affinity != 1) {
4856 team->t.t_display_affinity = 1;
4857 }
4858 s_count++;
4859
4860 if ((s_count == S) && rem && (gap_ct == gap)) {
4861 // do nothing, add an extra thread to place on next iteration
4862 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
4863 // we added an extra thread to this place; move on to next place
4864 if (place == last_place) {
4865 place = first_place;
4866 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4867 place = 0;
4868 } else {
4869 place++;
4870 }
4871 s_count = 0;
4872 gap_ct = 1;
4873 rem--;
4874 } else if (s_count == S) { // place is full; don't add extra thread
4875 if (place == last_place) {
4876 place = first_place;
4877 } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
4878 place = 0;
4879 } else {
4880 place++;
4881 }
4882 gap_ct++;
4883 s_count = 0;
4884 }
4885
4886 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
4887 "partition = [%d,%d]\n",
4888 __kmp_gtid_from_thread(team->t.t_threads[f]),
4889 team->t.t_id, f, th->th.th_new_place,
4890 th->th.th_first_place, th->th.th_last_place));
4891 }
4892 KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
4893 }
4894 } break;
4895
4896 default:
4897 break;
4898 }
4899
4900 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
4901 }
4902
4903 #endif // KMP_AFFINITY_SUPPORTED
4904
4905 /* allocate a new team data structure to use. take one off of the free pool if
4906 available */
4907 kmp_team_t *
4908 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc,
4909 #if OMPT_SUPPORT
4910 ompt_data_t ompt_parallel_data,
4911 #endif
4912 kmp_proc_bind_t new_proc_bind,
4913 kmp_internal_control_t *new_icvs,
4914 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) {
4915 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4916 int f;
4917 kmp_team_t *team;
4918 int use_hot_team = !root->r.r_active;
4919 int level = 0;
4920
4921 KA_TRACE(20, ("__kmp_allocate_team: called\n"));
4922 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0);
4923 KMP_DEBUG_ASSERT(max_nproc >= new_nproc);
4924 KMP_MB();
4925
4926 #if KMP_NESTED_HOT_TEAMS
4927 kmp_hot_team_ptr_t *hot_teams;
4928 if (master) {
4929 team = master->th.th_team;
4930 level = team->t.t_active_level;
4931 if (master->th.th_teams_microtask) { // in teams construct?
4932 if (master->th.th_teams_size.nteams > 1 &&
4933 ( // #teams > 1
4934 team->t.t_pkfn ==
4935 (microtask_t)__kmp_teams_master || // inner fork of the teams
4936 master->th.th_teams_level <
4937 team->t.t_level)) { // or nested parallel inside the teams
4938 ++level; // do not increment if #teams==1 or for the outer fork of the
4939 // teams construct; increment otherwise
4940 }
4941 }
4942 hot_teams = master->th.th_hot_teams;
4943 if (level < __kmp_hot_teams_max_level && hot_teams &&
4944 hot_teams[level].hot_team) {
4945 // hot team has already been allocated for given level
4946 use_hot_team = 1;
4947 } else {
4948 use_hot_team = 0;
4949 }
4950 } else {
4951 // check we won't access uninitialized hot_teams, just in case
4952 KMP_DEBUG_ASSERT(new_nproc == 1);
4953 }
4954 #endif
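// Three ways to satisfy the request, tried in order: reuse the (possibly
// nested) hot team, recycle a team from the team pool, or allocate and
// initialize a brand new team.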
4955 // Optimization to use a "hot" team
4956 if (use_hot_team && new_nproc > 1) {
4957 KMP_DEBUG_ASSERT(new_nproc <= max_nproc);
4958 #if KMP_NESTED_HOT_TEAMS
4959 team = hot_teams[level].hot_team;
4960 #else
4961 team = root->r.r_hot_team;
4962 #endif
4963 #if KMP_DEBUG
4964 if (__kmp_tasking_mode != tskm_immediate_exec) {
4965 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
4966 "task_team[1] = %p before reinit\n",
4967 team->t.t_task_team[0], team->t.t_task_team[1]));
4968 }
4969 #endif
4970
4971 // Has the number of threads changed?
4972 /* Let's assume the most common case is that the number of threads is
4973 unchanged, and put that case first. */
4974 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4975 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n"));
4976 // This case can mean that omp_set_num_threads() was called and the hot
4977 // team size was already reduced, so we check the special flag
4978 if (team->t.t_size_changed == -1) {
4979 team->t.t_size_changed = 1;
4980 } else {
4981 KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4982 }
4983
4984 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4985 kmp_r_sched_t new_sched = new_icvs->sched;
4986 // set master's schedule as new run-time schedule
4987 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched);
4988
4989 __kmp_reinitialize_team(team, new_icvs,
4990 root->r.r_uber_thread->th.th_ident);
4991
4992 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0,
4993 team->t.t_threads[0], team));
4994 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
4995
4996 #if KMP_AFFINITY_SUPPORTED
4997 if ((team->t.t_size_changed == 0) &&
4998 (team->t.t_proc_bind == new_proc_bind)) {
4999 if (new_proc_bind == proc_bind_spread) {
5000 __kmp_partition_places(
5001 team, 1); // add flag to update only master for spread
5002 }
5003 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: "
5004 "proc_bind = %d, partition = [%d,%d]\n",
5005 team->t.t_id, new_proc_bind, team->t.t_first_place,
5006 team->t.t_last_place));
5007 } else {
5008 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5009 __kmp_partition_places(team);
5010 }
5011 #else
5012 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5013 #endif /* KMP_AFFINITY_SUPPORTED */
5014 } else if (team->t.t_nproc > new_nproc) {
5015 KA_TRACE(20,
5016 ("__kmp_allocate_team: decreasing hot team thread count to %d\n",
5017 new_nproc));
5018
5019 team->t.t_size_changed = 1;
5020 #if KMP_NESTED_HOT_TEAMS
5021 if (__kmp_hot_teams_mode == 0) {
5022 // AC: the saved number of threads should correspond to the team's value in
5023 // this mode; it can be bigger in mode 1, when the hot team keeps threads in reserve
5024 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
5025 hot_teams[level].hot_team_nth = new_nproc;
5026 #endif // KMP_NESTED_HOT_TEAMS
5027 /* release the extra threads we don't need any more */
5028 for (f = new_nproc; f < team->t.t_nproc; f++) {
5029 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5030 if (__kmp_tasking_mode != tskm_immediate_exec) {
5031 // When decreasing team size, threads no longer in the team should
5032 // unref task team.
5033 team->t.t_threads[f]->th.th_task_team = NULL;
5034 }
5035 __kmp_free_thread(team->t.t_threads[f]);
5036 team->t.t_threads[f] = NULL;
5037 }
5038 #if KMP_NESTED_HOT_TEAMS
5039 } // (__kmp_hot_teams_mode == 0)
5040 else {
5041 // When keeping extra threads in team, switch threads to wait on own
5042 // b_go flag
5043 for (f = new_nproc; f < team->t.t_nproc; ++f) {
5044 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5045 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
5046 for (int b = 0; b < bs_last_barrier; ++b) {
5047 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
5048 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5049 }
5050 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
5051 }
5052 }
5053 }
5054 #endif // KMP_NESTED_HOT_TEAMS
5055 team->t.t_nproc = new_nproc;
5056 // TODO???: team->t.t_max_active_levels = new_max_active_levels;
5057 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched);
5058 __kmp_reinitialize_team(team, new_icvs,
5059 root->r.r_uber_thread->th.th_ident);
5060
5061 // Update remaining threads
5062 for (f = 0; f < new_nproc; ++f) {
5063 team->t.t_threads[f]->th.th_team_nproc = new_nproc;
5064 }
5065
5066 // restore the current task state of the master thread: should be the
5067 // implicit task
5068 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0,
5069 team->t.t_threads[0], team));
5070
5071 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0);
5072
5073 #ifdef KMP_DEBUG
5074 for (f = 0; f < team->t.t_nproc; f++) {
5075 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5076 team->t.t_threads[f]->th.th_team_nproc ==
5077 team->t.t_nproc);
5078 }
5079 #endif
5080
5081 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5082 #if KMP_AFFINITY_SUPPORTED
5083 __kmp_partition_places(team);
5084 #endif
5085 } else { // team->t.t_nproc < new_nproc
5086 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5087 kmp_affin_mask_t *old_mask;
5088 if (KMP_AFFINITY_CAPABLE()) {
5089 KMP_CPU_ALLOC(old_mask);
5090 }
5091 #endif
5092
5093 KA_TRACE(20,
5094 ("__kmp_allocate_team: increasing hot team thread count to %d\n",
5095 new_nproc));
5096
5097 team->t.t_size_changed = 1;
5098
5099 #if KMP_NESTED_HOT_TEAMS
5100 int avail_threads = hot_teams[level].hot_team_nth;
5101 if (new_nproc < avail_threads)
5102 avail_threads = new_nproc;
5103 kmp_info_t **other_threads = team->t.t_threads;
5104 for (f = team->t.t_nproc; f < avail_threads; ++f) {
5105 // Adjust barrier data of reserved threads (if any) of the team
5106 // Other data will be set in __kmp_initialize_info() below.
5107 int b;
5108 kmp_balign_t *balign = other_threads[f]->th.th_bar;
5109 for (b = 0; b < bs_last_barrier; ++b) {
5110 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5111 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5112 #if USE_DEBUGGER
5113 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5114 #endif
5115 }
5116 }
5117 if (hot_teams[level].hot_team_nth >= new_nproc) {
5118 // we have all needed threads in reserve, no need to allocate any
5119 // this is only possible in mode 1; we cannot have reserved threads in mode 0
5120 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
5121 team->t.t_nproc = new_nproc; // just get reserved threads involved
5122 } else {
5123 // we may have some threads in reserve, but not enough
5124 team->t.t_nproc =
5125 hot_teams[level]
5126 .hot_team_nth; // get reserved threads involved if any
5127 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size
5128 #endif // KMP_NESTED_HOT_TEAMS
5129 if (team->t.t_max_nproc < new_nproc) {
5130 /* reallocate larger arrays */
5131 __kmp_reallocate_team_arrays(team, new_nproc);
5132 __kmp_reinitialize_team(team, new_icvs, NULL);
5133 }
5134
5135 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5136 /* Temporarily set full mask for master thread before creation of
5137 workers. The reason is that workers inherit the affinity from the master,
5138 so if a lot of workers are created on a single core quickly, they
5139 don't get a chance to set their own affinity for a long time. */
5140 __kmp_set_thread_affinity_mask_full_tmp(old_mask);
5141 #endif
5142
5143 /* allocate new threads for the hot team */
5144 for (f = team->t.t_nproc; f < new_nproc; f++) {
5145 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f);
5146 KMP_DEBUG_ASSERT(new_worker);
5147 team->t.t_threads[f] = new_worker;
5148
5149 KA_TRACE(20,
5150 ("__kmp_allocate_team: team %d init T#%d arrived: "
5151 "join=%llu, plain=%llu\n",
5152 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f,
5153 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
5154 team->t.t_bar[bs_plain_barrier].b_arrived));
5155
5156 { // Initialize barrier data for new threads.
5157 int b;
5158 kmp_balign_t *balign = new_worker->th.th_bar;
5159 for (b = 0; b < bs_last_barrier; ++b) {
5160 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5161 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag !=
5162 KMP_BARRIER_PARENT_FLAG);
5163 #if USE_DEBUGGER
5164 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5165 #endif
5166 }
5167 }
5168 }
5169
5170 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED
5171 if (KMP_AFFINITY_CAPABLE()) {
5172 /* Restore initial master thread's affinity mask */
5173 __kmp_set_system_affinity(old_mask, TRUE);
5174 KMP_CPU_FREE(old_mask);
5175 }
5176 #endif
5177 #if KMP_NESTED_HOT_TEAMS
5178 } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5179 #endif // KMP_NESTED_HOT_TEAMS
5180 /* make sure everyone is synchronized */
5181 int old_nproc = team->t.t_nproc; // save old value and use to update only
5182 // new threads below
5183 __kmp_initialize_team(team, new_nproc, new_icvs,
5184 root->r.r_uber_thread->th.th_ident);
5185
5186 /* reinitialize the threads */
5187 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5188 for (f = 0; f < team->t.t_nproc; ++f)
5189 __kmp_initialize_info(team->t.t_threads[f], team, f,
5190 __kmp_gtid_from_tid(f, team));
5191
5192 if (level) { // set th_task_state for new threads in nested hot team
5193 // __kmp_initialize_info() no longer zeroes th_task_state, so we should
5194 // only need to set the th_task_state for the new threads. th_task_state
5195 // for master thread will not be accurate until after this in
5196 // __kmp_fork_call(), so we look to the master's memo_stack to get the
5197 // correct value.
5198 for (f = old_nproc; f < team->t.t_nproc; ++f)
5199 team->t.t_threads[f]->th.th_task_state =
5200 team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5201 } else { // set th_task_state for new threads in non-nested hot team
5202 int old_state =
5203 team->t.t_threads[0]->th.th_task_state; // copy master's state
5204 for (f = old_nproc; f < team->t.t_nproc; ++f)
5205 team->t.t_threads[f]->th.th_task_state = old_state;
5206 }
5207
5208 #ifdef KMP_DEBUG
5209 for (f = 0; f < team->t.t_nproc; ++f) {
5210 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
5211 team->t.t_threads[f]->th.th_team_nproc ==
5212 team->t.t_nproc);
5213 }
5214 #endif
5215
5216 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5217 #if KMP_AFFINITY_SUPPORTED
5218 __kmp_partition_places(team);
5219 #endif
5220 } // Check changes in number of threads
5221
5222 kmp_info_t *master = team->t.t_threads[0];
5223 if (master->th.th_teams_microtask) {
5224 for (f = 1; f < new_nproc; ++f) {
5225 // propagate teams construct specific info to workers
5226 kmp_info_t *thr = team->t.t_threads[f];
5227 thr->th.th_teams_microtask = master->th.th_teams_microtask;
5228 thr->th.th_teams_level = master->th.th_teams_level;
5229 thr->th.th_teams_size = master->th.th_teams_size;
5230 }
5231 }
5232 #if KMP_NESTED_HOT_TEAMS
5233 if (level) {
5234 // Sync barrier state for nested hot teams, not needed for outermost hot
5235 // team.
5236 for (f = 1; f < new_nproc; ++f) {
5237 kmp_info_t *thr = team->t.t_threads[f];
5238 int b;
5239 kmp_balign_t *balign = thr->th.th_bar;
5240 for (b = 0; b < bs_last_barrier; ++b) {
5241 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
5242 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5243 #if USE_DEBUGGER
5244 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
5245 #endif
5246 }
5247 }
5248 }
5249 #endif // KMP_NESTED_HOT_TEAMS
5250
5251 /* reallocate space for arguments if necessary */
5252 __kmp_alloc_argv_entries(argc, team, TRUE);
5253 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5254 // The hot team re-uses the previous task team,
5255 // if untouched during the previous release->gather phase.
5256
5257 KF_TRACE(10, (" hot_team = %p\n", team));
5258
5259 #if KMP_DEBUG
5260 if (__kmp_tasking_mode != tskm_immediate_exec) {
5261 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p "
5262 "task_team[1] = %p after reinit\n",
5263 team->t.t_task_team[0], team->t.t_task_team[1]));
5264 }
5265 #endif
5266
5267 #if OMPT_SUPPORT
5268 __ompt_team_assign_id(team, ompt_parallel_data);
5269 #endif
5270
5271 KMP_MB();
5272
5273 return team;
5274 }
5275
5276 /* next, let's try to take one from the team pool */
5277 KMP_MB();
5278 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) {
5279 /* TODO: consider resizing undersized teams instead of reaping them, now
5280 that we have a resizing mechanism */
5281 if (team->t.t_max_nproc >= max_nproc) {
5282 /* take this team from the team pool */
5283 __kmp_team_pool = team->t.t_next_pool;
5284
5285 /* setup the team for fresh use */
5286 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5287
5288 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and "
5289 "task_team[1] %p to NULL\n",
5290 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5291 team->t.t_task_team[0] = NULL;
5292 team->t.t_task_team[1] = NULL;
5293
5294 /* reallocate space for arguments if necessary */
5295 __kmp_alloc_argv_entries(argc, team, TRUE);
5296 KMP_CHECK_UPDATE(team->t.t_argc, argc);
5297
5298 KA_TRACE(
5299 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5300 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5301 { // Initialize barrier data.
5302 int b;
5303 for (b = 0; b < bs_last_barrier; ++b) {
5304 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5305 #if USE_DEBUGGER
5306 team->t.t_bar[b].b_master_arrived = 0;
5307 team->t.t_bar[b].b_team_arrived = 0;
5308 #endif
5309 }
5310 }
5311
5312 team->t.t_proc_bind = new_proc_bind;
5313
5314 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n",
5315 team->t.t_id));
5316
5317 #if OMPT_SUPPORT
5318 __ompt_team_assign_id(team, ompt_parallel_data);
5319 #endif
5320
5321 KMP_MB();
5322
5323 return team;
5324 }
5325
5326 /* reap team if it is too small, then loop back and check the next one */
5327 // not sure if this is wise, but it will be redone during the hot-teams
5328 // rewrite.
5329 /* TODO: Use technique to find the right size hot-team, don't reap them */
5330 team = __kmp_reap_team(team);
5331 __kmp_team_pool = team;
5332 }
5333
5334 /* nothing available in the pool, no matter, make a new team! */
5335 KMP_MB();
5336 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t));
5337
5338 /* and set it up */
5339 team->t.t_max_nproc = max_nproc;
5340 /* NOTE well, for some reason allocating one big buffer and dividing it up
5341 seems to really hurt performance a lot on the P4, so let's not use this */
5342 __kmp_allocate_team_arrays(team, max_nproc);
5343
5344 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n"));
5345 __kmp_initialize_team(team, new_nproc, new_icvs, NULL);
5346
5347 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] "
5348 "%p to NULL\n",
5349 &team->t.t_task_team[0], &team->t.t_task_team[1]));
5350 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes
5351 // memory, no need to duplicate
5352 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes
5353 // memory, no need to duplicate
5354
5355 if (__kmp_storage_map) {
5356 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc);
5357 }
5358
5359 /* allocate space for arguments */
5360 __kmp_alloc_argv_entries(argc, team, FALSE);
5361 team->t.t_argc = argc;
5362
5363 KA_TRACE(20,
5364 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5365 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
5366 { // Initialize barrier data.
5367 int b;
5368 for (b = 0; b < bs_last_barrier; ++b) {
5369 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE;
5370 #if USE_DEBUGGER
5371 team->t.t_bar[b].b_master_arrived = 0;
5372 team->t.t_bar[b].b_team_arrived = 0;
5373 #endif
5374 }
5375 }
5376
5377 team->t.t_proc_bind = new_proc_bind;
5378
5379 #if OMPT_SUPPORT
5380 __ompt_team_assign_id(team, ompt_parallel_data);
5381 team->t.ompt_serialized_team_info = NULL;
5382 #endif
5383
5384 KMP_MB();
5385
5386 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n",
5387 team->t.t_id));
5388
5389 return team;
5390 }
5391
5392 /* TODO implement hot-teams at all levels */
5393 /* TODO implement lazy thread release on demand (disband request) */
5394
5395 /* free the team. return it to the team pool. release all the threads
5396 * associated with it */
5397 void __kmp_free_team(kmp_root_t *root,
5398 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) {
5399 int f;
5400 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(),
5401 team->t.t_id));
5402
5403 /* verify state */
5404 KMP_DEBUG_ASSERT(root);
5405 KMP_DEBUG_ASSERT(team);
5406 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc);
5407 KMP_DEBUG_ASSERT(team->t.t_threads);
5408
5409 int use_hot_team = team == root->r.r_hot_team;
5410 #if KMP_NESTED_HOT_TEAMS
5411 int level;
5412 kmp_hot_team_ptr_t *hot_teams;
5413 if (master) {
5414 level = team->t.t_active_level - 1;
5415 if (master->th.th_teams_microtask) { // in teams construct?
5416 if (master->th.th_teams_size.nteams > 1) {
5417 ++level; // level was not increased in teams construct for
5418 // team_of_masters
5419 }
5420 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5421 master->th.th_teams_level == team->t.t_level) {
5422 ++level; // level was not increased in teams construct for
5423 // team_of_workers before the parallel
5424 } // team->t.t_level will be increased inside parallel
5425 }
5426 hot_teams = master->th.th_hot_teams;
5427 if (level < __kmp_hot_teams_max_level) {
5428 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team);
5429 use_hot_team = 1;
5430 }
5431 }
5432 #endif // KMP_NESTED_HOT_TEAMS
5433
5434 /* team is done working */
5435 TCW_SYNC_PTR(team->t.t_pkfn,
5436 NULL); // Important for Debugging Support Library.
5437 #if KMP_OS_WINDOWS
5438 team->t.t_copyin_counter = 0; // init counter for possible reuse
5439 #endif
5440 // Do not reset pointer to parent team to NULL for hot teams.
5441
5442 /* if we are non-hot team, release our threads */
5443 if (!use_hot_team) {
5444 if (__kmp_tasking_mode != tskm_immediate_exec) {
5445 // Wait for threads to reach reapable state
5446 for (f = 1; f < team->t.t_nproc; ++f) {
5447 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5448 kmp_info_t *th = team->t.t_threads[f];
5449 volatile kmp_uint32 *state = &th->th.th_reap_state;
5450 while (*state != KMP_SAFE_TO_REAP) {
5451 #if KMP_OS_WINDOWS
5452 // On Windows a thread can be killed at any time, check this
5453 DWORD ecode;
5454 if (!__kmp_is_thread_alive(th, &ecode)) {
5455 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5456 break;
5457 }
5458 #endif
5459 // first check if thread is sleeping
5460 kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5461 if (fl.is_sleeping())
5462 fl.resume(__kmp_gtid_from_thread(th));
5463 KMP_CPU_PAUSE();
5464 }
5465 }
5466
5467 // Delete task teams
5468 int tt_idx;
5469 for (tt_idx = 0; tt_idx < 2; ++tt_idx) {
5470 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5471 if (task_team != NULL) {
5472 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams
5473 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5474 team->t.t_threads[f]->th.th_task_team = NULL;
5475 }
5476 KA_TRACE(
5477 20,
5478 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n",
5479 __kmp_get_gtid(), task_team, team->t.t_id));
5480 #if KMP_NESTED_HOT_TEAMS
5481 __kmp_free_task_team(master, task_team);
5482 #endif
5483 team->t.t_task_team[tt_idx] = NULL;
5484 }
5485 }
5486 }
5487
5488 // Reset pointer to parent team only for non-hot teams.
5489 team->t.t_parent = NULL;
5490 team->t.t_level = 0;
5491 team->t.t_active_level = 0;
5492
5493 /* free the worker threads */
5494 for (f = 1; f < team->t.t_nproc; ++f) {
5495 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5496 __kmp_free_thread(team->t.t_threads[f]);
5497 team->t.t_threads[f] = NULL;
5498 }
5499
5500 /* put the team back in the team pool */
5501 /* TODO limit size of team pool, call reap_team if pool too large */
5502 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool);
5503 __kmp_team_pool = (volatile kmp_team_t *)team;
5504 } else { // Check if team was created for the masters in a teams construct
5505 // See if first worker is a CG root
5506 KMP_DEBUG_ASSERT(team->t.t_threads[1] &&
5507 team->t.t_threads[1]->th.th_cg_roots);
5508 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) {
5509 // Clean up the CG root nodes on workers so that this team can be re-used
5510 for (f = 1; f < team->t.t_nproc; ++f) {
5511 kmp_info_t *thr = team->t.t_threads[f];
5512 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots &&
5513 thr->th.th_cg_roots->cg_root == thr);
5514 // Pop current CG root off list
5515 kmp_cg_root_t *tmp = thr->th.th_cg_roots;
5516 thr->th.th_cg_roots = tmp->up;
5517 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving"
5518 " up to node %p. cg_nthreads was %d\n",
5519 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads));
5520 int i = tmp->cg_nthreads--;
5521 if (i == 1) {
5522 __kmp_free(tmp); // free CG if we are the last thread in it
5523 }
5524 // Restore current task's thread_limit from CG root
5525 if (thr->th.th_cg_roots)
5526 thr->th.th_current_task->td_icvs.thread_limit =
5527 thr->th.th_cg_roots->cg_thread_limit;
5528 }
5529 }
5530 }
5531
5532 KMP_MB();
5533 }
5534
5535 /* reap the team. destroy it, reclaim all its resources and free its memory */
5536 kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
5537 kmp_team_t *next_pool = team->t.t_next_pool;
5538
5539 KMP_DEBUG_ASSERT(team);
5540 KMP_DEBUG_ASSERT(team->t.t_dispatch);
5541 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
5542 KMP_DEBUG_ASSERT(team->t.t_threads);
5543 KMP_DEBUG_ASSERT(team->t.t_argv);
5544
5545 /* TODO clean the threads that are a part of this? */
5546
5547 /* free stuff */
5548 __kmp_free_team_arrays(team);
5549 if (team->t.t_argv != &team->t.t_inline_argv[0])
5550 __kmp_free((void *)team->t.t_argv);
5551 __kmp_free(team);
5552
5553 KMP_MB();
5554 return next_pool;
5555 }
5556
5557 // Free the thread. Don't reap it, just place it on the pool of available
5558 // threads.
5559 //
5560 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5561 // binding for the affinity mechanism to be useful.
5562 //
5563 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5564 // However, we want to avoid a potential performance problem by always
5565 // scanning through the list to find the correct point at which to insert
5566 // the thread (potential N**2 behavior). To do this we keep track of the
5567 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5568 // With single-level parallelism, threads will always be added to the tail
5569 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5570 // parallelism, all bets are off and we may need to scan through the entire
5571 // free list.
5572 //
5573 // This change also has a potentially large performance benefit, for some
5574 // applications. Previously, as threads were freed from the hot team, they
5575 // would be placed back on the free list in inverse order. If the hot team
5576 // grew back to it's original size, then the freed thread would be placed
5577 // back on the hot team in reverse order. This could cause bad cache
5578 // locality problems on programs where the size of the hot team regularly
5579 // grew and shrunk.
5580 //
5581 // Now, for single-level parallelism, the OMP tid is always == gtid.
5582 void __kmp_free_thread(kmp_info_t *this_th) {
5583 int gtid;
5584 kmp_info_t **scan;
5585
5586 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5587 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5588
5589 KMP_DEBUG_ASSERT(this_th);
5590
5591 // When moving thread to pool, switch thread to wait on own b_go flag, and
5592 // uninitialized (NULL team).
5593 int b;
5594 kmp_balign_t *balign = this_th->th.th_bar;
5595 for (b = 0; b < bs_last_barrier; ++b) {
5596 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5597 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5598 balign[b].bb.team = NULL;
5599 balign[b].bb.leaf_kids = 0;
5600 }
5601 this_th->th.th_task_state = 0;
5602 this_th->th.th_reap_state = KMP_SAFE_TO_REAP;
5603
5604 /* put thread back on the free pool */
5605 TCW_PTR(this_th->th.th_team, NULL);
5606 TCW_PTR(this_th->th.th_root, NULL);
5607 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5608
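// Unwind this thread's contention-group root list: drop our reference on each
// node, free nodes rooted in this thread itself, and stop at the first node
// rooted in another thread (freeing it only if we held its last reference).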
5609 while (this_th->th.th_cg_roots) {
5610 this_th->th.th_cg_roots->cg_nthreads--;
5611 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
5612 " %p of thread %p to %d\n",
5613 this_th, this_th->th.th_cg_roots,
5614 this_th->th.th_cg_roots->cg_root,
5615 this_th->th.th_cg_roots->cg_nthreads));
5616 kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
5617 if (tmp->cg_root == this_th) { // Thread is a cg_root
5618 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
5619 KA_TRACE(
5620 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
5621 this_th->th.th_cg_roots = tmp->up;
5622 __kmp_free(tmp);
5623 } else { // Worker thread
5624 if (tmp->cg_nthreads == 0) { // last thread leaves contention group
5625 __kmp_free(tmp);
5626 }
5627 this_th->th.th_cg_roots = NULL;
5628 break;
5629 }
5630 }
5631
5632 /* If the implicit task assigned to this thread can be used by other threads,
5633 * multiple threads can share the data and try to free the task at
5634 * __kmp_reap_thread at exit. This duplicate use of the task data can happen
5635 * with higher probability when the hot team is disabled, but can occur even
5636 * when the hot team is enabled */
5637 __kmp_free_implicit_task(this_th);
5638 this_th->th.th_current_task = NULL;
5639
5640 // If the __kmp_thread_pool_insert_pt is already past the new insert
5641 // point, then we need to re-scan the entire list.
5642 gtid = this_th->th.th_info.ds.ds_gtid;
5643 if (__kmp_thread_pool_insert_pt != NULL) {
5644 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5645 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5646 __kmp_thread_pool_insert_pt = NULL;
5647 }
5648 }
5649
5650 // Scan down the list to find the place to insert the thread.
5651 // scan is the address of a link in the list, possibly the address of
5652 // __kmp_thread_pool itself.
5653 //
5654 // In the absence of nested parallelism, the for loop will have 0 iterations.
5655 if (__kmp_thread_pool_insert_pt != NULL) {
5656 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
5657 } else {
5658 scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
5659 }
5660 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
5661 scan = &((*scan)->th.th_next_pool))
5662 ;
5663
5664 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5665 // to its address.
5666 TCW_PTR(this_th->th.th_next_pool, *scan);
5667 __kmp_thread_pool_insert_pt = *scan = this_th;
5668 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
5669 (this_th->th.th_info.ds.ds_gtid <
5670 this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
5671 TCW_4(this_th->th.th_in_pool, TRUE);
5672 __kmp_suspend_initialize_thread(this_th);
5673 __kmp_lock_suspend_mx(this_th);
5674 if (this_th->th.th_active == TRUE) {
5675 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth);
5676 this_th->th.th_active_in_pool = TRUE;
5677 }
5678 #if KMP_DEBUG
5679 else {
5680 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE);
5681 }
5682 #endif
5683 __kmp_unlock_suspend_mx(this_th);
5684
5685 TCW_4(__kmp_nth, __kmp_nth - 1);
5686
5687 #ifdef KMP_ADJUST_BLOCKTIME
5688 /* Adjust blocktime back to user setting or default if necessary */
5689 /* Middle initialization might never have occurred */
5690 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5691 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5692 if (__kmp_nth <= __kmp_avail_proc) {
5693 __kmp_zero_bt = FALSE;
5694 }
5695 }
5696 #endif /* KMP_ADJUST_BLOCKTIME */
5697
5698 KMP_MB();
5699 }
5700
5701 /* ------------------------------------------------------------------------ */
5702
5703 void *__kmp_launch_thread(kmp_info_t *this_thr) {
5704 int gtid = this_thr->th.th_info.ds.ds_gtid;
5705 /* void *stack_data;*/
5706 kmp_team_t **volatile pteam;
5707
5708 KMP_MB();
5709 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));
5710
5711 if (__kmp_env_consistency_check) {
5712 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
5713 }
5714
5715 #if OMPT_SUPPORT
5716 ompt_data_t *thread_data;
5717 if (ompt_enabled.enabled) {
5718 thread_data = &(this_thr->th.ompt_thread_info.thread_data);
5719 *thread_data = ompt_data_none;
5720
5721 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5722 this_thr->th.ompt_thread_info.wait_id = 0;
5723 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0);
5724 this_thr->th.ompt_thread_info.parallel_flags = 0;
5725 if (ompt_enabled.ompt_callback_thread_begin) {
5726 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
5727 ompt_thread_worker, thread_data);
5728 }
5729 this_thr->th.ompt_thread_info.state = ompt_state_idle;
5730 }
5731 #endif
5732
5733 /* This is the place where threads wait for work */
5734 while (!TCR_4(__kmp_global.g.g_done)) {
5735 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]);
5736 KMP_MB();
5737
5738 /* wait for work to do */
5739 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid));
5740
5741 /* No tid yet since not part of a team */
5742 __kmp_fork_barrier(gtid, KMP_GTID_DNE);
5743
5744 #if OMPT_SUPPORT
5745 if (ompt_enabled.enabled) {
5746 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5747 }
5748 #endif
5749
5750 pteam = &this_thr->th.th_team;
5751
5752 /* have we been allocated? */
5753 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) {
5754 /* we were just woken up, so run our new task */
5755 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) {
5756 int rc;
5757 KA_TRACE(20,
5758 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5759 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5760 (*pteam)->t.t_pkfn));
5761
5762 updateHWFPControl(*pteam);
5763
5764 #if OMPT_SUPPORT
5765 if (ompt_enabled.enabled) {
5766 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5767 }
5768 #endif
5769
5770 rc = (*pteam)->t.t_invoke(gtid);
5771 KMP_ASSERT(rc);
5772
5773 KMP_MB();
5774 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5775 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid),
5776 (*pteam)->t.t_pkfn));
5777 }
5778 #if OMPT_SUPPORT
5779 if (ompt_enabled.enabled) {
5780 /* no frame set while outside task */
5781 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none;
5782
5783 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5784 }
5785 #endif
5786 /* join barrier after parallel region */
5787 __kmp_join_barrier(gtid);
5788 }
5789 }
5790 TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5791
5792 #if OMPT_SUPPORT
5793 if (ompt_enabled.ompt_callback_thread_end) {
5794 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data);
5795 }
5796 #endif
5797
5798 this_thr->th.th_task_team = NULL;
5799 /* run the destructors for the threadprivate data for this thread */
5800 __kmp_common_destroy_gtid(gtid);
5801
5802 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid));
5803 KMP_MB();
5804 return this_thr;
5805 }
5806
5807 /* ------------------------------------------------------------------------ */
5808
5809 void __kmp_internal_end_dest(void *specific_gtid) {
5810 #if KMP_COMPILER_ICC
5811 #pragma warning(push)
5812 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose
5813 // significant bits
5814 #endif
5815 // Make sure no significant bits are lost
5816 int gtid = (kmp_intptr_t)specific_gtid - 1;
5817 #if KMP_COMPILER_ICC
5818 #pragma warning(pop)
5819 #endif
5820
5821 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid));
5822 /* NOTE: the gtid is stored as gtid+1 in the thread-local storage
5823 * this is because 0 is reserved for the nothing-stored case */
5824
5825 __kmp_internal_end_thread(gtid);
5826 }
5827
5828 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5829
5830 __attribute__((destructor)) void __kmp_internal_end_dtor(void) {
5831 __kmp_internal_end_atexit();
5832 }
5833
5834 #endif
5835
5836 /* [Windows] josh: when the atexit handler is called, there may still be more
5837 than one thread alive */
5838 void __kmp_internal_end_atexit(void) {
5839 KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
5840 /* [Windows]
5841 josh: ideally, we want to completely shut down the library in this atexit
5842 handler, but stat code that depends on thread specific data for gtid fails
5843 because that data becomes unavailable at some point during the shutdown, so
5844 we call __kmp_internal_end_thread instead. We should eventually remove the
5845 dependency on __kmp_get_specific_gtid in the stat code and use
5846 __kmp_internal_end_library to cleanly shut down the library.
5847
5848 // TODO: Can some of this comment about GVS be removed?
5849 I suspect that the offending stat code is executed when the calling thread
5850 tries to clean up a dead root thread's data structures, resulting in GVS
5851 code trying to close the GVS structures for that thread, but since the stat
5852 code uses __kmp_get_specific_gtid to get the gtid with the assumption that
5853 the calling thread is cleaning up itself instead of another thread, it gets
5854 confused. This happens because allowing a thread to unregister and clean up
5855 another thread is a recent modification for addressing an issue.
5856 Based on the current design (20050722), a thread may end up
5857 trying to unregister another thread only if thread death does not trigger
5858 the calling of __kmp_internal_end_thread. For Linux* OS, there is the
5859 thread specific data destructor function to detect thread death. For
5860 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
5861 is nothing. Thus, the workaround is applicable only for Windows static
5862 stat library. */
5863 __kmp_internal_end_library(-1);
5864 #if KMP_OS_WINDOWS
5865 __kmp_close_console();
5866 #endif
5867 }
5868
5869 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
5870 // It is assumed __kmp_forkjoin_lock is acquired.
5871
5872 int gtid;
5873
5874 KMP_DEBUG_ASSERT(thread != NULL);
5875
5876 gtid = thread->th.th_info.ds.ds_gtid;
5877
5878 if (!is_root) {
5879 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
5880 /* Assume the threads are at the fork barrier here */
5881 KA_TRACE(
5882 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
5883 gtid));
5884 /* Need release fence here to prevent seg faults for tree forkjoin barrier
5885 * (GEH) */
5886 ANNOTATE_HAPPENS_BEFORE(thread);
5887 kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
5888 thread);
5889 __kmp_release_64(&flag);
5890 }
5891
5892 // Terminate OS thread.
5893 __kmp_reap_worker(thread);
5894
5895 // The thread was killed asynchronously. If it was actively
5896 // spinning in the thread pool, decrement the global count.
5897 //
5898 // There is a small timing hole here - if the worker thread was just waking
5899 // up after sleeping in the pool, had reset its th_active_in_pool flag but
5900 // not decremented the global counter __kmp_thread_pool_active_nth yet, then
5901 // the global counter might not get updated.
5902 //
5903 // Currently, this can only happen as the library is unloaded,
5904 // so there are no harmful side effects.
5905 if (thread->th.th_active_in_pool) {
5906 thread->th.th_active_in_pool = FALSE;
5907 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
5908 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0);
5909 }
5910 }
5911
5912 __kmp_free_implicit_task(thread);
5913
5914 // Free the fast memory for tasking
5915 #if USE_FAST_MEMORY
5916 __kmp_free_fast_memory(thread);
5917 #endif /* USE_FAST_MEMORY */
5918
5919 __kmp_suspend_uninitialize_thread(thread);
5920
5921 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread);
5922 TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5923
5924 --__kmp_all_nth;
5925 // __kmp_nth was decremented when thread is added to the pool.
5926
5927 #ifdef KMP_ADJUST_BLOCKTIME
5928 /* Adjust blocktime back to user setting or default if necessary */
5929 /* Middle initialization might never have occurred */
5930 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
5931 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
5932 if (__kmp_nth <= __kmp_avail_proc) {
5933 __kmp_zero_bt = FALSE;
5934 }
5935 }
5936 #endif /* KMP_ADJUST_BLOCKTIME */
5937
5938 /* free the memory being used */
5939 if (__kmp_env_consistency_check) {
5940 if (thread->th.th_cons) {
5941 __kmp_free_cons_stack(thread->th.th_cons);
5942 thread->th.th_cons = NULL;
5943 }
5944 }
5945
5946 if (thread->th.th_pri_common != NULL) {
5947 __kmp_free(thread->th.th_pri_common);
5948 thread->th.th_pri_common = NULL;
5949 }
5950
5951 if (thread->th.th_task_state_memo_stack != NULL) {
5952 __kmp_free(thread->th.th_task_state_memo_stack);
5953 thread->th.th_task_state_memo_stack = NULL;
5954 }
5955
5956 #if KMP_USE_BGET
5957 if (thread->th.th_local.bget_data != NULL) {
5958 __kmp_finalize_bget(thread);
5959 }
5960 #endif
5961
5962 #if KMP_AFFINITY_SUPPORTED
5963 if (thread->th.th_affin_mask != NULL) {
5964 KMP_CPU_FREE(thread->th.th_affin_mask);
5965 thread->th.th_affin_mask = NULL;
5966 }
5967 #endif /* KMP_AFFINITY_SUPPORTED */
5968
5969 #if KMP_USE_HIER_SCHED
5970 if (thread->th.th_hier_bar_data != NULL) {
5971 __kmp_free(thread->th.th_hier_bar_data);
5972 thread->th.th_hier_bar_data = NULL;
5973 }
5974 #endif
5975
5976 __kmp_reap_team(thread->th.th_serial_team);
5977 thread->th.th_serial_team = NULL;
5978 __kmp_free(thread);
5979
5980 KMP_MB();
5981
5982 } // __kmp_reap_thread
5983
5984 static void __kmp_internal_end(void) {
5985 int i;
5986
5987 /* First, unregister the library */
5988 __kmp_unregister_library();
5989
5990 #if KMP_OS_WINDOWS
5991 /* In Win static library, we can't tell when a root actually dies, so we
5992 reclaim the data structures for any root threads that have died but not
5993 unregistered themselves, in order to shut down cleanly.
5994 In Win dynamic library we also can't tell when a thread dies. */
5995 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of
5996 // dead roots
5997 #endif
5998
5999 for (i = 0; i < __kmp_threads_capacity; i++)
6000 if (__kmp_root[i])
6001 if (__kmp_root[i]->r.r_active)
6002 break;
6003 KMP_MB(); /* Flush all pending memory write invalidates. */
6004 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6005
6006 if (i < __kmp_threads_capacity) {
6007 #if KMP_USE_MONITOR
6008 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
6009 KMP_MB(); /* Flush all pending memory write invalidates. */
6010
6011 // Need to check that monitor was initialized before reaping it. If we are
6012 // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then
6013 // __kmp_monitor will appear to contain valid data, but it is only valid in
6014 // the parent process, not the child.
6015 // New behavior (201008): instead of keying off of the flag
6016 // __kmp_init_parallel, the monitor thread creation is keyed off
6017 // of the new flag __kmp_init_monitor.
6018 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6019 if (TCR_4(__kmp_init_monitor)) {
6020 __kmp_reap_monitor(&__kmp_monitor);
6021 TCW_4(__kmp_init_monitor, 0);
6022 }
6023 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6024 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6025 #endif // KMP_USE_MONITOR
6026 } else {
6027 /* TODO move this to cleanup code */
6028 #ifdef KMP_DEBUG
6029 /* make sure that everything has properly ended */
6030 for (i = 0; i < __kmp_threads_capacity; i++) {
6031 if (__kmp_root[i]) {
6032 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
6033 // there can be uber threads alive here
6034 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
6035 }
6036 }
6037 #endif
6038
6039 KMP_MB();
6040
6041 // Reap the worker threads.
6042 // This is valid for now, but be careful if threads are reaped sooner.
6043 while (__kmp_thread_pool != NULL) { // Loop through all threads in the pool.
6044 // Get the next thread from the pool.
6045 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
6046 __kmp_thread_pool = thread->th.th_next_pool;
6047 // Reap it.
6048 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
6049 thread->th.th_next_pool = NULL;
6050 thread->th.th_in_pool = FALSE;
6051 __kmp_reap_thread(thread, 0);
6052 }
6053 __kmp_thread_pool_insert_pt = NULL;
6054
6055 // Reap teams.
6056 while (__kmp_team_pool != NULL) { // Loop through all teams in the pool.
6057 // Get the next team from the pool.
6058 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
6059 __kmp_team_pool = team->t.t_next_pool;
6060 // Reap it.
6061 team->t.t_next_pool = NULL;
6062 __kmp_reap_team(team);
6063 }
6064
6065 __kmp_reap_task_teams();
6066
6067 #if KMP_OS_UNIX
6068 // Threads that are not reaped should not access any resources since they
6069 // are going to be deallocated soon, so the shutdown sequence should wait
6070 // until all threads either exit the final spin-waiting loop or begin
6071 // sleeping after the given blocktime.
6072 for (i = 0; i < __kmp_threads_capacity; i++) {
6073 kmp_info_t *thr = __kmp_threads[i];
6074 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
6075 KMP_CPU_PAUSE();
6076 }
6077 #endif
6078
6079 for (i = 0; i < __kmp_threads_capacity; ++i) {
6080 // TBD: Add some checking...
6081 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
6082 }
6083
6084 /* Make sure all threadprivate destructors get run by joining with all
6085 worker threads before resetting this flag */
6086 TCW_SYNC_4(__kmp_init_common, FALSE);
6087
6088 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
6089 KMP_MB();
6090
6091 #if KMP_USE_MONITOR
6092 // See note above: One of the possible fixes for CQ138434 / CQ140126
6093 //
6094 // FIXME: push both code fragments down and CSE them?
6095 // push them into __kmp_cleanup() ?
6096 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
6097 if (TCR_4(__kmp_init_monitor)) {
6098 __kmp_reap_monitor(&__kmp_monitor);
6099 TCW_4(__kmp_init_monitor, 0);
6100 }
6101 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
6102 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
6103 #endif
6104 } /* else !__kmp_global.t_active */
6105 TCW_4(__kmp_init_gtid, FALSE);
6106 KMP_MB(); /* Flush all pending memory write invalidates. */
6107
6108 __kmp_cleanup();
6109 #if OMPT_SUPPORT
6110 ompt_fini();
6111 #endif
6112 }
6113
6114 void __kmp_internal_end_library(int gtid_req) {
6115 /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6116 /* this shouldn't be a race condition because __kmp_internal_end() is the
6117 only place to clear __kmp_serial_init */
6118 /* we'll check this later too, after we get the lock */
6119 // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6120 // redundant, because the next check will work in any case.
6121 if (__kmp_global.g.g_abort) {
6122 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n"));
6123 /* TODO abort? */
6124 return;
6125 }
6126 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6127 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n"));
6128 return;
6129 }
6130
6131 KMP_MB(); /* Flush all pending memory write invalidates. */
6132 /* find out who we are and what we should do */
6133 {
6134 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6135 KA_TRACE(
6136 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req));
6137 if (gtid == KMP_GTID_SHUTDOWN) {
6138 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system "
6139 "already shutdown\n"));
6140 return;
6141 } else if (gtid == KMP_GTID_MONITOR) {
6142 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not "
6143 "registered, or system shutdown\n"));
6144 return;
6145 } else if (gtid == KMP_GTID_DNE) {
6146 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system "
6147 "shutdown\n"));
6148 /* we don't know who we are, but we may still shutdown the library */
6149 } else if (KMP_UBER_GTID(gtid)) {
6150 /* unregister ourselves as an uber thread. gtid is no longer valid */
6151 if (__kmp_root[gtid]->r.r_active) {
6152 __kmp_global.g.g_abort = -1;
6153 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6154 __kmp_unregister_library();
6155 KA_TRACE(10,
6156 ("__kmp_internal_end_library: root still active, abort T#%d\n",
6157 gtid));
6158 return;
6159 } else {
6160 KA_TRACE(
6161 10,
6162 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid));
6163 __kmp_unregister_root_current_thread(gtid);
6164 }
6165 } else {
6166 /* worker threads may call this function through the atexit handler, if they
6167 * call exit() */
6168 /* For now, skip the usual subsequent processing and just dump the debug buffer.
6169 TODO: do a thorough shutdown instead */
6170 #ifdef DUMP_DEBUG_ON_EXIT
6171 if (__kmp_debug_buf)
6172 __kmp_dump_debug_buffer();
6173 #endif
6174 // The unregister-library call was added here when we switched to shared
6175 // memory on Linux; without it, lots of files would be left in /dev/shm.
6176 // Clean up the shared memory file before exiting.
6177 __kmp_unregister_library();
6178 return;
6179 }
6180 }
6181 /* synchronize the termination process */
6182 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6183
6184 /* have we already finished */
6185 if (__kmp_global.g.g_abort) {
6186 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n"));
6187 /* TODO abort? */
6188 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6189 return;
6190 }
6191 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6192 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6193 return;
6194 }
6195
6196   /* We need this lock to enforce mutual exclusion between this read of
6197      __kmp_threads_capacity and the write by __kmp_register_root.
6198 Alternatively, we can use a counter of roots that is atomically updated by
6199 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6200 __kmp_internal_end_*. */
6201 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6202
6203 /* now we can safely conduct the actual termination */
6204 __kmp_internal_end();
6205
6206 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6207 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6208
6209 KA_TRACE(10, ("__kmp_internal_end_library: exit\n"));
6210
6211 #ifdef DUMP_DEBUG_ON_EXIT
6212 if (__kmp_debug_buf)
6213 __kmp_dump_debug_buffer();
6214 #endif
6215
6216 #if KMP_OS_WINDOWS
6217 __kmp_close_console();
6218 #endif
6219
6220 __kmp_fini_allocator();
6221
6222 } // __kmp_internal_end_library
6223
6224 void __kmp_internal_end_thread(int gtid_req) {
6225 int i;
6226
6227 /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6228 /* this shouldn't be a race condition because __kmp_internal_end() is the
6229 * only place to clear __kmp_serial_init */
6230 /* we'll check this later too, after we get the lock */
6231 // 2009-09-06: We do not set g_abort without setting g_done. This check looks
6232 // redundant, because the next check will work in any case.
6233 if (__kmp_global.g.g_abort) {
6234 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n"));
6235 /* TODO abort? */
6236 return;
6237 }
6238 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6239 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n"));
6240 return;
6241 }
6242
6243 KMP_MB(); /* Flush all pending memory write invalidates. */
6244
6245 /* find out who we are and what we should do */
6246 {
6247 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific();
6248 KA_TRACE(10,
6249 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req));
6250 if (gtid == KMP_GTID_SHUTDOWN) {
6251 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system "
6252 "already shutdown\n"));
6253 return;
6254 } else if (gtid == KMP_GTID_MONITOR) {
6255 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not "
6256 "registered, or system shutdown\n"));
6257 return;
6258 } else if (gtid == KMP_GTID_DNE) {
6259 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system "
6260 "shutdown\n"));
6261 return;
6262 /* we don't know who we are */
6263 } else if (KMP_UBER_GTID(gtid)) {
6264 /* unregister ourselves as an uber thread. gtid is no longer valid */
6265 if (__kmp_root[gtid]->r.r_active) {
6266 __kmp_global.g.g_abort = -1;
6267 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6268 KA_TRACE(10,
6269 ("__kmp_internal_end_thread: root still active, abort T#%d\n",
6270 gtid));
6271 return;
6272 } else {
6273 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n",
6274 gtid));
6275 __kmp_unregister_root_current_thread(gtid);
6276 }
6277 } else {
6278 /* just a worker thread, let's leave */
6279 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid));
6280
6281 if (gtid >= 0) {
6282 __kmp_threads[gtid]->th.th_task_team = NULL;
6283 }
6284
6285 KA_TRACE(10,
6286 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n",
6287 gtid));
6288 return;
6289 }
6290 }
6291 #if KMP_DYNAMIC_LIB
6292 if (__kmp_pause_status != kmp_hard_paused)
6293   // AC: let's not shut down the dynamic library at the exit of an uber thread;
6294   // it is better to shut down later, in the library destructor.
6295 {
6296 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req));
6297 return;
6298 }
6299 #endif
6300 /* synchronize the termination process */
6301 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6302
6303 /* have we already finished */
6304 if (__kmp_global.g.g_abort) {
6305 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n"));
6306 /* TODO abort? */
6307 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6308 return;
6309 }
6310 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) {
6311 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6312 return;
6313 }
6314
6315   /* We need this lock to enforce mutual exclusion between this read of
6316      __kmp_threads_capacity and the write by __kmp_register_root.
6317 Alternatively, we can use a counter of roots that is atomically updated by
6318 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and
6319 __kmp_internal_end_*. */
6320
6321 /* should we finish the run-time? are all siblings done? */
6322 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
6323
6324 for (i = 0; i < __kmp_threads_capacity; ++i) {
6325 if (KMP_UBER_GTID(i)) {
6326 KA_TRACE(
6327 10,
6328 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i));
6329 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6330 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6331 return;
6332 }
6333 }
6334
6335 /* now we can safely conduct the actual termination */
6336
6337 __kmp_internal_end();
6338
6339 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
6340 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6341
6342 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req));
6343
6344 #ifdef DUMP_DEBUG_ON_EXIT
6345 if (__kmp_debug_buf)
6346 __kmp_dump_debug_buffer();
6347 #endif
6348 } // __kmp_internal_end_thread
6349
6350 // -----------------------------------------------------------------------------
6351 // Library registration stuff.
6352
6353 static long __kmp_registration_flag = 0;
6354 // Random value used to indicate library initialization.
6355 static char *__kmp_registration_str = NULL;
6356 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6357
6358 static inline char *__kmp_reg_status_name() {
6359 /* On RHEL 3u5 if linked statically, getpid() returns different values in
6360 each thread. If registration and unregistration go in different threads
6361      (omp_misc_other_root_exit.cpp test case), the name of the registered_lib_env
6362      env var cannot be found, because the name will contain a different pid. */
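// For illustration (hypothetical values): with shared memory the name looks
// like "__KMP_REGISTERED_LIB_12345_1000" (pid and uid), otherwise
// "__KMP_REGISTERED_LIB_12345" (pid only).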
6363 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6364 return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6365 (int)getuid());
6366 #else
6367 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6368 #endif
6369 } // __kmp_reg_status_name
6370
6371 void __kmp_register_library_startup(void) {
6372
6373 char *name = __kmp_reg_status_name(); // Name of the environment variable.
6374 int done = 0;
6375 union {
6376 double dtime;
6377 long ltime;
6378 } time;
6379 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6380 __kmp_initialize_system_tick();
6381 #endif
6382 __kmp_read_system_time(&time.dtime);
6383 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6384 __kmp_registration_str =
6385 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6386 __kmp_registration_flag, KMP_LIBRARY_FILE);
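// For illustration (hypothetical values): the registration string has the
// form "<flag address>-<flag value in hex>-<library file>", e.g.
// "0x7f12e4c01234-cafe0042-libomp.so". If another copy of the runtime is
// already registered, the value read back is parsed with the two '-' splits
// below to check whether that copy is still alive.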
6387
6388 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6389 __kmp_registration_str));
6390
6391 while (!done) {
6392
6393 char *value = NULL; // Actual value of the environment variable.
6394
6395 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6396 char *shm_name = __kmp_str_format("/%s", name);
6397 int shm_preexist = 0;
6398 char *data1;
6399 int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6400 if ((fd1 == -1) && (errno == EEXIST)) {
6401 // file didn't open because it already exists.
6402 // try opening existing file
6403 fd1 = shm_open(shm_name, O_RDWR, 0666);
6404 if (fd1 == -1) { // file didn't open
6405 // error out here
6406 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6407 __kmp_msg_null);
6408 } else {
6409 // able to open existing file
6410 shm_preexist = 1;
6411 }
6412     } else if (fd1 == -1) { // SHM didn't open due to an error other than
6413       // "already exists".
6414       // error out here.
6415 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6416 __kmp_msg_null);
6417 }
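// At this point fd1 is valid: either we created the segment ourselves
// (shm_preexist == 0) or we opened one left by another copy of the runtime
// (shm_preexist == 1). Only the creator truncates and writes it below.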
6418 if (shm_preexist == 0) {
6419       // we created the SHM; now set its size
6420 if (ftruncate(fd1, SHM_SIZE) == -1) {
6421         // error occurred setting the size
6422 __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6423 KMP_ERR(errno), __kmp_msg_null);
6424 }
6425 }
6426 data1 =
6427 (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6428 if (data1 == MAP_FAILED) {
6429 // failed to map shared memory
6430 __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6431 __kmp_msg_null);
6432 }
6433 if (shm_preexist == 0) { // set data to SHM, set value
6434 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6435 }
6436     // Read the value back, from either what we just wrote or the preexisting file.
6437 value = __kmp_str_format("%s", data1); // read value from SHM
6438 munmap(data1, SHM_SIZE);
6439 close(fd1);
6440 #else // Windows and unix with static library
6441     // Set the environment variable, but do not overwrite it if it already exists.
6442 __kmp_env_set(name, __kmp_registration_str, 0);
6443 // read value to see if it got set
6444 value = __kmp_env_get(name);
6445 #endif
6446
6447 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6448 done = 1; // Ok, environment variable set successfully, exit the loop.
6449 } else {
6450       // Oops. Write failed. Another copy of the OpenMP RTL is in memory.
6451       // Check whether it is alive or dead.
6452 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6453 char *tail = value;
6454 char *flag_addr_str = NULL;
6455 char *flag_val_str = NULL;
6456 char const *file_name = NULL;
6457 __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6458 __kmp_str_split(tail, '-', &flag_val_str, &tail);
6459 file_name = tail;
6460 if (tail != NULL) {
6461 long *flag_addr = 0;
6462 long flag_val = 0;
6463 KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr));
6464 KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6465 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6466 // First, check whether environment-encoded address is mapped into
6467 // addr space.
6468 // If so, dereference it to see if it still has the right value.
6469 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6470 neighbor = 1;
6471 } else {
6472 // If not, then we know the other copy of the library is no longer
6473 // running.
6474 neighbor = 2;
6475 }
6476 }
6477 }
6478 switch (neighbor) {
6479 case 0: // Cannot parse environment variable -- neighbor status unknown.
6480         // Assume it is the incompatible format of a future version of the
6481         // library, and assume the other library is alive.
6482 // WARN( ... ); // TODO: Issue a warning.
6483 file_name = "unknown library";
6484 KMP_FALLTHROUGH();
6485       // Attention! Falling through to the next case. That's intentional.
6486 case 1: { // Neighbor is alive.
6487 // Check it is allowed.
6488 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6489 if (!__kmp_str_match_true(duplicate_ok)) {
6490 // That's not allowed. Issue fatal error.
6491 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6492 KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6493 }
6494 KMP_INTERNAL_FREE(duplicate_ok);
6495 __kmp_duplicate_library_ok = 1;
6496 done = 1; // Exit the loop.
6497 } break;
6498 case 2: { // Neighbor is dead.
6499
6500 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6501 // close shared memory.
6502 shm_unlink(shm_name); // this removes file in /dev/shm
6503 #else
6504 // Clear the variable and try to register library again.
6505 __kmp_env_unset(name);
6506 #endif
6507 } break;
6508 default: { KMP_DEBUG_ASSERT(0); } break;
6509 }
6510 }
6511 KMP_INTERNAL_FREE((void *)value);
6512 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6513 KMP_INTERNAL_FREE((void *)shm_name);
6514 #endif
6515 } // while
6516 KMP_INTERNAL_FREE((void *)name);
6517
6518 } // func __kmp_register_library_startup
6519
6520 void __kmp_unregister_library(void) {
6521
6522 char *name = __kmp_reg_status_name();
6523 char *value = NULL;
6524
6525 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6526 char *shm_name = __kmp_str_format("/%s", name);
6527 int fd1 = shm_open(shm_name, O_RDONLY, 0666);
6528 if (fd1 == -1) {
6529 // file did not open. return.
6530 return;
6531 }
6532 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0);
6533 if (data1 != MAP_FAILED) {
6534 value = __kmp_str_format("%s", data1); // read value from SHM
6535 munmap(data1, SHM_SIZE);
6536 }
6537 close(fd1);
6538 #else
6539 value = __kmp_env_get(name);
6540 #endif
6541
6542 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0);
6543 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL);
6544 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6545 // Ok, this is our variable. Delete it.
6546 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6547 shm_unlink(shm_name); // this removes file in /dev/shm
6548 #else
6549 __kmp_env_unset(name);
6550 #endif
6551 }
6552
6553 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6554 KMP_INTERNAL_FREE(shm_name);
6555 #endif
6556
6557 KMP_INTERNAL_FREE(__kmp_registration_str);
6558 KMP_INTERNAL_FREE(value);
6559 KMP_INTERNAL_FREE(name);
6560
6561 __kmp_registration_flag = 0;
6562 __kmp_registration_str = NULL;
6563
6564 } // __kmp_unregister_library
6565
6566 // End of Library registration stuff.
6567 // -----------------------------------------------------------------------------
6568
6569 #if KMP_MIC_SUPPORTED
6570
6571 static void __kmp_check_mic_type() {
6572 kmp_cpuid_t cpuid_state = {0};
6573 kmp_cpuid_t *cs_p = &cpuid_state;
6574 __kmp_x86_cpuid(1, 0, cs_p);
6575 // We don't support mic1 at the moment
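// Annotation (an assumption about the encoding, not from the original code):
// leaf-1 EAX packs stepping/model/family, with the extended model in bits
// 16-19; the masks below select those fields to tell KNC (mic2) apart from
// KNL (mic3).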
6576 if ((cs_p->eax & 0xff0) == 0xB10) {
6577 __kmp_mic_type = mic2;
6578 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) {
6579 __kmp_mic_type = mic3;
6580 } else {
6581 __kmp_mic_type = non_mic;
6582 }
6583 }
6584
6585 #endif /* KMP_MIC_SUPPORTED */
6586
6587 #if KMP_HAVE_UMWAIT
6588 static void __kmp_user_level_mwait_init() {
6589 struct kmp_cpuid buf;
6590 __kmp_x86_cpuid(7, 0, &buf);
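// CPUID(7,0).ECX bit 5 is the WAITPKG feature flag (umonitor/umwait/tpause),
// so user-level mwait is enabled only when the hardware reports it and the
// user requested it.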
6591 __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait;
6592 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n",
6593 __kmp_umwait_enabled));
6594 }
6595 #elif KMP_HAVE_MWAIT
6596 #ifndef AT_INTELPHIUSERMWAIT
6597 // Spurious, non-existent value that should always fail to return anything.
6598 // Will be replaced with the correct value once it is known.
6599 #define AT_INTELPHIUSERMWAIT 10000
6600 #endif
6601 // The getauxval() function is available in RHEL7 and SLES12. If the RTL is
6602 // built on a system with an earlier OS, we'll use the following internal
6603 // fallback when the entry is not found.
6604 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
6605 unsigned long getauxval(unsigned long) { return 0; }
6606
6607 static void __kmp_user_level_mwait_init() {
6608   // When getauxval() and the correct value of AT_INTELPHIUSERMWAIT are
6609   // available, use them to determine whether user-level mwait is enabled.
6610   // Otherwise, forcibly set __kmp_mwait_enabled=TRUE on Intel MIC if the
6611   // environment variable KMP_USER_LEVEL_MWAIT was set to TRUE.
6612 if (__kmp_mic_type == mic3) {
6613 unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
6614 if ((res & 0x1) || __kmp_user_level_mwait) {
6615 __kmp_mwait_enabled = TRUE;
6616 if (__kmp_user_level_mwait) {
6617 KMP_INFORM(EnvMwaitWarn);
6618 }
6619 } else {
6620 __kmp_mwait_enabled = FALSE;
6621 }
6622 }
6623 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
6624 "__kmp_mwait_enabled = %d\n",
6625 __kmp_mic_type, __kmp_mwait_enabled));
6626 }
6627 #endif /* KMP_HAVE_UMWAIT */
6628
6629 static void __kmp_do_serial_initialize(void) {
6630 int i, gtid;
6631 int size;
6632
6633 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));
6634
6635 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
6636 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
6637 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
6638 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
6639 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));
6640
6641 #if OMPT_SUPPORT
6642 ompt_pre_init();
6643 #endif
6644
6645 __kmp_validate_locks();
6646
6647 /* Initialize internal memory allocator */
6648 __kmp_init_allocator();
6649
6650 /* Register the library startup via an environment variable and check to see
6651 whether another copy of the library is already registered. */
6652
6653 __kmp_register_library_startup();
6654
6655 /* TODO reinitialization of library */
6656 if (TCR_4(__kmp_global.g.g_done)) {
6657 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
6658 }
6659
6660 __kmp_global.g.g_abort = 0;
6661 TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6662
6663 /* initialize the locks */
6664 #if KMP_USE_ADAPTIVE_LOCKS
6665 #if KMP_DEBUG_ADAPTIVE_LOCKS
6666 __kmp_init_speculative_stats();
6667 #endif
6668 #endif
6669 #if KMP_STATS_ENABLED
6670 __kmp_stats_init();
6671 #endif
6672 __kmp_init_lock(&__kmp_global_lock);
6673 __kmp_init_queuing_lock(&__kmp_dispatch_lock);
6674 __kmp_init_lock(&__kmp_debug_lock);
6675 __kmp_init_atomic_lock(&__kmp_atomic_lock);
6676 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
6677 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
6678 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
6679 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
6680 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
6681 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
6682 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
6683 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
6684 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
6685 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
6686 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
6687 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
6688 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
6689 __kmp_init_bootstrap_lock(&__kmp_exit_lock);
6690 #if KMP_USE_MONITOR
6691 __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
6692 #endif
6693 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);
6694
6695 /* conduct initialization and initial setup of configuration */
6696
6697 __kmp_runtime_initialize();
6698
6699 #if KMP_MIC_SUPPORTED
6700 __kmp_check_mic_type();
6701 #endif
6702
6703 // Some global variable initialization moved here from kmp_env_initialize()
6704 #ifdef KMP_DEBUG
6705 kmp_diag = 0;
6706 #endif
6707 __kmp_abort_delay = 0;
6708
6709 // From __kmp_init_dflt_team_nth()
6710 /* assume the entire machine will be used */
6711 __kmp_dflt_team_nth_ub = __kmp_xproc;
6712 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
6713 __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6714 }
6715 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
6716 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6717 }
6718 __kmp_max_nth = __kmp_sys_max_nth;
6719 __kmp_cg_max_nth = __kmp_sys_max_nth;
6720 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
6721 if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
6722 __kmp_teams_max_nth = __kmp_sys_max_nth;
6723 }
6724
6725 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
6726 // part
6727 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6728 #if KMP_USE_MONITOR
6729 __kmp_monitor_wakeups =
6730 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6731 __kmp_bt_intervals =
6732 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
6733 #endif
6734 // From "KMP_LIBRARY" part of __kmp_env_initialize()
6735 __kmp_library = library_throughput;
6736 // From KMP_SCHEDULE initialization
6737 __kmp_static = kmp_sch_static_balanced;
6738 // AC: do not use analytical here, because it is non-monotonous
6739 //__kmp_guided = kmp_sch_guided_iterative_chunked;
6740 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
6741 // need to repeat assignment
6742 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
6743 // bit control and barrier method control parts
6744 #if KMP_FAST_REDUCTION_BARRIER
6745 #define kmp_reduction_barrier_gather_bb ((int)1)
6746 #define kmp_reduction_barrier_release_bb ((int)1)
6747 #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6748 #define kmp_reduction_barrier_release_pat bp_hyper_bar
6749 #endif // KMP_FAST_REDUCTION_BARRIER
6750 for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
6751 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
6752 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
6753 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
6754 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
6755 #if KMP_FAST_REDUCTION_BARRIER
6756 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
6757 // lin_64 ): hyper,1
6758 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
6759 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
6760 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
6761 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
6762 }
6763 #endif // KMP_FAST_REDUCTION_BARRIER
6764 }
6765 #if KMP_FAST_REDUCTION_BARRIER
6766 #undef kmp_reduction_barrier_release_pat
6767 #undef kmp_reduction_barrier_gather_pat
6768 #undef kmp_reduction_barrier_release_bb
6769 #undef kmp_reduction_barrier_gather_bb
6770 #endif // KMP_FAST_REDUCTION_BARRIER
6771 #if KMP_MIC_SUPPORTED
6772 if (__kmp_mic_type == mic2) { // KNC
6773 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6774 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
6775 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
6776 1; // forkjoin release
6777 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6778 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
6779 }
6780 #if KMP_FAST_REDUCTION_BARRIER
6781 if (__kmp_mic_type == mic2) { // KNC
6782 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6783 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
6784 }
6785 #endif // KMP_FAST_REDUCTION_BARRIER
6786 #endif // KMP_MIC_SUPPORTED
6787
6788 // From KMP_CHECKS initialization
6789 #ifdef KMP_DEBUG
6790 __kmp_env_checks = TRUE; /* development versions have the extra checks */
6791 #else
6792 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
6793 #endif
6794
6795 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6796 __kmp_foreign_tp = TRUE;
6797
6798 __kmp_global.g.g_dynamic = FALSE;
6799 __kmp_global.g.g_dynamic_mode = dynamic_default;
6800
6801 __kmp_env_initialize(NULL);
6802
6803 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
6804 __kmp_user_level_mwait_init();
6805 #endif
6806 // Print all messages in message catalog for testing purposes.
6807 #ifdef KMP_DEBUG
6808 char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
6809 if (__kmp_str_match_true(val)) {
6810 kmp_str_buf_t buffer;
6811 __kmp_str_buf_init(&buffer);
6812 __kmp_i18n_dump_catalog(&buffer);
6813 __kmp_printf("%s", buffer.str);
6814 __kmp_str_buf_free(&buffer);
6815 }
6816 __kmp_env_free(&val);
6817 #endif
6818
6819 __kmp_threads_capacity =
6820 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
6821 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6822 __kmp_tp_capacity = __kmp_default_tp_capacity(
6823 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6824
6825 // If the library is shut down properly, both pools must be NULL. Just in
6826 // case, set them to NULL -- some memory may leak, but subsequent code will
6827 // work even if pools are not freed.
6828 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
6829 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
6830 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
6831 __kmp_thread_pool = NULL;
6832 __kmp_thread_pool_insert_pt = NULL;
6833 __kmp_team_pool = NULL;
6834
6835 /* Allocate all of the variable sized records */
6836 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
6837 * expandable */
6838 /* Since allocation is cache-aligned, just add extra padding at the end */
6839 size =
6840 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
6841 CACHE_LINE;
6842 __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
6843 __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
6844 sizeof(kmp_info_t *) * __kmp_threads_capacity);
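// Note: __kmp_threads and __kmp_root share one cache-aligned allocation;
// __kmp_root simply points just past the __kmp_threads array, which is why
// __kmp_cleanup() frees only __kmp_threads.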
6845
6846 /* init thread counts */
6847 KMP_DEBUG_ASSERT(__kmp_all_nth ==
6848 0); // Asserts fail if the library is reinitializing and
6849 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
6850 __kmp_all_nth = 0;
6851 __kmp_nth = 0;
6852
6853 /* setup the uber master thread and hierarchy */
6854 gtid = __kmp_register_root(TRUE);
6855 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid));
6856 KMP_ASSERT(KMP_UBER_GTID(gtid));
6857 KMP_ASSERT(KMP_INITIAL_GTID(gtid));
6858
6859 KMP_MB(); /* Flush all pending memory write invalidates. */
6860
6861 __kmp_common_initialize();
6862
6863 #if KMP_OS_UNIX
6864 /* invoke the child fork handler */
6865 __kmp_register_atfork();
6866 #endif
6867
6868 #if !KMP_DYNAMIC_LIB
6869 {
6870 /* Invoke the exit handler when the program finishes, only for static
6871 library. For dynamic library, we already have _fini and DllMain. */
6872 int rc = atexit(__kmp_internal_end_atexit);
6873 if (rc != 0) {
6874 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
6875 __kmp_msg_null);
6876 }
6877 }
6878 #endif
6879
6880 #if KMP_HANDLE_SIGNALS
6881 #if KMP_OS_UNIX
6882 /* NOTE: make sure that this is called before the user installs their own
6883 signal handlers so that the user handlers are called first. this way they
6884 can return false, not call our handler, avoid terminating the library, and
6885 continue execution where they left off. */
6886 __kmp_install_signals(FALSE);
6887 #endif /* KMP_OS_UNIX */
6888 #if KMP_OS_WINDOWS
6889 __kmp_install_signals(TRUE);
6890 #endif /* KMP_OS_WINDOWS */
6891 #endif
6892
6893 /* we have finished the serial initialization */
6894 __kmp_init_counter++;
6895
6896 __kmp_init_serial = TRUE;
6897
6898 if (__kmp_settings) {
6899 __kmp_env_print();
6900 }
6901
6902 if (__kmp_display_env || __kmp_display_env_verbose) {
6903 __kmp_env_print_2();
6904 }
6905
6906 #if OMPT_SUPPORT
6907 ompt_post_init();
6908 #endif
6909
6910 KMP_MB();
6911
6912 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
6913 }
6914
6915 void __kmp_serial_initialize(void) {
6916 if (__kmp_init_serial) {
6917 return;
6918 }
6919 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
6920 if (__kmp_init_serial) {
6921 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6922 return;
6923 }
6924 __kmp_do_serial_initialize();
6925 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
6926 }
6927
6928 static void __kmp_do_middle_initialize(void) {
6929 int i, j;
6930 int prev_dflt_team_nth;
6931
6932 if (!__kmp_init_serial) {
6933 __kmp_do_serial_initialize();
6934 }
6935
6936 KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));
6937
6938 // Save the previous value for the __kmp_dflt_team_nth so that
6939 // we can avoid some reinitialization if it hasn't changed.
6940 prev_dflt_team_nth = __kmp_dflt_team_nth;
6941
6942 #if KMP_AFFINITY_SUPPORTED
6943 // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6944 // number of cores on the machine.
6945 __kmp_affinity_initialize();
6946
6947 // Run through the __kmp_threads array and set the affinity mask
6948 // for each root thread that is currently registered with the RTL.
6949 for (i = 0; i < __kmp_threads_capacity; i++) {
6950 if (TCR_PTR(__kmp_threads[i]) != NULL) {
6951 __kmp_affinity_set_init_mask(i, TRUE);
6952 }
6953 }
6954 #endif /* KMP_AFFINITY_SUPPORTED */
6955
6956 KMP_ASSERT(__kmp_xproc > 0);
6957 if (__kmp_avail_proc == 0) {
6958 __kmp_avail_proc = __kmp_xproc;
6959 }
6960
6961 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
6962 // correct them now
6963 j = 0;
6964 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
6965 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
6966 __kmp_avail_proc;
6967 j++;
6968 }
6969
6970 if (__kmp_dflt_team_nth == 0) {
6971 #ifdef KMP_DFLT_NTH_CORES
6972 // Default #threads = #cores
6973 __kmp_dflt_team_nth = __kmp_ncores;
6974 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6975 "__kmp_ncores (%d)\n",
6976 __kmp_dflt_team_nth));
6977 #else
6978 // Default #threads = #available OS procs
6979 __kmp_dflt_team_nth = __kmp_avail_proc;
6980 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
6981 "__kmp_avail_proc(%d)\n",
6982 __kmp_dflt_team_nth));
6983 #endif /* KMP_DFLT_NTH_CORES */
6984 }
6985
6986 if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
6987 __kmp_dflt_team_nth = KMP_MIN_NTH;
6988 }
6989 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
6990 __kmp_dflt_team_nth = __kmp_sys_max_nth;
6991 }
6992
6993 // There's no harm in continuing if the following check fails,
6994 // but it indicates an error in the previous logic.
6995 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);
6996
6997 if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
6998 // Run through the __kmp_threads array and set the num threads icv for each
6999 // root thread that is currently registered with the RTL (which has not
7000 // already explicitly set its nthreads-var with a call to
7001 // omp_set_num_threads()).
7002 for (i = 0; i < __kmp_threads_capacity; i++) {
7003 kmp_info_t *thread = __kmp_threads[i];
7004 if (thread == NULL)
7005 continue;
7006 if (thread->th.th_current_task->td_icvs.nproc != 0)
7007 continue;
7008
7009 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
7010 }
7011 }
7012 KA_TRACE(
7013 20,
7014 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
7015 __kmp_dflt_team_nth));
7016
7017 #ifdef KMP_ADJUST_BLOCKTIME
7018 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */
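/* (If the process is oversubscribed, i.e. there are more OpenMP threads than
   available procs, spin-waiting is counterproductive, so blocktime is forced
   to zero below unless the user set the blocktime explicitly.) */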
7019 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
7020 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
7021 if (__kmp_nth > __kmp_avail_proc) {
7022 __kmp_zero_bt = TRUE;
7023 }
7024 }
7025 #endif /* KMP_ADJUST_BLOCKTIME */
7026
7027 /* we have finished middle initialization */
7028 TCW_SYNC_4(__kmp_init_middle, TRUE);
7029
7030 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
7031 }
7032
7033 void __kmp_middle_initialize(void) {
7034 if (__kmp_init_middle) {
7035 return;
7036 }
7037 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7038 if (__kmp_init_middle) {
7039 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7040 return;
7041 }
7042 __kmp_do_middle_initialize();
7043 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7044 }
7045
7046 void __kmp_parallel_initialize(void) {
7047 int gtid = __kmp_entry_gtid(); // this might be a new root
7048
7049 /* synchronize parallel initialization (for sibling) */
7050 if (TCR_4(__kmp_init_parallel))
7051 return;
7052 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
7053 if (TCR_4(__kmp_init_parallel)) {
7054 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7055 return;
7056 }
7057
7058 /* TODO reinitialization after we have already shut down */
7059 if (TCR_4(__kmp_global.g.g_done)) {
7060 KA_TRACE(
7061 10,
7062 ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
7063 __kmp_infinite_loop();
7064 }
7065
7066 /* jc: The lock __kmp_initz_lock is already held, so calling
7067 __kmp_serial_initialize would cause a deadlock. So we call
7068 __kmp_do_serial_initialize directly. */
7069 if (!__kmp_init_middle) {
7070 __kmp_do_middle_initialize();
7071 }
7072 __kmp_resume_if_hard_paused();
7073
7074 /* begin initialization */
7075 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
7076 KMP_ASSERT(KMP_UBER_GTID(gtid));
7077
7078 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
7079 // Save the FP control regs.
7080 // Worker threads will set theirs to these values at thread startup.
7081 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
7082 __kmp_store_mxcsr(&__kmp_init_mxcsr);
7083 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
7084 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
7085
7086 #if KMP_OS_UNIX
7087 #if KMP_HANDLE_SIGNALS
7088 /* must be after __kmp_serial_initialize */
7089 __kmp_install_signals(TRUE);
7090 #endif
7091 #endif
7092
7093 __kmp_suspend_initialize();
7094
7095 #if defined(USE_LOAD_BALANCE)
7096 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7097 __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
7098 }
7099 #else
7100 if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
7101 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7102 }
7103 #endif
7104
7105 if (__kmp_version) {
7106 __kmp_print_version_2();
7107 }
7108
7109 /* we have finished parallel initialization */
7110 TCW_SYNC_4(__kmp_init_parallel, TRUE);
7111
7112 KMP_MB();
7113 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
7114
7115 __kmp_release_bootstrap_lock(&__kmp_initz_lock);
7116 }
7117
7118 /* ------------------------------------------------------------------------ */
7119
7120 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7121 kmp_team_t *team) {
7122 kmp_disp_t *dispatch;
7123
7124 KMP_MB();
7125
7126 /* none of the threads have encountered any constructs, yet. */
7127 this_thr->th.th_local.this_construct = 0;
7128 #if KMP_CACHE_MANAGE
7129 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
7130 #endif /* KMP_CACHE_MANAGE */
7131 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
7132 KMP_DEBUG_ASSERT(dispatch);
7133 KMP_DEBUG_ASSERT(team->t.t_dispatch);
7134 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
7135 // this_thr->th.th_info.ds.ds_tid ] );
7136
7137 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
7138 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
7139 if (__kmp_env_consistency_check)
7140 __kmp_push_parallel(gtid, team->t.t_ident);
7141
7142 KMP_MB(); /* Flush all pending memory write invalidates. */
7143 }
7144
7145 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
7146 kmp_team_t *team) {
7147 if (__kmp_env_consistency_check)
7148 __kmp_pop_parallel(gtid, team->t.t_ident);
7149
7150 __kmp_finish_implicit_task(this_thr);
7151 }
7152
7153 int __kmp_invoke_task_func(int gtid) {
7154 int rc;
7155 int tid = __kmp_tid_from_gtid(gtid);
7156 kmp_info_t *this_thr = __kmp_threads[gtid];
7157 kmp_team_t *team = this_thr->th.th_team;
7158
7159 __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
7160 #if USE_ITT_BUILD
7161 if (__itt_stack_caller_create_ptr) {
7162 __kmp_itt_stack_callee_enter(
7163 (__itt_caller)
7164 team->t.t_stack_id); // inform ittnotify about entering user's code
7165 }
7166 #endif /* USE_ITT_BUILD */
7167 #if INCLUDE_SSC_MARKS
7168 SSC_MARK_INVOKING();
7169 #endif
7170
7171 #if OMPT_SUPPORT
7172 void *dummy;
7173 void **exit_frame_p;
7174 ompt_data_t *my_task_data;
7175 ompt_data_t *my_parallel_data;
7176 int ompt_team_size;
7177
7178 if (ompt_enabled.enabled) {
7179 exit_frame_p = &(
7180 team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
7181 } else {
7182 exit_frame_p = &dummy;
7183 }
7184
7185 my_task_data =
7186 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
7187 my_parallel_data = &(team->t.ompt_team_info.parallel_data);
7188 if (ompt_enabled.ompt_callback_implicit_task) {
7189 ompt_team_size = team->t.t_nproc;
7190 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7191 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
7192 __kmp_tid_from_gtid(gtid), ompt_task_implicit);
7193 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
7194 }
7195 #endif
7196
7197 #if KMP_STATS_ENABLED
7198 stats_state_e previous_state = KMP_GET_THREAD_STATE();
7199 if (previous_state == stats_state_e::TEAMS_REGION) {
7200 KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
7201 } else {
7202 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
7203 }
7204 KMP_SET_THREAD_STATE(IMPLICIT_TASK);
7205 #endif
7206
7207 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
7208 tid, (int)team->t.t_argc, (void **)team->t.t_argv
7209 #if OMPT_SUPPORT
7210 ,
7211 exit_frame_p
7212 #endif
7213 );
7214 #if OMPT_SUPPORT
7215 *exit_frame_p = NULL;
7216 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
7217 #endif
7218
7219 #if KMP_STATS_ENABLED
7220 if (previous_state == stats_state_e::TEAMS_REGION) {
7221 KMP_SET_THREAD_STATE(previous_state);
7222 }
7223 KMP_POP_PARTITIONED_TIMER();
7224 #endif
7225
7226 #if USE_ITT_BUILD
7227 if (__itt_stack_caller_create_ptr) {
7228 __kmp_itt_stack_callee_leave(
7229 (__itt_caller)
7230 team->t.t_stack_id); // inform ittnotify about leaving user's code
7231 }
7232 #endif /* USE_ITT_BUILD */
7233 __kmp_run_after_invoked_task(gtid, tid, this_thr, team);
7234
7235 return rc;
7236 }
7237
7238 void __kmp_teams_master(int gtid) {
7239   // This routine is called by all master threads in a teams construct
7240 kmp_info_t *thr = __kmp_threads[gtid];
7241 kmp_team_t *team = thr->th.th_team;
7242 ident_t *loc = team->t.t_ident;
7243 thr->th.th_set_nproc = thr->th.th_teams_size.nth;
7244 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
7245 KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
7246 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
7247 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));
7248
7249 // This thread is a new CG root. Set up the proper variables.
7250 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
7251 tmp->cg_root = thr; // Make thr the CG root
7252 // Init to thread limit that was stored when league masters were forked
7253 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
7254 tmp->cg_nthreads = 1; // Init counter to one active thread, this one
7255 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
7256 " cg_nthreads to 1\n",
7257 thr, tmp));
7258 tmp->up = thr->th.th_cg_roots;
7259 thr->th.th_cg_roots = tmp;
7260
7261 // Launch the league of teams now, but do not let workers execute
7262 // (they hang on fork barrier until next parallel)
7263 #if INCLUDE_SSC_MARKS
7264 SSC_MARK_FORKING();
7265 #endif
7266 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
7267 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
7268 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
7269 #if INCLUDE_SSC_MARKS
7270 SSC_MARK_JOINING();
7271 #endif
7272 // If the team size was reduced from the limit, set it to the new size
7273 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
7274 thr->th.th_teams_size.nth = thr->th.th_team_nproc;
7275 // AC: last parameter "1" eliminates join barrier which won't work because
7276 // worker threads are in a fork barrier waiting for more parallel regions
7277 __kmp_join_call(loc, gtid
7278 #if OMPT_SUPPORT
7279 ,
7280 fork_context_intel
7281 #endif
7282 ,
7283 1);
7284 }
7285
7286 int __kmp_invoke_teams_master(int gtid) {
7287 kmp_info_t *this_thr = __kmp_threads[gtid];
7288 kmp_team_t *team = this_thr->th.th_team;
7289 #if KMP_DEBUG
7290 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
7291 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
7292 (void *)__kmp_teams_master);
7293 #endif
7294 __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
7295 #if OMPT_SUPPORT
7296 int tid = __kmp_tid_from_gtid(gtid);
7297 ompt_data_t *task_data =
7298 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data;
7299 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data;
7300 if (ompt_enabled.ompt_callback_implicit_task) {
7301 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7302 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid,
7303 ompt_task_initial);
7304 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid;
7305 }
7306 #endif
7307 __kmp_teams_master(gtid);
7308 #if OMPT_SUPPORT
7309 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league;
7310 #endif
7311 __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
7312 return 1;
7313 }
7314
7315 /* This sets the requested number of threads for the next parallel region
7316    encountered by this team. Since this should be enclosed in the forkjoin
7317    critical section, it should avoid race conditions with asymmetrical nested
7318    parallelism. */
7319
7320 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
7321 kmp_info_t *thr = __kmp_threads[gtid];
7322
7323 if (num_threads > 0)
7324 thr->th.th_set_nproc = num_threads;
7325 }
7326
7327 /* this sets the requested number of teams for the teams region and/or
7328 the number of threads for the next parallel region encountered */
7329 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
7330 int num_threads) {
7331 kmp_info_t *thr = __kmp_threads[gtid];
7332 KMP_DEBUG_ASSERT(num_teams >= 0);
7333 KMP_DEBUG_ASSERT(num_threads >= 0);
7334
7335 if (num_teams == 0)
7336 num_teams = 1; // default number of teams is 1.
7337   if (num_teams > __kmp_teams_max_nth) { // too many teams requested?
7338 if (!__kmp_reserve_warn) {
7339 __kmp_reserve_warn = 1;
7340 __kmp_msg(kmp_ms_warning,
7341 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7342 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7343 }
7344 num_teams = __kmp_teams_max_nth;
7345 }
7346 // Set number of teams (number of threads in the outer "parallel" of the
7347 // teams)
7348 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7349
7350 // Remember the number of threads for inner parallel regions
7351 if (!TCR_4(__kmp_init_middle))
7352 __kmp_middle_initialize(); // get internal globals calculated
7353 KMP_DEBUG_ASSERT(__kmp_avail_proc);
7354 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth);
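// Worked example (hypothetical numbers): with 16 available procs,
// num_teams = 4 and no thread_limit clause, the default below starts at
// 16 / 4 = 4 threads per team and is then clipped to nthreads-var,
// thread-limit-var and __kmp_teams_max_nth / num_teams.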
7355 if (num_threads == 0) {
7356 num_threads = __kmp_avail_proc / num_teams;
7357     // adjust num_threads without a warning, as it is not a user setting
7358 // num_threads = min(num_threads, nthreads-var, thread-limit-var)
7359 // no thread_limit clause specified - do not change thread-limit-var ICV
7360 if (num_threads > __kmp_dflt_team_nth) {
7361 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7362 }
7363 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) {
7364 num_threads = thr->th.th_current_task->td_icvs.thread_limit;
7365     } // prevent team size from exceeding thread-limit-var
7366 if (num_teams * num_threads > __kmp_teams_max_nth) {
7367 num_threads = __kmp_teams_max_nth / num_teams;
7368 }
7369 } else {
7370 // This thread will be the master of the league masters
7371 // Store new thread limit; old limit is saved in th_cg_roots list
7372 thr->th.th_current_task->td_icvs.thread_limit = num_threads;
7373 // num_threads = min(num_threads, nthreads-var)
7374 if (num_threads > __kmp_dflt_team_nth) {
7375 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV
7376 }
7377 if (num_teams * num_threads > __kmp_teams_max_nth) {
7378 int new_threads = __kmp_teams_max_nth / num_teams;
7379 if (!__kmp_reserve_warn) { // user asked for too many threads
7380 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT
7381 __kmp_msg(kmp_ms_warning,
7382 KMP_MSG(CantFormThrTeam, num_threads, new_threads),
7383 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7384 }
7385 num_threads = new_threads;
7386 }
7387 }
7388 thr->th.th_teams_size.nth = num_threads;
7389 }
7390
7391 // Set the proc_bind var to use in the following parallel region.
7392 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7393 kmp_info_t *thr = __kmp_threads[gtid];
7394 thr->th.th_set_proc_bind = proc_bind;
7395 }
7396
7397 /* Launch the worker threads into the microtask. */
7398
7399 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7400 kmp_info_t *this_thr = __kmp_threads[gtid];
7401
7402 #ifdef KMP_DEBUG
7403 int f;
7404 #endif /* KMP_DEBUG */
7405
7406 KMP_DEBUG_ASSERT(team);
7407 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7408 KMP_ASSERT(KMP_MASTER_GTID(gtid));
7409 KMP_MB(); /* Flush all pending memory write invalidates. */
7410
7411 team->t.t_construct = 0; /* no single directives seen yet */
7412 team->t.t_ordered.dt.t_value =
7413 0; /* thread 0 enters the ordered section first */
7414
7415 /* Reset the identifiers on the dispatch buffer */
7416 KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
7417 if (team->t.t_max_nproc > 1) {
7418 int i;
7419 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) {
7420 team->t.t_disp_buffer[i].buffer_index = i;
7421 team->t.t_disp_buffer[i].doacross_buf_idx = i;
7422 }
7423 } else {
7424 team->t.t_disp_buffer[0].buffer_index = 0;
7425 team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7426 }
7427
7428 KMP_MB(); /* Flush all pending memory write invalidates. */
7429 KMP_ASSERT(this_thr->th.th_team == team);
7430
7431 #ifdef KMP_DEBUG
7432 for (f = 0; f < team->t.t_nproc; f++) {
7433 KMP_DEBUG_ASSERT(team->t.t_threads[f] &&
7434 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc);
7435 }
7436 #endif /* KMP_DEBUG */
7437
7438 /* release the worker threads so they may begin working */
7439 __kmp_fork_barrier(gtid, 0);
7440 }
7441
7442 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) {
7443 kmp_info_t *this_thr = __kmp_threads[gtid];
7444
7445 KMP_DEBUG_ASSERT(team);
7446 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7447 KMP_ASSERT(KMP_MASTER_GTID(gtid));
7448 KMP_MB(); /* Flush all pending memory write invalidates. */
7449
7450 /* Join barrier after fork */
7451
7452 #ifdef KMP_DEBUG
7453 if (__kmp_threads[gtid] &&
7454 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
7455 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
7456 __kmp_threads[gtid]);
7457 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
7458 "team->t.t_nproc=%d\n",
7459 gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
7460 team->t.t_nproc);
7461 __kmp_print_structure();
7462 }
7463 KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
7464 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
7465 #endif /* KMP_DEBUG */
7466
7467 __kmp_join_barrier(gtid); /* wait for everyone */
7468 #if OMPT_SUPPORT
7469 if (ompt_enabled.enabled &&
7470 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
7471 int ds_tid = this_thr->th.th_info.ds.ds_tid;
7472 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
7473 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
7474 #if OMPT_OPTIONAL
7475 void *codeptr = NULL;
7476 if (KMP_MASTER_TID(ds_tid) &&
7477 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
7478 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
7479 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;
7480
7481 if (ompt_enabled.ompt_callback_sync_region_wait) {
7482 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
7483 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7484 codeptr);
7485 }
7486 if (ompt_enabled.ompt_callback_sync_region) {
7487 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
7488 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
7489 codeptr);
7490 }
7491 #endif
7492 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
7493 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
7494 ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
7495 }
7496 }
7497 #endif
7498
7499 KMP_MB(); /* Flush all pending memory write invalidates. */
7500 KMP_ASSERT(this_thr->th.th_team == team);
7501 }
7502
7503 /* ------------------------------------------------------------------------ */
7504
7505 #ifdef USE_LOAD_BALANCE
7506
7507 // Return the number of worker threads actively spinning in the hot team, if
7508 // we are at the outermost level of parallelism. Otherwise, return 0.
7509 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7510 int i;
7511 int retval;
7512 kmp_team_t *hot_team;
7513
7514 if (root->r.r_active) {
7515 return 0;
7516 }
7517 hot_team = root->r.r_hot_team;
7518 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7519 return hot_team->t.t_nproc - 1; // Don't count master thread
7520 }
7521
7522 // Skip the master thread - it is accounted for elsewhere.
7523 retval = 0;
7524 for (i = 1; i < hot_team->t.t_nproc; i++) {
7525 if (hot_team->t.t_threads[i]->th.th_active) {
7526 retval++;
7527 }
7528 }
7529 return retval;
7530 }
7531
7532 // Perform an automatic adjustment to the number of
7533 // threads used by the next parallel region.
7534 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7535 int retval;
7536 int pool_active;
7537 int hot_team_active;
7538 int team_curr_active;
7539 int system_active;
7540
7541 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7542 set_nproc));
7543 KMP_DEBUG_ASSERT(root);
7544 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7545 ->th.th_current_task->td_icvs.dynamic == TRUE);
7546 KMP_DEBUG_ASSERT(set_nproc > 1);
7547
7548 if (set_nproc == 1) {
7549 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7550 return 1;
7551 }
7552
7553 // Threads that are active in the thread pool, active in the hot team for this
7554 // particular root (if we are at the outer par level), and the currently
7555 // executing thread (to become the master) are available to add to the new
7556 // team, but are currently contributing to the system load, and must be
7557 // accounted for.
7558 pool_active = __kmp_thread_pool_active_nth;
7559 hot_team_active = __kmp_active_hot_team_nproc(root);
7560 team_curr_active = pool_active + hot_team_active + 1;
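// For illustration (hypothetical numbers): with 2 threads still active in the
// thread pool and 3 active workers in the hot team, team_curr_active is
// 2 + 3 + 1 = 6, the +1 being the currently executing (future master) thread.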
7561
7562 // Check the system load.
7563 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7564 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7565 "hot team active = %d\n",
7566 system_active, pool_active, hot_team_active));
7567
7568 if (system_active < 0) {
7569 // There was an error reading the necessary info from /proc, so use the
7570 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7571 // = dynamic_thread_limit, we shouldn't wind up getting back here.
7572 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7573 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7574
7575 // Make this call behave like the thread limit algorithm.
7576 retval = __kmp_avail_proc - __kmp_nth +
7577 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7578 if (retval > set_nproc) {
7579 retval = set_nproc;
7580 }
7581 if (retval < KMP_MIN_NTH) {
7582 retval = KMP_MIN_NTH;
7583 }
7584
7585 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7586 retval));
7587 return retval;
7588 }
7589
7590 // There is a slight delay in the load balance algorithm in detecting new
7591 // running procs. The real system load at this instant should be at least as
7592   // large as the number of active OMP threads available to add to the team.
7593 if (system_active < team_curr_active) {
7594 system_active = team_curr_active;
7595 }
7596 retval = __kmp_avail_proc - system_active + team_curr_active;
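// For illustration (hypothetical numbers): with __kmp_avail_proc = 8,
// system_active = 10 and team_curr_active = 6, retval = 8 - 10 + 6 = 4,
// i.e. the new team gets the procs not consumed by unrelated load.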
7597 if (retval > set_nproc) {
7598 retval = set_nproc;
7599 }
7600 if (retval < KMP_MIN_NTH) {
7601 retval = KMP_MIN_NTH;
7602 }
7603
7604 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval));
7605 return retval;
7606 } // __kmp_load_balance_nproc()
7607
7608 #endif /* USE_LOAD_BALANCE */
7609
7610 /* ------------------------------------------------------------------------ */
7611
7612 /* NOTE: this is called with the __kmp_init_lock held */
7613 void __kmp_cleanup(void) {
7614 int f;
7615
7616 KA_TRACE(10, ("__kmp_cleanup: enter\n"));
7617
7618 if (TCR_4(__kmp_init_parallel)) {
7619 #if KMP_HANDLE_SIGNALS
7620 __kmp_remove_signals();
7621 #endif
7622 TCW_4(__kmp_init_parallel, FALSE);
7623 }
7624
7625 if (TCR_4(__kmp_init_middle)) {
7626 #if KMP_AFFINITY_SUPPORTED
7627 __kmp_affinity_uninitialize();
7628 #endif /* KMP_AFFINITY_SUPPORTED */
7629 __kmp_cleanup_hierarchy();
7630 TCW_4(__kmp_init_middle, FALSE);
7631 }
7632
7633 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n"));
7634
7635 if (__kmp_init_serial) {
7636 __kmp_runtime_destroy();
7637 __kmp_init_serial = FALSE;
7638 }
7639
7640 __kmp_cleanup_threadprivate_caches();
7641
7642 for (f = 0; f < __kmp_threads_capacity; f++) {
7643 if (__kmp_root[f] != NULL) {
7644 __kmp_free(__kmp_root[f]);
7645 __kmp_root[f] = NULL;
7646 }
7647 }
7648 __kmp_free(__kmp_threads);
7649   // __kmp_threads and __kmp_root were allocated at once, as a single block, so
7650   // there is no need to free __kmp_root.
7651 __kmp_threads = NULL;
7652 __kmp_root = NULL;
7653 __kmp_threads_capacity = 0;
7654
7655 #if KMP_USE_DYNAMIC_LOCK
7656 __kmp_cleanup_indirect_user_locks();
7657 #else
7658 __kmp_cleanup_user_locks();
7659 #endif
7660
7661 #if KMP_AFFINITY_SUPPORTED
7662 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file));
7663 __kmp_cpuinfo_file = NULL;
7664 #endif /* KMP_AFFINITY_SUPPORTED */
7665
7666 #if KMP_USE_ADAPTIVE_LOCKS
7667 #if KMP_DEBUG_ADAPTIVE_LOCKS
7668 __kmp_print_speculative_stats();
7669 #endif
7670 #endif
7671 KMP_INTERNAL_FREE(__kmp_nested_nth.nth);
7672 __kmp_nested_nth.nth = NULL;
7673 __kmp_nested_nth.size = 0;
7674 __kmp_nested_nth.used = 0;
7675 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types);
7676 __kmp_nested_proc_bind.bind_types = NULL;
7677 __kmp_nested_proc_bind.size = 0;
7678 __kmp_nested_proc_bind.used = 0;
7679 if (__kmp_affinity_format) {
7680 KMP_INTERNAL_FREE(__kmp_affinity_format);
7681 __kmp_affinity_format = NULL;
7682 }
7683
7684 __kmp_i18n_catclose();
7685
7686 #if KMP_USE_HIER_SCHED
7687 __kmp_hier_scheds.deallocate();
7688 #endif
7689
7690 #if KMP_STATS_ENABLED
7691 __kmp_stats_fini();
7692 #endif
7693
7694 KA_TRACE(10, ("__kmp_cleanup: exit\n"));
7695 }

/* ------------------------------------------------------------------------ */

int __kmp_ignore_mppbeg(void) {
  char *env;

  if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) {
    if (__kmp_str_match_false(env))
      return FALSE;
  }
  // By default __kmpc_begin() is no-op.
  return TRUE;
}

int __kmp_ignore_mppend(void) {
  char *env;

  if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) {
    if (__kmp_str_match_false(env))
      return FALSE;
  }
  // By default __kmpc_end() is no-op.
  return TRUE;
}

void __kmp_internal_begin(void) {
  int gtid;
  kmp_root_t *root;

  /* this is a very important step as it will register new sibling threads
     and assign these new uber threads a new gtid */
  gtid = __kmp_entry_gtid();
  root = __kmp_threads[gtid]->th.th_root;
  KMP_ASSERT(KMP_UBER_GTID(gtid));

  if (root->r.r_begin)
    return;
  __kmp_acquire_lock(&root->r.r_begin_lock, gtid);
  if (root->r.r_begin) {
    __kmp_release_lock(&root->r.r_begin_lock, gtid);
    return;
  }

  root->r.r_begin = TRUE;

  __kmp_release_lock(&root->r.r_begin_lock, gtid);
}

/* ------------------------------------------------------------------------ */

void __kmp_user_set_library(enum library_type arg) {
  int gtid;
  kmp_root_t *root;
  kmp_info_t *thread;

  /* first, make sure we are initialized so we can get our gtid */

  gtid = __kmp_entry_gtid();
  thread = __kmp_threads[gtid];

  root = thread->th.th_root;

  KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg,
                library_serial));
  if (root->r.r_in_parallel) { /* Must be called in serial section of top-level
                                  thread */
    KMP_WARNING(SetLibraryIncorrectCall);
    return;
  }

  switch (arg) {
  case library_serial:
    thread->th.th_set_nproc = 0;
    set__nproc(thread, 1);
    break;
  case library_turnaround:
    thread->th.th_set_nproc = 0;
    set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
                                           : __kmp_dflt_team_nth_ub);
    break;
  case library_throughput:
    thread->th.th_set_nproc = 0;
    set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth
                                           : __kmp_dflt_team_nth_ub);
    break;
  default:
    KMP_FATAL(UnknownLibraryType, arg);
  }

  __kmp_aux_set_library(arg);
}
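
// The execution mode handled above is normally selected from user code
// through the runtime's extension entry points or the KMP_LIBRARY
// environment variable. Illustrative sketch only; the extension prototype
// below is an assumption about how it is exposed to user code:
//
//   // hypothetical user translation unit, linked against this runtime
//   #include <omp.h>
//   extern "C" void kmp_set_library_throughput(void); // extension, assumed
//   int main() {
//     kmp_set_library_throughput(); // maps to library_throughput above
//   #pragma omp parallel
//     {
//       // ... parallel work ...
//     }
//     return 0;
//   }
//
// or, equivalently, from the environment:  KMP_LIBRARY=throughput ./a.out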

void __kmp_aux_set_stacksize(size_t arg) {
  if (!__kmp_init_serial)
    __kmp_serial_initialize();

#if KMP_OS_DARWIN
  if (arg & (0x1000 - 1)) {
    arg &= ~(0x1000 - 1);
    if (arg + 0x1000) /* check for overflow if we round up */
      arg += 0x1000;
  }
#endif
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);

  /* only change the default stacksize before the first parallel region */
  if (!TCR_4(__kmp_init_parallel)) {
    size_t value = arg; /* argument is in bytes */

    if (value < __kmp_sys_min_stksize)
      value = __kmp_sys_min_stksize;
    else if (value > KMP_MAX_STKSIZE)
      value = KMP_MAX_STKSIZE;

    __kmp_stksize = value;

    __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */
  }

  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}
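
// Illustrative arithmetic for the Darwin rounding above (hypothetical
// request): asking for 0x12345 bytes is first rounded up to a 4 KiB page
// boundary,
//   0x12345 & ~0xFFF = 0x12000,  then + 0x1000  ->  0x13000 bytes,
// and the result is afterwards clamped into
// [__kmp_sys_min_stksize, KMP_MAX_STKSIZE] before being stored in
// __kmp_stksize.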

/* set the behaviour of the runtime library */
/* TODO this can cause some odd behaviour with sibling parallelism... */
void __kmp_aux_set_library(enum library_type arg) {
  __kmp_library = arg;

  switch (__kmp_library) {
  case library_serial: {
    KMP_INFORM(LibraryIsSerial);
  } break;
  case library_turnaround:
    if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
      __kmp_use_yield = 2; // only yield when oversubscribed
    break;
  case library_throughput:
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
      __kmp_dflt_blocktime = 200;
    break;
  default:
    KMP_FATAL(UnknownLibraryType, arg);
  }
}

/* Get team information common to all teams API routines */
// Returns NULL if not in a teams construct
static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
  kmp_info_t *thr = __kmp_entry_thread();
  teams_serialized = 0;
  if (thr->th.th_teams_microtask) {
    kmp_team_t *team = thr->th.th_team;
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    int ii = team->t.t_level;
    teams_serialized = team->t.t_serialized;
    int level = tlevel + 1;
    KMP_DEBUG_ASSERT(ii >= tlevel);
    while (ii > level) {
      for (teams_serialized = team->t.t_serialized;
           (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
      }
      if (team->t.t_serialized && (!teams_serialized)) {
        team = team->t.t_parent;
        continue;
      }
      if (ii > level) {
        team = team->t.t_parent;
        ii--;
      }
    }
    return team;
  }
  return NULL;
}

int __kmp_aux_get_team_num() {
  int serialized;
  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
  if (team) {
    if (serialized > 1) {
      return 0; // teams region is serialized ( 1 team of 1 thread ).
    } else {
      return team->t.t_master_tid;
    }
  }
  return 0;
}

int __kmp_aux_get_num_teams() {
  int serialized;
  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
  if (team) {
    if (serialized > 1) {
      return 1;
    } else {
      return team->t.t_parent->t.t_nproc;
    }
  }
  return 1;
}
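
// The two helpers above back the standard omp_get_team_num() and
// omp_get_num_teams() queries. A minimal user-side sketch of what they report
// inside a teams construct (user code, not part of this file; num_teams is
// only an upper bound the implementation may reduce):
//
//   #include <omp.h>
//   #include <stdio.h>
//   int main() {
//   #pragma omp teams num_teams(4)
//     printf("team %d of %d\n", omp_get_team_num(), omp_get_num_teams());
//     return 0;
//   }
//
// In a serialized teams region the helpers report team 0 of a single team.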

/* ------------------------------------------------------------------------ */

/*
 * Affinity Format Parser
 *
 * Field is in form of: %[[[0].]size]type
 * % and type are required (%% means print a literal '%')
 * type is either single char or long name surrounded by {},
 * e.g., N or {num_threads}
 * 0 => leading zeros
 * . => right justified when size is specified
 * by default output is left justified
 * size is the *minimum* field length
 * All other characters are printed as is
 *
 * Available field types:
 * L {thread_level}      - omp_get_level()
 * n {thread_num}        - omp_get_thread_num()
 * h {host}              - name of host machine
 * P {process_id}        - process id (integer)
 * T {thread_identifier} - native thread identifier (integer)
 * N {num_threads}       - omp_get_num_threads()
 * A {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
 * a {thread_affinity}   - comma separated list of integers or integer ranges
 *                         (values of affinity mask)
 *
 * Implementation-specific field types can be added
 * If a type is unknown, print "undefined"
 */
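
// Illustrative example of the grammar above (hypothetical format string and
// output; actual values depend on the machine and the OpenMP thread). Such a
// format is normally supplied through OMP_AFFINITY_FORMAT or
// omp_set_affinity_format() and printed by omp_display_affinity():
//
//   format: "pid %{process_id} tid %{thread_num} bound to %{thread_affinity}"
//   output: "pid 12345 tid 3 bound to 0-3"
//
// A field like "%0.8{nesting_level}" prints the level right justified, zero
// padded to a minimum width of 8 (e.g. "00000001"); omitting '.' leaves the
// field left justified, which is the default.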

// Structure holding the short name, long name, and corresponding data type
// for snprintf. A table of these will represent the entire valid keyword
// field types.
typedef struct kmp_affinity_format_field_t {
  char short_name; // from spec e.g., L -> thread level
  const char *long_name; // from spec thread_level -> thread level
  char field_format; // data type for snprintf (typically 'd' or 's'
  // for integer or string)
} kmp_affinity_format_field_t;

static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
#if KMP_AFFINITY_SUPPORTED
    {'A', "thread_affinity", 's'},
#endif
    {'t', "team_num", 'd'},
    {'T', "num_teams", 'd'},
    {'L', "nesting_level", 'd'},
    {'n', "thread_num", 'd'},
    {'N', "num_threads", 'd'},
    {'a', "ancestor_tnum", 'd'},
    {'H', "host", 's'},
    {'P', "process_id", 'd'},
    {'i', "native_thread_id", 'd'}};

// Return the number of characters it takes to hold field
static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
                                            const char **ptr,
                                            kmp_str_buf_t *field_buffer) {
  int rc, format_index, field_value;
  const char *width_left, *width_right;
  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
  static const int FORMAT_SIZE = 20;
  char format[FORMAT_SIZE] = {0};
  char absolute_short_name = 0;

  KMP_DEBUG_ASSERT(gtid >= 0);
  KMP_DEBUG_ASSERT(th);
  KMP_DEBUG_ASSERT(**ptr == '%');
  KMP_DEBUG_ASSERT(field_buffer);

  __kmp_str_buf_clear(field_buffer);

  // Skip the initial %
  (*ptr)++;

  // Check for %% first
  if (**ptr == '%') {
    __kmp_str_buf_cat(field_buffer, "%", 1);
    (*ptr)++; // skip over the second %
    return 1;
  }

  // Parse field modifiers if they are present
  pad_zeros = false;
  if (**ptr == '0') {
    pad_zeros = true;
    (*ptr)++; // skip over 0
  }
  right_justify = false;
  if (**ptr == '.') {
    right_justify = true;
    (*ptr)++; // skip over .
  }
  // Parse width of field: [width_left, width_right)
  width_left = width_right = NULL;
  if (**ptr >= '0' && **ptr <= '9') {
    width_left = *ptr;
    SKIP_DIGITS(*ptr);
    width_right = *ptr;
  }

  // Create the format for KMP_SNPRINTF based on flags parsed above
  format_index = 0;
  format[format_index++] = '%';
  if (!right_justify)
    format[format_index++] = '-';
  if (pad_zeros)
    format[format_index++] = '0';
  if (width_left && width_right) {
    int i = 0;
    // Only allow 8 digit number widths.
    // This also prevents overflowing format variable
    while (i < 8 && width_left < width_right) {
      format[format_index++] = *width_left;
      width_left++;
      i++;
    }
  }
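
  // Worked example of the construction above (hypothetical field): parsing
  // "%0.4{thread_num}" sets pad_zeros and right_justify with width "4", so
  // the snprintf format assembled here becomes "%04" plus the 'd' appended
  // when the field name is matched below, i.e. "%04d". A plain "%5n"
  // (no '0', no '.') instead yields the left-justified "%-5d".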

  // Parse a name (long or short)
  // Canonicalize the name into absolute_short_name
  found_valid_name = false;
  parse_long_name = (**ptr == '{');
  if (parse_long_name)
    (*ptr)++; // skip initial left brace
  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
                             sizeof(__kmp_affinity_format_table[0]);
       ++i) {
    char short_name = __kmp_affinity_format_table[i].short_name;
    const char *long_name = __kmp_affinity_format_table[i].long_name;
    char field_format = __kmp_affinity_format_table[i].field_format;
    if (parse_long_name) {
      int length = KMP_STRLEN(long_name);
      if (strncmp(*ptr, long_name, length) == 0) {
        found_valid_name = true;
        (*ptr) += length; // skip the long name
      }
    } else if (**ptr == short_name) {
      found_valid_name = true;
      (*ptr)++; // skip the short name
    }
    if (found_valid_name) {
      format[format_index++] = field_format;
      format[format_index++] = '\0';
      absolute_short_name = short_name;
      break;
    }
  }
  if (parse_long_name) {
    if (**ptr != '}') {
      absolute_short_name = 0;
    } else {
      (*ptr)++; // skip over the right brace
    }
  }

  // Attempt to fill the buffer with the requested
  // value using snprintf within __kmp_str_buf_print()
  switch (absolute_short_name) {
  case 't':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
    break;
  case 'T':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
    break;
  case 'L':
    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
    break;
  case 'n':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
    break;
  case 'H': {
    static const int BUFFER_SIZE = 256;
    char buf[BUFFER_SIZE];
    __kmp_expand_host_name(buf, BUFFER_SIZE);
    rc = __kmp_str_buf_print(field_buffer, format, buf);
  } break;
  case 'P':
    rc = __kmp_str_buf_print(field_buffer, format, getpid());
    break;
  case 'i':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
    break;
  case 'N':
    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
    break;
  case 'a':
    field_value =
        __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
    rc = __kmp_str_buf_print(field_buffer, format, field_value);
    break;
#if KMP_AFFINITY_SUPPORTED
  case 'A': {
    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);
    __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
    rc = __kmp_str_buf_print(field_buffer, format, buf.str);
    __kmp_str_buf_free(&buf);
  } break;
#endif
  default:
    // According to the spec, if an implementation does not have info for a
    // field type, then "undefined" is printed
    rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
    // Skip the field
    if (parse_long_name) {
      SKIP_TOKEN(*ptr);
      if (**ptr == '}')
        (*ptr)++;
    } else {
      (*ptr)++;
    }
  }

  KMP_ASSERT(format_index <= FORMAT_SIZE);
  return rc;
}

/*
 * Return number of characters needed to hold the affinity string
 * (not including null byte character)
 * The resultant string is printed to buffer, which the caller can then
 * handle afterwards
 */
size_t __kmp_aux_capture_affinity(int gtid, const char *format,
                                  kmp_str_buf_t *buffer) {
  const char *parse_ptr;
  size_t retval;
  const kmp_info_t *th;
  kmp_str_buf_t field;

  KMP_DEBUG_ASSERT(buffer);
  KMP_DEBUG_ASSERT(gtid >= 0);

  __kmp_str_buf_init(&field);
  __kmp_str_buf_clear(buffer);

  th = __kmp_threads[gtid];
  retval = 0;

  // If format is NULL or zero-length string, then we use
  // affinity-format-var ICV
  parse_ptr = format;
  if (parse_ptr == NULL || *parse_ptr == '\0') {
    parse_ptr = __kmp_affinity_format;
  }
  KMP_DEBUG_ASSERT(parse_ptr);

  while (*parse_ptr != '\0') {
    // Parse a field
    if (*parse_ptr == '%') {
      // Put field in the buffer
      int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
      __kmp_str_buf_catbuf(buffer, &field);
      retval += rc;
    } else {
      // Put literal character in buffer
      __kmp_str_buf_cat(buffer, parse_ptr, 1);
      retval++;
      parse_ptr++;
    }
  }
  __kmp_str_buf_free(&field);
  return retval;
}

// Displays the affinity string to stdout
void __kmp_aux_display_affinity(int gtid, const char *format) {
  kmp_str_buf_t buf;
  __kmp_str_buf_init(&buf);
  __kmp_aux_capture_affinity(gtid, format, &buf);
  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
  __kmp_str_buf_free(&buf);
}
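
// These two routines sit behind the OpenMP 5.0 affinity-display API. A small
// user-side sketch (user code, not part of this runtime; the exact output
// shape is implementation defined):
//
//   #include <omp.h>
//   int main() {
//     omp_set_affinity_format("host %{host} thread %0.3{thread_num}");
//   #pragma omp parallel
//     omp_display_affinity(NULL); // NULL => use the affinity-format-var ICV,
//                                 // mirroring the NULL/empty check above
//     return 0;
//   }
//
// Setting OMP_DISPLAY_AFFINITY=TRUE causes the same string to be printed
// automatically when threads are first assigned to a parallel region.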

/* ------------------------------------------------------------------------ */

void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
  int blocktime = arg; /* argument is in milliseconds */
#if KMP_USE_MONITOR
  int bt_intervals;
#endif
  int bt_set;

  __kmp_save_internal_controls(thread);

  /* Normalize and set blocktime for the teams */
  if (blocktime < KMP_MIN_BLOCKTIME)
    blocktime = KMP_MIN_BLOCKTIME;
  else if (blocktime > KMP_MAX_BLOCKTIME)
    blocktime = KMP_MAX_BLOCKTIME;

  set__blocktime_team(thread->th.th_team, tid, blocktime);
  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);

#if KMP_USE_MONITOR
  /* Calculate and set blocktime intervals for the teams */
  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);

  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
#endif

  /* Set whether blocktime has been set to "TRUE" */
  bt_set = TRUE;

  set__bt_set_team(thread->th.th_team, tid, bt_set);
  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
#if KMP_USE_MONITOR
  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
                "bt_intervals=%d, monitor_updates=%d\n",
                __kmp_gtid_from_tid(tid, thread->th.th_team),
                thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
                __kmp_monitor_wakeups));
#else
  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
                __kmp_gtid_from_tid(tid, thread->th.th_team),
                thread->th.th_team->t.t_id, tid, blocktime));
#endif
}
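
// The user-visible ways to reach this routine are the KMP_BLOCKTIME
// environment variable and the kmp_set_blocktime() extension. Illustrative
// sketch only; the extension prototype is assumed to be available to user
// code when building against this runtime:
//
//   // KMP_BLOCKTIME=0 ./a.out        -> workers sleep right away at barriers
//   // KMP_BLOCKTIME=infinite ./a.out -> workers keep spinning
//   #include <omp.h>
//   int main() {
//     kmp_set_blocktime(0); // extension, assumed; same effect as the env var
//   #pragma omp parallel
//     {
//       // ... parallel work ...
//     }
//     return 0;
//   }
//
// Whatever the source, the value is clamped above into
// [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME] before being stored in the team's
// internal control variables.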

void __kmp_aux_set_defaults(char const *str, int len) {
  if (!__kmp_init_serial) {
    __kmp_serial_initialize();
  }
  __kmp_env_initialize(str);

  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
    __kmp_env_print();
  }
} // __kmp_aux_set_defaults

/* ------------------------------------------------------------------------ */
/* internal fast reduction routines */

PACKED_REDUCTION_METHOD_T
__kmp_determine_reduction_method(
    ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
    void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
    kmp_critical_name *lck) {

  // Default reduction method: critical construct ( lck != NULL, like in
  // current PAROPT )
  // If ( reduce_data != NULL && reduce_func != NULL ): the tree-reduction
  // method can be selected by the RTL
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by the RTL
  // Finally, it's up to the OpenMP RTL to decide which of the methods
  // generated by PAROPT to select.

  PACKED_REDUCTION_METHOD_T retval;

  int team_size;

  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )

#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))

  retval = critical_reduce_block;

  // another choice of getting a team size (with 1 dynamic dereference) is
  // slower
  team_size = __kmp_get_team_num_threads(global_tid);
  if (team_size == 1) {

    retval = empty_reduce_block;

  } else {

    int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;

#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
    KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64

#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
    KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

    int teamsize_cutoff = 4;

#if KMP_MIC_SUPPORTED
    if (__kmp_mic_type != non_mic) {
      teamsize_cutoff = 8;
    }
#endif
    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (tree_available) {
      if (team_size <= teamsize_cutoff) {
        if (atomic_available) {
          retval = atomic_reduce_block;
        }
      } else {
        retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
      }
    } else if (atomic_available) {
      retval = atomic_reduce_block;
    }
#else
#error "Unknown or unsupported OS"
#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
    // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS

#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD

    // basic tuning

    if (atomic_available) {
      if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
        retval = atomic_reduce_block;
      }
    } // otherwise: use critical section

#elif KMP_OS_DARWIN

    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (atomic_available && (num_vars <= 3)) {
      retval = atomic_reduce_block;
    } else if (tree_available) {
      if ((reduce_size > (9 * sizeof(kmp_real64))) &&
          (reduce_size < (2000 * sizeof(kmp_real64)))) {
        retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
      }
    } // otherwise: use critical section

#else
#error "Unknown or unsupported OS"
#endif

#else
#error "Unknown or unsupported architecture"
#endif
  }

  // KMP_FORCE_REDUCTION

  // If the team is serialized (team_size == 1), ignore the forced reduction
  // method and stay with the unsynchronized method (empty_reduce_block)
  if (__kmp_force_reduction_method != reduction_method_not_defined &&
      team_size != 1) {

    PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;

    int atomic_available, tree_available;

    switch ((forced_retval = __kmp_force_reduction_method)) {
    case critical_reduce_block:
      KMP_ASSERT(lck); // lck should be != 0
      break;

    case atomic_reduce_block:
      atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
      if (!atomic_available) {
        KMP_WARNING(RedMethodNotSupported, "atomic");
        forced_retval = critical_reduce_block;
      }
      break;

    case tree_reduce_block:
      tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
      if (!tree_available) {
        KMP_WARNING(RedMethodNotSupported, "tree");
        forced_retval = critical_reduce_block;
      } else {
#if KMP_FAST_REDUCTION_BARRIER
        forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
#endif
      }
      break;

    default:
      KMP_ASSERT(0); // "unsupported method specified"
    }

    retval = forced_retval;
  }

  KA_TRACE(10, ("reduction method selected=%08x\n", retval));

#undef FAST_REDUCTION_TREE_METHOD_GENERATED
#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED

  return (retval);
}
// this function is for testing set/get/determine reduce method
kmp_int32 __kmp_get_reduce_method(void) {
  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
}

// Soft pause sets up threads to ignore blocktime and just go to sleep.
// Spin-wait code checks __kmp_pause_status and reacts accordingly.
void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }

// Hard pause shuts down the runtime completely. Resume happens naturally when
// OpenMP is used subsequently.
void __kmp_hard_pause() {
  __kmp_pause_status = kmp_hard_paused;
  __kmp_internal_end_thread(-1);
}

// Soft resume sets __kmp_pause_status, and wakes up all threads.
void __kmp_resume_if_soft_paused() {
  if (__kmp_pause_status == kmp_soft_paused) {
    __kmp_pause_status = kmp_not_paused;

    for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_info_t *thread = __kmp_threads[gtid];
      if (thread) { // Wake it if sleeping
        kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
                         thread);
        if (fl.is_sleeping())
          fl.resume(gtid);
        else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
          __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
        } else { // thread holds the lock and may sleep soon
          do { // until either the thread sleeps, or we can get the lock
            if (fl.is_sleeping()) {
              fl.resume(gtid);
              break;
            } else if (__kmp_try_suspend_mx(thread)) {
              __kmp_unlock_suspend_mx(thread);
              break;
            }
          } while (1);
        }
      }
    }
  }
}

// This function is called via __kmpc_pause_resource. Returns 0 if successful.
// TODO: add warning messages
int __kmp_pause_resource(kmp_pause_status_t level) {
  if (level == kmp_not_paused) { // requesting resume
    if (__kmp_pause_status == kmp_not_paused) {
      // error message about runtime not being paused, so can't resume
      return 1;
    } else {
      KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
                       __kmp_pause_status == kmp_hard_paused);
      __kmp_pause_status = kmp_not_paused;
      return 0;
    }
  } else if (level == kmp_soft_paused) { // requesting soft pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_soft_pause();
      return 0;
    }
  } else if (level == kmp_hard_paused) { // requesting hard pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_hard_pause();
      return 0;
    }
  } else {
    // error message about invalid level
    return 1;
  }
}
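
// User code reaches this routine through the OpenMP 5.0 pause API.
// Illustrative sketch only (user code, not part of this runtime):
//
//   #include <omp.h>
//   int main() {
//   #pragma omp parallel
//     {
//       // ... warm up the thread pool ...
//     }
//     // Release OS resources of idle threads; a later parallel region
//     // re-initializes the runtime transparently, as the hard-pause comment
//     // above describes.
//     omp_pause_resource_all(omp_pause_hard);
//   #pragma omp parallel
//     {
//       // ... runtime is brought back up on demand ...
//     }
//     return 0;
//   }
//
// A nonzero return value here is surfaced as the API's failure indication.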

void __kmp_omp_display_env(int verbose) {
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (__kmp_init_serial == 0)
    __kmp_do_serial_initialize();
  __kmp_display_env_impl(!verbose, verbose);
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}
