1 /*
2 * kmp_barrier.cpp
3 */
4
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "kmp.h"
14 #include "kmp_wait_release.h"
15 #include "kmp_itt.h"
16 #include "kmp_os.h"
17 #include "kmp_stats.h"
18 #include "ompt-specific.h"
19
20 #if KMP_MIC
21 #include <immintrin.h>
22 #define USE_NGO_STORES 1
23 #endif // KMP_MIC
24
25 #include "tsan_annotations.h"
26
27 #if KMP_MIC && USE_NGO_STORES
28 // ICV copying
29 #define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
30 #define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31 #define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32 #define ngo_sync() __asm__ volatile("lock; addl $0,0(%%rsp)" ::: "memory")
33 #else
34 #define ngo_load(src) ((void)0)
35 #define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
36 #define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
37 #define ngo_sync() ((void)0)
38 #endif /* KMP_MIC && USE_NGO_STORES */
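// Note on the macros above: on KNC (KMP_MIC) with USE_NGO_STORES, a whole
// cache line of ICVs is loaded once into a 512-bit register (ngo_load) and
// then written to many destinations with non-globally-ordered streaming
// stores (ngo_store_icvs / ngo_store_go), followed by a single fence
// (ngo_sync). Elsewhere the macros fall back to plain copies. The usage
// pattern in the release paths below is roughly:
//   ngo_load(&master_icvs);
//   for (each child) ngo_store_icvs(&child_icvs, &master_icvs);
//   ngo_sync();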
39
40 void __kmp_print_structure(void); // Forward declaration
41
42 // ---------------------------- Barrier Algorithms ----------------------------
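// A rough sketch of the flag protocol shared by the algorithms below: each
// thread has a 64-bit b_arrived counter that it bumps by
// KMP_BARRIER_STATE_BUMP (via flag.release()) to signal its parent/master,
// and a b_go flag that the parent/master bumps to wake it; workers wait for
// b_go == KMP_BARRIER_STATE_BUMP and then reset it to KMP_INIT_BARRIER_STATE
// for the next barrier instance. The hierarchical barrier additionally packs
// per-leaf byte flags into single 64-bit words (see the oncore paths below).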
43
44 // Linear Barrier
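// In the linear scheme every worker signals the master directly and the
// master releases every worker directly: both phases are O(nproc) work on
// the master, with no worker-to-worker communication.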
45 template <bool cancellable = false>
static bool __kmp_linear_barrier_gather_template(
47 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
48 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
49 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
50 kmp_team_t *team = this_thr->th.th_team;
51 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
52 kmp_info_t **other_threads = team->t.t_threads;
53
54 KA_TRACE(
55 20,
56 ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
57 gtid, team->t.t_id, tid, bt));
58 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
59
60 #if USE_ITT_BUILD && USE_ITT_NOTIFY
61 // Barrier imbalance - save arrive time to the thread
62 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
63 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
64 __itt_get_timestamp();
65 }
66 #endif
67 // We now perform a linear reduction to signal that all of the threads have
68 // arrived.
69 if (!KMP_MASTER_TID(tid)) {
70 KA_TRACE(20,
71 ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
72 "arrived(%p): %llu => %llu\n",
73 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(0, team),
74 team->t.t_id, 0, &thr_bar->b_arrived, thr_bar->b_arrived,
75 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
76 // Mark arrival to master thread
77 /* After performing this write, a worker thread may not assume that the team
78 is valid any more - it could be deallocated by the master thread at any
79 time. */
80 ANNOTATE_BARRIER_BEGIN(this_thr);
81 kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[0]);
82 flag.release();
83 } else {
84 kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
85 int nproc = this_thr->th.th_team_nproc;
86 int i;
87 // Don't have to worry about sleep bit here or atomic since team setting
88 kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
89
90 // Collect all the worker team member threads.
91 for (i = 1; i < nproc; ++i) {
92 #if KMP_CACHE_MANAGE
93 // Prefetch next thread's arrived count
94 if (i + 1 < nproc)
95 KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_arrived);
96 #endif /* KMP_CACHE_MANAGE */
97 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
98 "arrived(%p) == %llu\n",
99 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
100 team->t.t_id, i,
101 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
102
103 // Wait for worker thread to arrive
104 if (cancellable) {
105 kmp_flag_64<true, false> flag(
106 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
107 if (flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)))
108 return true;
109 } else {
110 kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived,
111 new_state);
112 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
113 }
114 ANNOTATE_BARRIER_END(other_threads[i]);
115 #if USE_ITT_BUILD && USE_ITT_NOTIFY
116 // Barrier imbalance - write min of the thread time and the other thread
117 // time to the thread.
118 if (__kmp_forkjoin_frames_mode == 2) {
119 this_thr->th.th_bar_min_time = KMP_MIN(
120 this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time);
121 }
122 #endif
123 if (reduce) {
124 KA_TRACE(100,
125 ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
126 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
127 team->t.t_id, i));
128 ANNOTATE_REDUCE_AFTER(reduce);
129 OMPT_REDUCTION_DECL(this_thr, gtid);
130 OMPT_REDUCTION_BEGIN;
131 (*reduce)(this_thr->th.th_local.reduce_data,
132 other_threads[i]->th.th_local.reduce_data);
133 OMPT_REDUCTION_END;
134 ANNOTATE_REDUCE_BEFORE(reduce);
135 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
136 }
137 }
138 // Don't have to worry about sleep bit here or atomic since team setting
139 team_bar->b_arrived = new_state;
140 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d "
141 "arrived(%p) = %llu\n",
142 gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived,
143 new_state));
144 }
145 KA_TRACE(
146 20,
147 ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
148 gtid, team->t.t_id, tid, bt));
149 return false;
150 }
151
152 template <bool cancellable = false>
static bool __kmp_linear_barrier_release_template(
154 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
155 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
156 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
157 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
158 kmp_team_t *team;
159
160 if (KMP_MASTER_TID(tid)) {
161 unsigned int i;
162 kmp_uint32 nproc = this_thr->th.th_team_nproc;
163 kmp_info_t **other_threads;
164
165 team = __kmp_threads[gtid]->th.th_team;
166 KMP_DEBUG_ASSERT(team != NULL);
167 other_threads = team->t.t_threads;
168
169 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for "
170 "barrier type %d\n",
171 gtid, team->t.t_id, tid, bt));
172
173 if (nproc > 1) {
174 #if KMP_BARRIER_ICV_PUSH
175 {
176 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
177 if (propagate_icvs) {
178 ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
179 for (i = 1; i < nproc; ++i) {
180 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i],
181 team, i, FALSE);
182 ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
183 &team->t.t_implicit_task_taskdata[0].td_icvs);
184 }
185 ngo_sync();
186 }
187 }
188 #endif // KMP_BARRIER_ICV_PUSH
189
190 // Now, release all of the worker threads
191 for (i = 1; i < nproc; ++i) {
192 #if KMP_CACHE_MANAGE
193 // Prefetch next thread's go flag
194 if (i + 1 < nproc)
195 KMP_CACHE_PREFETCH(&other_threads[i + 1]->th.th_bar[bt].bb.b_go);
196 #endif /* KMP_CACHE_MANAGE */
197 KA_TRACE(
198 20,
199 ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
200 "go(%p): %u => %u\n",
201 gtid, team->t.t_id, tid, other_threads[i]->th.th_info.ds.ds_gtid,
202 team->t.t_id, i, &other_threads[i]->th.th_bar[bt].bb.b_go,
203 other_threads[i]->th.th_bar[bt].bb.b_go,
204 other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
205 ANNOTATE_BARRIER_BEGIN(other_threads[i]);
206 kmp_flag_64<> flag(&other_threads[i]->th.th_bar[bt].bb.b_go,
207 other_threads[i]);
208 flag.release();
209 }
210 }
211 } else { // Wait for the MASTER thread to release us
212 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
213 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
214 if (cancellable) {
215 kmp_flag_64<true, false> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
216 if (flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj)))
217 return true;
218 } else {
219 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
220 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
221 }
222 ANNOTATE_BARRIER_END(this_thr);
223 #if USE_ITT_BUILD && USE_ITT_NOTIFY
224 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
225 // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is
226 // disabled)
227 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
228 // Cancel wait on previous parallel region...
229 __kmp_itt_task_starting(itt_sync_obj);
230
231 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
232 return false;
233
234 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
235 if (itt_sync_obj != NULL)
236 // Call prepare as early as possible for "new" barrier
237 __kmp_itt_task_finished(itt_sync_obj);
238 } else
239 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
240 // Early exit for reaping threads releasing forkjoin barrier
241 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
242 return false;
243 // The worker thread may now assume that the team is valid.
244 #ifdef KMP_DEBUG
245 tid = __kmp_tid_from_gtid(gtid);
246 team = __kmp_threads[gtid]->th.th_team;
247 #endif
248 KMP_DEBUG_ASSERT(team != NULL);
249 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
250 KA_TRACE(20,
251 ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
252 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
253 KMP_MB(); // Flush all pending memory write invalidates.
254 }
255 KA_TRACE(
256 20,
257 ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
258 gtid, team->t.t_id, tid, bt));
259 return false;
260 }
261
static void __kmp_linear_barrier_gather(
263 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
264 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
265 __kmp_linear_barrier_gather_template<false>(
266 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
267 }
268
static bool __kmp_linear_barrier_gather_cancellable(
270 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
271 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
272 return __kmp_linear_barrier_gather_template<true>(
273 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
274 }
275
static void __kmp_linear_barrier_release(
277 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
278 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
279 __kmp_linear_barrier_release_template<false>(
280 bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
281 }
282
static bool __kmp_linear_barrier_release_cancellable(
284 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
285 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
286 return __kmp_linear_barrier_release_template<true>(
287 bt, this_thr, gtid, tid, propagate_icvs USE_ITT_BUILD_ARG(itt_sync_obj));
288 }
289
290 // Tree barrier
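// In the tree scheme thread tid gathers from children
// (tid << branch_bits) + 1 .. (tid << branch_bits) + branch_factor and then
// signals its parent (tid - 1) >> branch_bits. For example, with
// branch_bits=2 (branch_factor=4), thread 0 waits on threads 1-4, thread 1
// waits on threads 5-8, and thread 5 reports to thread 1.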
291 static void
__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
293 int tid, void (*reduce)(void *, void *)
294 USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
295 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
296 kmp_team_t *team = this_thr->th.th_team;
297 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
298 kmp_info_t **other_threads = team->t.t_threads;
299 kmp_uint32 nproc = this_thr->th.th_team_nproc;
300 kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
301 kmp_uint32 branch_factor = 1 << branch_bits;
302 kmp_uint32 child;
303 kmp_uint32 child_tid;
304 kmp_uint64 new_state;
305
306 KA_TRACE(
307 20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
308 gtid, team->t.t_id, tid, bt));
309 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
310
311 #if USE_ITT_BUILD && USE_ITT_NOTIFY
312 // Barrier imbalance - save arrive time to the thread
313 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
314 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
315 __itt_get_timestamp();
316 }
317 #endif
318 // Perform tree gather to wait until all threads have arrived; reduce any
319 // required data as we go
320 child_tid = (tid << branch_bits) + 1;
321 if (child_tid < nproc) {
322 // Parent threads wait for all their children to arrive
323 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
324 child = 1;
325 do {
326 kmp_info_t *child_thr = other_threads[child_tid];
327 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
328 #if KMP_CACHE_MANAGE
329 // Prefetch next thread's arrived count
330 if (child + 1 <= branch_factor && child_tid + 1 < nproc)
331 KMP_CACHE_PREFETCH(
332 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_arrived);
333 #endif /* KMP_CACHE_MANAGE */
334 KA_TRACE(20,
335 ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
336 "arrived(%p) == %llu\n",
337 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
338 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
339 // Wait for child to arrive
340 kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
341 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
342 ANNOTATE_BARRIER_END(child_thr);
343 #if USE_ITT_BUILD && USE_ITT_NOTIFY
344 // Barrier imbalance - write min of the thread time and a child time to
345 // the thread.
346 if (__kmp_forkjoin_frames_mode == 2) {
347 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
348 child_thr->th.th_bar_min_time);
349 }
350 #endif
351 if (reduce) {
352 KA_TRACE(100,
353 ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
354 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
355 team->t.t_id, child_tid));
356 ANNOTATE_REDUCE_AFTER(reduce);
357 OMPT_REDUCTION_DECL(this_thr, gtid);
358 OMPT_REDUCTION_BEGIN;
359 (*reduce)(this_thr->th.th_local.reduce_data,
360 child_thr->th.th_local.reduce_data);
361 OMPT_REDUCTION_END;
362 ANNOTATE_REDUCE_BEFORE(reduce);
363 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
364 }
365 child++;
366 child_tid++;
367 } while (child <= branch_factor && child_tid < nproc);
368 }
369
370 if (!KMP_MASTER_TID(tid)) { // Worker threads
371 kmp_int32 parent_tid = (tid - 1) >> branch_bits;
372
373 KA_TRACE(20,
374 ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
375 "arrived(%p): %llu => %llu\n",
376 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
377 team->t.t_id, parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
378 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
379
380 // Mark arrival to parent thread
381 /* After performing this write, a worker thread may not assume that the team
382 is valid any more - it could be deallocated by the master thread at any
383 time. */
384 ANNOTATE_BARRIER_BEGIN(this_thr);
385 kmp_flag_64<> flag(&thr_bar->b_arrived, other_threads[parent_tid]);
386 flag.release();
387 } else {
388 // Need to update the team arrived pointer if we are the master thread
389 if (nproc > 1) // New value was already computed above
390 team->t.t_bar[bt].b_arrived = new_state;
391 else
392 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
393 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d "
394 "arrived(%p) = %llu\n",
395 gtid, team->t.t_id, tid, team->t.t_id,
396 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
397 }
398 KA_TRACE(20,
399 ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
400 gtid, team->t.t_id, tid, bt));
401 }
402
static void __kmp_tree_barrier_release(
404 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
405 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
406 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
407 kmp_team_t *team;
408 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
409 kmp_uint32 nproc;
410 kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
411 kmp_uint32 branch_factor = 1 << branch_bits;
412 kmp_uint32 child;
413 kmp_uint32 child_tid;
414
415 // Perform a tree release for all of the threads that have been gathered
416 if (!KMP_MASTER_TID(
417 tid)) { // Handle fork barrier workers who aren't part of a team yet
418 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n", gtid,
419 &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
420 // Wait for parent thread to release us
421 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
422 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
423 ANNOTATE_BARRIER_END(this_thr);
424 #if USE_ITT_BUILD && USE_ITT_NOTIFY
425 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
426 // In fork barrier where we could not get the object reliably (or
427 // ITTNOTIFY is disabled)
428 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
429 // Cancel wait on previous parallel region...
430 __kmp_itt_task_starting(itt_sync_obj);
431
432 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
433 return;
434
435 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
436 if (itt_sync_obj != NULL)
437 // Call prepare as early as possible for "new" barrier
438 __kmp_itt_task_finished(itt_sync_obj);
439 } else
440 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
441 // Early exit for reaping threads releasing forkjoin barrier
442 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
443 return;
444
445 // The worker thread may now assume that the team is valid.
446 team = __kmp_threads[gtid]->th.th_team;
447 KMP_DEBUG_ASSERT(team != NULL);
448 tid = __kmp_tid_from_gtid(gtid);
449
450 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
451 KA_TRACE(20,
452 ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n", gtid,
453 team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
454 KMP_MB(); // Flush all pending memory write invalidates.
455 } else {
456 team = __kmp_threads[gtid]->th.th_team;
457 KMP_DEBUG_ASSERT(team != NULL);
458 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for "
459 "barrier type %d\n",
460 gtid, team->t.t_id, tid, bt));
461 }
462 nproc = this_thr->th.th_team_nproc;
463 child_tid = (tid << branch_bits) + 1;
464
465 if (child_tid < nproc) {
466 kmp_info_t **other_threads = team->t.t_threads;
467 child = 1;
468 // Parent threads release all their children
469 do {
470 kmp_info_t *child_thr = other_threads[child_tid];
471 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
472 #if KMP_CACHE_MANAGE
473 // Prefetch next thread's go count
474 if (child + 1 <= branch_factor && child_tid + 1 < nproc)
475 KMP_CACHE_PREFETCH(
476 &other_threads[child_tid + 1]->th.th_bar[bt].bb.b_go);
477 #endif /* KMP_CACHE_MANAGE */
478
479 #if KMP_BARRIER_ICV_PUSH
480 {
481 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
482 if (propagate_icvs) {
483 __kmp_init_implicit_task(team->t.t_ident,
484 team->t.t_threads[child_tid], team,
485 child_tid, FALSE);
486 copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
487 &team->t.t_implicit_task_taskdata[0].td_icvs);
488 }
489 }
490 #endif // KMP_BARRIER_ICV_PUSH
491 KA_TRACE(20,
492 ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
493 "go(%p): %u => %u\n",
494 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
495 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
496 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
497 // Release child from barrier
498 ANNOTATE_BARRIER_BEGIN(child_thr);
499 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
500 flag.release();
501 child++;
502 child_tid++;
503 } while (child <= branch_factor && child_tid < nproc);
504 }
505 KA_TRACE(
506 20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
507 gtid, team->t.t_id, tid, bt));
508 }
509
510 // Hyper Barrier
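// The hypercube scheme gathers in rounds: at level L, a thread whose digit
// ((tid >> L) & (branch_factor - 1)) is non-zero signals the thread at
// tid & ~((1 << (L + branch_bits)) - 1) and drops out, while the others
// collect from partners at tid + k * (1 << L). For example, with
// branch_bits=1 the pairs are (0,1), (2,3), ... in round 0, then (0,2),
// (4,6), ... in round 1, and so on, so the master finishes in O(log(nproc))
// rounds.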
511 static void
__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid,
513 int tid, void (*reduce)(void *, void *)
514 USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
515 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
516 kmp_team_t *team = this_thr->th.th_team;
517 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
518 kmp_info_t **other_threads = team->t.t_threads;
519 kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
520 kmp_uint32 num_threads = this_thr->th.th_team_nproc;
521 kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
522 kmp_uint32 branch_factor = 1 << branch_bits;
523 kmp_uint32 offset;
524 kmp_uint32 level;
525
526 KA_TRACE(
527 20,
528 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
529 gtid, team->t.t_id, tid, bt));
530 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
531
532 #if USE_ITT_BUILD && USE_ITT_NOTIFY
533 // Barrier imbalance - save arrive time to the thread
534 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
535 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time =
536 __itt_get_timestamp();
537 }
538 #endif
539 /* Perform a hypercube-embedded tree gather to wait until all of the threads
540 have arrived, and reduce any required data as we go. */
541 kmp_flag_64<> p_flag(&thr_bar->b_arrived);
542 for (level = 0, offset = 1; offset < num_threads;
543 level += branch_bits, offset <<= branch_bits) {
544 kmp_uint32 child;
545 kmp_uint32 child_tid;
546
547 if (((tid >> level) & (branch_factor - 1)) != 0) {
548 kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) - 1);
549
550 KMP_MB(); // Synchronize parent and child threads.
551 KA_TRACE(20,
552 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
553 "arrived(%p): %llu => %llu\n",
554 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(parent_tid, team),
555 team->t.t_id, parent_tid, &thr_bar->b_arrived,
556 thr_bar->b_arrived,
557 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
558 // Mark arrival to parent thread
559 /* After performing this write (in the last iteration of the enclosing for
560 loop), a worker thread may not assume that the team is valid any more
561 - it could be deallocated by the master thread at any time. */
562 ANNOTATE_BARRIER_BEGIN(this_thr);
563 p_flag.set_waiter(other_threads[parent_tid]);
564 p_flag.release();
565 break;
566 }
567
568 // Parent threads wait for children to arrive
569 if (new_state == KMP_BARRIER_UNUSED_STATE)
570 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
571 for (child = 1, child_tid = tid + (1 << level);
572 child < branch_factor && child_tid < num_threads;
573 child++, child_tid += (1 << level)) {
574 kmp_info_t *child_thr = other_threads[child_tid];
575 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
576 #if KMP_CACHE_MANAGE
577 kmp_uint32 next_child_tid = child_tid + (1 << level);
578 // Prefetch next thread's arrived count
579 if (child + 1 < branch_factor && next_child_tid < num_threads)
580 KMP_CACHE_PREFETCH(
581 &other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
582 #endif /* KMP_CACHE_MANAGE */
583 KA_TRACE(20,
584 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
585 "arrived(%p) == %llu\n",
586 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
587 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
588 // Wait for child to arrive
589 kmp_flag_64<> c_flag(&child_bar->b_arrived, new_state);
590 c_flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
591 ANNOTATE_BARRIER_END(child_thr);
592 KMP_MB(); // Synchronize parent and child threads.
593 #if USE_ITT_BUILD && USE_ITT_NOTIFY
594 // Barrier imbalance - write min of the thread time and a child time to
595 // the thread.
596 if (__kmp_forkjoin_frames_mode == 2) {
597 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
598 child_thr->th.th_bar_min_time);
599 }
600 #endif
601 if (reduce) {
602 KA_TRACE(100,
603 ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
604 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
605 team->t.t_id, child_tid));
606 ANNOTATE_REDUCE_AFTER(reduce);
607 OMPT_REDUCTION_DECL(this_thr, gtid);
608 OMPT_REDUCTION_BEGIN;
609 (*reduce)(this_thr->th.th_local.reduce_data,
610 child_thr->th.th_local.reduce_data);
611 OMPT_REDUCTION_END;
612 ANNOTATE_REDUCE_BEFORE(reduce);
613 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
614 }
615 }
616 }
617
618 if (KMP_MASTER_TID(tid)) {
619 // Need to update the team arrived pointer if we are the master thread
620 if (new_state == KMP_BARRIER_UNUSED_STATE)
621 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
622 else
623 team->t.t_bar[bt].b_arrived = new_state;
624 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d "
625 "arrived(%p) = %llu\n",
626 gtid, team->t.t_id, tid, team->t.t_id,
627 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
628 }
629 KA_TRACE(
630 20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
631 gtid, team->t.t_id, tid, bt));
632 }
633
634 // The reverse versions seem to beat the forward versions overall
635 #define KMP_REVERSE_HYPER_BAR
static void __kmp_hyper_barrier_release(
637 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
638 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
639 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
640 kmp_team_t *team;
641 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
642 kmp_info_t **other_threads;
643 kmp_uint32 num_threads;
644 kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
645 kmp_uint32 branch_factor = 1 << branch_bits;
646 kmp_uint32 child;
647 kmp_uint32 child_tid;
648 kmp_uint32 offset;
649 kmp_uint32 level;
650
651 /* Perform a hypercube-embedded tree release for all of the threads that have
652 been gathered. If KMP_REVERSE_HYPER_BAR is defined (default) the threads
653 are released in the reverse order of the corresponding gather, otherwise
654 threads are released in the same order. */
655 if (KMP_MASTER_TID(tid)) { // master
656 team = __kmp_threads[gtid]->th.th_team;
657 KMP_DEBUG_ASSERT(team != NULL);
658 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for "
659 "barrier type %d\n",
660 gtid, team->t.t_id, tid, bt));
661 #if KMP_BARRIER_ICV_PUSH
662 if (propagate_icvs) { // master already has ICVs in final destination; copy
663 copy_icvs(&thr_bar->th_fixed_icvs,
664 &team->t.t_implicit_task_taskdata[tid].td_icvs);
665 }
666 #endif
667 } else { // Handle fork barrier workers who aren't part of a team yet
668 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n", gtid,
669 &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
670 // Wait for parent thread to release us
671 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
672 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
673 ANNOTATE_BARRIER_END(this_thr);
674 #if USE_ITT_BUILD && USE_ITT_NOTIFY
675 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
676 // In fork barrier where we could not get the object reliably
677 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
678 // Cancel wait on previous parallel region...
679 __kmp_itt_task_starting(itt_sync_obj);
680
681 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
682 return;
683
684 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
685 if (itt_sync_obj != NULL)
686 // Call prepare as early as possible for "new" barrier
687 __kmp_itt_task_finished(itt_sync_obj);
688 } else
689 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
690 // Early exit for reaping threads releasing forkjoin barrier
691 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
692 return;
693
694 // The worker thread may now assume that the team is valid.
695 team = __kmp_threads[gtid]->th.th_team;
696 KMP_DEBUG_ASSERT(team != NULL);
697 tid = __kmp_tid_from_gtid(gtid);
698
699 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
700 KA_TRACE(20,
701 ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
702 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
703 KMP_MB(); // Flush all pending memory write invalidates.
704 }
705 num_threads = this_thr->th.th_team_nproc;
706 other_threads = team->t.t_threads;
707
708 #ifdef KMP_REVERSE_HYPER_BAR
709 // Count up to correct level for parent
710 for (level = 0, offset = 1;
711 offset < num_threads && (((tid >> level) & (branch_factor - 1)) == 0);
712 level += branch_bits, offset <<= branch_bits)
713 ;
714
715 // Now go down from there
716 for (level -= branch_bits, offset >>= branch_bits; offset != 0;
717 level -= branch_bits, offset >>= branch_bits)
718 #else
719 // Go down the tree, level by level
720 for (level = 0, offset = 1; offset < num_threads;
721 level += branch_bits, offset <<= branch_bits)
722 #endif // KMP_REVERSE_HYPER_BAR
723 {
724 #ifdef KMP_REVERSE_HYPER_BAR
725 /* Now go in reverse order through the children, highest to lowest.
726 Initial setting of child is conservative here. */
727 child = num_threads >> ((level == 0) ? level : level - 1);
728 for (child = (child < branch_factor - 1) ? child : branch_factor - 1,
729 child_tid = tid + (child << level);
730 child >= 1; child--, child_tid -= (1 << level))
731 #else
732 if (((tid >> level) & (branch_factor - 1)) != 0)
      // No need to go any lower than this, since this is the level at which
      // the parent would be notified
735 break;
736 // Iterate through children on this level of the tree
737 for (child = 1, child_tid = tid + (1 << level);
738 child < branch_factor && child_tid < num_threads;
739 child++, child_tid += (1 << level))
740 #endif // KMP_REVERSE_HYPER_BAR
741 {
742 if (child_tid >= num_threads)
743 continue; // Child doesn't exist so keep going
744 else {
745 kmp_info_t *child_thr = other_threads[child_tid];
746 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
747 #if KMP_CACHE_MANAGE
748 kmp_uint32 next_child_tid = child_tid - (1 << level);
749 // Prefetch next thread's go count
750 #ifdef KMP_REVERSE_HYPER_BAR
751 if (child - 1 >= 1 && next_child_tid < num_threads)
752 #else
753 if (child + 1 < branch_factor && next_child_tid < num_threads)
754 #endif // KMP_REVERSE_HYPER_BAR
755 KMP_CACHE_PREFETCH(
756 &other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
757 #endif /* KMP_CACHE_MANAGE */
758
759 #if KMP_BARRIER_ICV_PUSH
760 if (propagate_icvs) // push my fixed ICVs to my child
761 copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
762 #endif // KMP_BARRIER_ICV_PUSH
763
764 KA_TRACE(
765 20,
766 ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
767 "go(%p): %u => %u\n",
768 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
769 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
770 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
771 // Release child from barrier
772 ANNOTATE_BARRIER_BEGIN(child_thr);
773 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
774 flag.release();
775 }
776 }
777 }
778 #if KMP_BARRIER_ICV_PUSH
779 if (propagate_icvs &&
780 !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
781 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
782 FALSE);
783 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
784 &thr_bar->th_fixed_icvs);
785 }
786 #endif
787 KA_TRACE(
788 20,
789 ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
790 gtid, team->t.t_id, tid, bt));
791 }
792
793 // Hierarchical Barrier
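// The hierarchical barrier maps the team onto a machine hierarchy obtained
// from __kmp_get_hierarchy(). thr_bar->skip_per_level[d] is the tid distance
// between consecutive children of a subtree root at level d, and a subtree
// root at level d is responsible for tids [tid, tid + skip_per_level[d+1]).
// When blocktime is infinite and the barrier is not nested
// (use_oncore_barrier), leaf children check in and are released through
// individual bytes of their parent's 64-bit flags rather than through their
// own b_arrived/b_go; see the offset/leaf_state setup below.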
794
795 // Initialize thread barrier data
796 /* Initializes/re-initializes the hierarchical barrier data stored on a thread.
797 Performs the minimum amount of initialization required based on how the team
798 has changed. Returns true if leaf children will require both on-core and
799 traditional wake-up mechanisms. For example, if the team size increases,
800 threads already in the team will respond to on-core wakeup on their parent
   thread, but threads newly added to the team will only be listening on
   their local b_go. */
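// Illustration of the leaf flag layout set up below: a leaf child occupies
// one byte of its parent's 64-bit flags, at byte index
// offset = 7 - (tid - parent_tid - 1). E.g. a parent with tid 0 and leaf
// kids 1..4 waits on bytes 7, 6, 5 and 4 of its own b_arrived (leaf_state
// has exactly those bytes set), and later releases the same kids through the
// corresponding bytes of its b_go.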
static bool __kmp_init_hierarchical_barrier_thread(enum barrier_type bt,
804 kmp_bstate_t *thr_bar,
805 kmp_uint32 nproc, int gtid,
806 int tid, kmp_team_t *team) {
807 // Checks to determine if (re-)initialization is needed
808 bool uninitialized = thr_bar->team == NULL;
809 bool team_changed = team != thr_bar->team;
810 bool team_sz_changed = nproc != thr_bar->nproc;
811 bool tid_changed = tid != thr_bar->old_tid;
812 bool retval = false;
813
814 if (uninitialized || team_sz_changed) {
815 __kmp_get_hierarchy(nproc, thr_bar);
816 }
817
818 if (uninitialized || team_sz_changed || tid_changed) {
819 thr_bar->my_level = thr_bar->depth - 1; // default for master
820 thr_bar->parent_tid = -1; // default for master
821 if (!KMP_MASTER_TID(
822 tid)) { // if not master, find parent thread in hierarchy
823 kmp_uint32 d = 0;
824 while (d < thr_bar->depth) { // find parent based on level of thread in
825 // hierarchy, and note level
826 kmp_uint32 rem;
827 if (d == thr_bar->depth - 2) { // reached level right below the master
828 thr_bar->parent_tid = 0;
829 thr_bar->my_level = d;
830 break;
831 } else if ((rem = tid % thr_bar->skip_per_level[d + 1]) !=
832 0) { // TODO: can we make this op faster?
          // thread is not a subtree root at the next level, so this is its
          // max level
834 thr_bar->parent_tid = tid - rem;
835 thr_bar->my_level = d;
836 break;
837 }
838 ++d;
839 }
840 }
841 thr_bar->offset = 7 - (tid - thr_bar->parent_tid - 1);
842 thr_bar->old_tid = tid;
843 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
844 thr_bar->team = team;
845 thr_bar->parent_bar =
846 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
847 }
848 if (uninitialized || team_changed || tid_changed) {
849 thr_bar->team = team;
850 thr_bar->parent_bar =
851 &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
852 retval = true;
853 }
854 if (uninitialized || team_sz_changed || tid_changed) {
855 thr_bar->nproc = nproc;
856 thr_bar->leaf_kids = thr_bar->base_leaf_kids;
857 if (thr_bar->my_level == 0)
858 thr_bar->leaf_kids = 0;
859 if (thr_bar->leaf_kids && (kmp_uint32)tid + thr_bar->leaf_kids + 1 > nproc)
860 thr_bar->leaf_kids = nproc - tid - 1;
861 thr_bar->leaf_state = 0;
862 for (int i = 0; i < thr_bar->leaf_kids; ++i)
863 ((char *)&(thr_bar->leaf_state))[7 - i] = 1;
864 }
865 return retval;
866 }
867
static void __kmp_hierarchical_barrier_gather(
869 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
870 void (*reduce)(void *, void *) USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
871 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
872 kmp_team_t *team = this_thr->th.th_team;
873 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
874 kmp_uint32 nproc = this_thr->th.th_team_nproc;
875 kmp_info_t **other_threads = team->t.t_threads;
876 kmp_uint64 new_state;
877
878 int level = team->t.t_level;
879 if (other_threads[0]
880 ->th.th_teams_microtask) // are we inside the teams construct?
881 if (this_thr->th.th_teams_size.nteams > 1)
882 ++level; // level was not increased in teams construct for team_of_masters
883 if (level == 1)
884 thr_bar->use_oncore_barrier = 1;
885 else
886 thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
887
888 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for "
889 "barrier type %d\n",
890 gtid, team->t.t_id, tid, bt));
891 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
892
893 #if USE_ITT_BUILD && USE_ITT_NOTIFY
894 // Barrier imbalance - save arrive time to the thread
895 if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
896 this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
897 }
898 #endif
899
900 (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid,
901 team);
902
903 if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
904 kmp_int32 child_tid;
905 new_state =
906 (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
907 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
908 thr_bar->use_oncore_barrier) {
909 if (thr_bar->leaf_kids) {
910 // First, wait for leaf children to check-in on my b_arrived flag
911 kmp_uint64 leaf_state =
912 KMP_MASTER_TID(tid)
913 ? thr_bar->b_arrived | thr_bar->leaf_state
914 : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
915 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting "
916 "for leaf kids\n",
917 gtid, team->t.t_id, tid));
918 kmp_flag_64<> flag(&thr_bar->b_arrived, leaf_state);
919 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
920 if (reduce) {
921 ANNOTATE_REDUCE_AFTER(reduce);
922 OMPT_REDUCTION_DECL(this_thr, gtid);
923 OMPT_REDUCTION_BEGIN;
924 for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
925 ++child_tid) {
926 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
927 "T#%d(%d:%d)\n",
928 gtid, team->t.t_id, tid,
929 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
930 child_tid));
931 ANNOTATE_BARRIER_END(other_threads[child_tid]);
932 (*reduce)(this_thr->th.th_local.reduce_data,
933 other_threads[child_tid]->th.th_local.reduce_data);
934 }
935 OMPT_REDUCTION_END;
936 ANNOTATE_REDUCE_BEFORE(reduce);
937 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
938 }
939 // clear leaf_state bits
940 KMP_TEST_THEN_AND64(&thr_bar->b_arrived, ~(thr_bar->leaf_state));
941 }
942 // Next, wait for higher level children on each child's b_arrived flag
943 for (kmp_uint32 d = 1; d < thr_bar->my_level;
944 ++d) { // gather lowest level threads first, but skip 0
945 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
946 skip = thr_bar->skip_per_level[d];
947 if (last > nproc)
948 last = nproc;
949 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
950 kmp_info_t *child_thr = other_threads[child_tid];
951 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
952 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
953 "T#%d(%d:%d) "
954 "arrived(%p) == %llu\n",
955 gtid, team->t.t_id, tid,
956 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
957 child_tid, &child_bar->b_arrived, new_state));
958 kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
959 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
960 ANNOTATE_BARRIER_END(child_thr);
961 if (reduce) {
962 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
963 "T#%d(%d:%d)\n",
964 gtid, team->t.t_id, tid,
965 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
966 child_tid));
967 ANNOTATE_REDUCE_AFTER(reduce);
968 (*reduce)(this_thr->th.th_local.reduce_data,
969 child_thr->th.th_local.reduce_data);
970 ANNOTATE_REDUCE_BEFORE(reduce);
971 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
972 }
973 }
974 }
975 } else { // Blocktime is not infinite
976 for (kmp_uint32 d = 0; d < thr_bar->my_level;
977 ++d) { // Gather lowest level threads first
978 kmp_uint32 last = tid + thr_bar->skip_per_level[d + 1],
979 skip = thr_bar->skip_per_level[d];
980 if (last > nproc)
981 last = nproc;
982 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
983 kmp_info_t *child_thr = other_threads[child_tid];
984 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
985 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait "
986 "T#%d(%d:%d) "
987 "arrived(%p) == %llu\n",
988 gtid, team->t.t_id, tid,
989 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
990 child_tid, &child_bar->b_arrived, new_state));
991 kmp_flag_64<> flag(&child_bar->b_arrived, new_state);
992 flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
993 ANNOTATE_BARRIER_END(child_thr);
994 if (reduce) {
995 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
996 "T#%d(%d:%d)\n",
997 gtid, team->t.t_id, tid,
998 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
999 child_tid));
1000 ANNOTATE_REDUCE_AFTER(reduce);
1001 (*reduce)(this_thr->th.th_local.reduce_data,
1002 child_thr->th.th_local.reduce_data);
1003 ANNOTATE_REDUCE_BEFORE(reduce);
1004 ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
1005 }
1006 }
1007 }
1008 }
1009 }
1010 // All subordinates are gathered; now release parent if not master thread
1011
1012 if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
1013 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing"
1014 " T#%d(%d:%d) arrived(%p): %llu => %llu\n",
1015 gtid, team->t.t_id, tid,
1016 __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id,
1017 thr_bar->parent_tid, &thr_bar->b_arrived, thr_bar->b_arrived,
1018 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
1019 /* Mark arrival to parent: After performing this write, a worker thread may
1020 not assume that the team is valid any more - it could be deallocated by
1021 the master thread at any time. */
1022 if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ||
1023 !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived
1024 // flag; release it
1025 ANNOTATE_BARRIER_BEGIN(this_thr);
1026 kmp_flag_64<> flag(&thr_bar->b_arrived,
1027 other_threads[thr_bar->parent_tid]);
1028 flag.release();
1029 } else {
1030 // Leaf does special release on "offset" bits of parent's b_arrived flag
1031 thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
1032 kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
1033 flag.set_waiter(other_threads[thr_bar->parent_tid]);
1034 flag.release();
1035 }
1036 } else { // Master thread needs to update the team's b_arrived value
1037 team->t.t_bar[bt].b_arrived = new_state;
1038 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d "
1039 "arrived(%p) = %llu\n",
1040 gtid, team->t.t_id, tid, team->t.t_id,
1041 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
1042 }
1043 // Is the team access below unsafe or just technically invalid?
1044 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for "
1045 "barrier type %d\n",
1046 gtid, team->t.t_id, tid, bt));
1047 }
1048
static void __kmp_hierarchical_barrier_release(
1050 enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
1051 int propagate_icvs USE_ITT_BUILD_ARG(void *itt_sync_obj)) {
1052 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
1053 kmp_team_t *team;
1054 kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
1055 kmp_uint32 nproc;
1056 bool team_change = false; // indicates on-core barrier shouldn't be used
1057
1058 if (KMP_MASTER_TID(tid)) {
1059 team = __kmp_threads[gtid]->th.th_team;
1060 KMP_DEBUG_ASSERT(team != NULL);
1061 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master "
1062 "entered barrier type %d\n",
1063 gtid, team->t.t_id, tid, bt));
1064 } else { // Worker threads
1065 // Wait for parent thread to release me
1066 if (!thr_bar->use_oncore_barrier ||
1067 __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME || thr_bar->my_level != 0 ||
1068 thr_bar->team == NULL) {
1069 // Use traditional method of waiting on my own b_go flag
1070 thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
1071 kmp_flag_64<> flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
1072 flag.wait(this_thr, TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1073 ANNOTATE_BARRIER_END(this_thr);
1074 TCW_8(thr_bar->b_go,
1075 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1076 } else { // Thread barrier data is initialized, this is a leaf, blocktime is
1077 // infinite, not nested
1078 // Wait on my "offset" bits on parent's b_go flag
1079 thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
1080 kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP,
1081 thr_bar->offset, bt,
1082 this_thr USE_ITT_BUILD_ARG(itt_sync_obj));
1083 flag.wait(this_thr, TRUE);
1084 if (thr_bar->wait_flag ==
1085 KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
1086 TCW_8(thr_bar->b_go,
1087 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1088 } else { // Reset my bits on parent's b_go flag
1089 (RCAST(volatile char *,
1090 &(thr_bar->parent_bar->b_go)))[thr_bar->offset] = 0;
1091 }
1092 }
1093 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
1094 // Early exit for reaping threads releasing forkjoin barrier
1095 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
1096 return;
1097 // The worker thread may now assume that the team is valid.
1098 team = __kmp_threads[gtid]->th.th_team;
1099 KMP_DEBUG_ASSERT(team != NULL);
1100 tid = __kmp_tid_from_gtid(gtid);
1101
1102 KA_TRACE(
1103 20,
1104 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
1105 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
1106 KMP_MB(); // Flush all pending memory write invalidates.
1107 }
1108
1109 nproc = this_thr->th.th_team_nproc;
1110 int level = team->t.t_level;
1111 if (team->t.t_threads[0]
1112 ->th.th_teams_microtask) { // are we inside the teams construct?
1113 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1114 this_thr->th.th_teams_level == level)
1115 ++level; // level was not increased in teams construct for team_of_workers
1116 if (this_thr->th.th_teams_size.nteams > 1)
1117 ++level; // level was not increased in teams construct for team_of_masters
1118 }
1119 if (level == 1)
1120 thr_bar->use_oncore_barrier = 1;
1121 else
1122 thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
1123
1124 // If the team size has increased, we still communicate with old leaves via
1125 // oncore barrier.
1126 unsigned short int old_leaf_kids = thr_bar->leaf_kids;
1127 kmp_uint64 old_leaf_state = thr_bar->leaf_state;
1128 team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid,
1129 tid, team);
1130 // But if the entire team changes, we won't use oncore barrier at all
1131 if (team_change)
1132 old_leaf_kids = 0;
1133
1134 #if KMP_BARRIER_ICV_PUSH
1135 if (propagate_icvs) {
1136 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid,
1137 FALSE);
1138 if (KMP_MASTER_TID(
1139 tid)) { // master already has copy in final destination; copy
1140 copy_icvs(&thr_bar->th_fixed_icvs,
1141 &team->t.t_implicit_task_taskdata[tid].td_icvs);
1142 } else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1143 thr_bar->use_oncore_barrier) { // optimization for inf blocktime
1144 if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
1145 // leaves (on-core children) pull parent's fixed ICVs directly to local
1146 // ICV store
1147 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1148 &thr_bar->parent_bar->th_fixed_icvs);
1149 // non-leaves will get ICVs piggybacked with b_go via NGO store
1150 } else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
1151 if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can
1152 // access
1153 copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
1154 else // leaves copy parent's fixed ICVs directly to local ICV store
1155 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1156 &thr_bar->parent_bar->th_fixed_icvs);
1157 }
1158 }
1159 #endif // KMP_BARRIER_ICV_PUSH
1160
1161 // Now, release my children
1162 if (thr_bar->my_level) { // not a leaf
1163 kmp_int32 child_tid;
1164 kmp_uint32 last;
1165 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
1166 thr_bar->use_oncore_barrier) {
1167 if (KMP_MASTER_TID(tid)) { // do a flat release
1168 // Set local b_go to bump children via NGO store of the cache line
1169 // containing IVCs and b_go.
1170 thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
1171 // Use ngo stores if available; b_go piggybacks in the last 8 bytes of
1172 // the cache line
1173 ngo_load(&thr_bar->th_fixed_icvs);
1174 // This loops over all the threads skipping only the leaf nodes in the
1175 // hierarchy
1176 for (child_tid = thr_bar->skip_per_level[1]; child_tid < (int)nproc;
1177 child_tid += thr_bar->skip_per_level[1]) {
1178 kmp_bstate_t *child_bar =
1179 &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
1180 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1181 "releasing T#%d(%d:%d)"
1182 " go(%p): %u => %u\n",
1183 gtid, team->t.t_id, tid,
1184 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1185 child_tid, &child_bar->b_go, child_bar->b_go,
1186 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1187 // Use ngo store (if available) to both store ICVs and release child
1188 // via child's b_go
1189 ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
1190 }
1191 ngo_sync();
1192 }
1193 TCW_8(thr_bar->b_go,
1194 KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
1195 // Now, release leaf children
1196 if (thr_bar->leaf_kids) { // if there are any
1197 // We test team_change on the off-chance that the level 1 team changed.
1198 if (team_change ||
1199 old_leaf_kids < thr_bar->leaf_kids) { // some old, some new
1200 if (old_leaf_kids) { // release old leaf kids
1201 thr_bar->b_go |= old_leaf_state;
1202 }
1203 // Release new leaf kids
1204 last = tid + thr_bar->skip_per_level[1];
1205 if (last > nproc)
1206 last = nproc;
1207 for (child_tid = tid + 1 + old_leaf_kids; child_tid < (int)last;
1208 ++child_tid) { // skip_per_level[0]=1
1209 kmp_info_t *child_thr = team->t.t_threads[child_tid];
1210 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1211 KA_TRACE(
1212 20,
1213 ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
1214 " T#%d(%d:%d) go(%p): %u => %u\n",
1215 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1216 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1217 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1218 // Release child using child's b_go flag
1219 ANNOTATE_BARRIER_BEGIN(child_thr);
1220 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1221 flag.release();
1222 }
1223 } else { // Release all children at once with leaf_state bits on my own
1224 // b_go flag
1225 thr_bar->b_go |= thr_bar->leaf_state;
1226 }
1227 }
1228 } else { // Blocktime is not infinite; do a simple hierarchical release
1229 for (int d = thr_bar->my_level - 1; d >= 0;
1230 --d) { // Release highest level threads first
1231 last = tid + thr_bar->skip_per_level[d + 1];
1232 kmp_uint32 skip = thr_bar->skip_per_level[d];
1233 if (last > nproc)
1234 last = nproc;
1235 for (child_tid = tid + skip; child_tid < (int)last; child_tid += skip) {
1236 kmp_info_t *child_thr = team->t.t_threads[child_tid];
1237 kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1238 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) "
1239 "releasing T#%d(%d:%d) go(%p): %u => %u\n",
1240 gtid, team->t.t_id, tid,
1241 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
1242 child_tid, &child_bar->b_go, child_bar->b_go,
1243 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1244 // Release child using child's b_go flag
1245 ANNOTATE_BARRIER_BEGIN(child_thr);
1246 kmp_flag_64<> flag(&child_bar->b_go, child_thr);
1247 flag.release();
1248 }
1249 }
1250 }
1251 #if KMP_BARRIER_ICV_PUSH
1252 if (propagate_icvs && !KMP_MASTER_TID(tid))
1253 // non-leaves copy ICVs from fixed ICVs to local dest
1254 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1255 &thr_bar->th_fixed_icvs);
1256 #endif // KMP_BARRIER_ICV_PUSH
1257 }
1258 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for "
1259 "barrier type %d\n",
1260 gtid, team->t.t_id, tid, bt));
1261 }
1262
1263 // End of Barrier Algorithms
1264
1265 // type traits for cancellable value
1266 // if cancellable is true, then is_cancellable is a normal boolean variable
1267 // if cancellable is false, then is_cancellable is a compile time constant
1268 template <bool cancellable> struct is_cancellable {};
1269 template <> struct is_cancellable<true> {
1270 bool value;
  is_cancellable() : value(false) {}
  is_cancellable(bool b) : value(b) {}
  is_cancellable &operator=(bool b) {
1274 value = b;
1275 return *this;
1276 }
  operator bool() const { return value; }
1278 };
1279 template <> struct is_cancellable<false> {
  is_cancellable &operator=(bool b) { return *this; }
  constexpr operator bool() const { return false; }
1282 };
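// These traits let __kmp_barrier_template below be written once for both
// cases: with cancellable=false, `cancelled` collapses to a constexpr false
// so the compiler can fold the cancellation checks away, while with
// cancellable=true it behaves like an ordinary bool.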
1283
1284 // Internal function to do a barrier.
/* If is_split is true, do a split barrier, otherwise, do a plain barrier.
   If reduce is non-NULL, do a split reduction barrier, otherwise, do a split
   barrier.
   When cancellable = false:
     Returns 0 if master thread, 1 if worker thread.
   When cancellable = true:
     Returns 0 if not cancelled, 1 if cancelled. */
1292 template <bool cancellable = false>
static int __kmp_barrier_template(enum barrier_type bt, int gtid, int is_split,
1294 size_t reduce_size, void *reduce_data,
1295 void (*reduce)(void *, void *)) {
1296 KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1297 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1298 int tid = __kmp_tid_from_gtid(gtid);
1299 kmp_info_t *this_thr = __kmp_threads[gtid];
1300 kmp_team_t *team = this_thr->th.th_team;
1301 int status = 0;
1302 is_cancellable<cancellable> cancelled;
1303 #if OMPT_SUPPORT && OMPT_OPTIONAL
1304 ompt_data_t *my_task_data;
1305 ompt_data_t *my_parallel_data;
1306 void *return_address;
1307 ompt_sync_region_t barrier_kind;
1308 #endif
1309
1310 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n", gtid,
1311 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1312
1313 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1314 #if OMPT_SUPPORT
1315 if (ompt_enabled.enabled) {
1316 #if OMPT_OPTIONAL
1317 my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1318 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1319 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
1320 barrier_kind = __ompt_get_barrier_kind(bt, this_thr);
1321 if (ompt_enabled.ompt_callback_sync_region) {
1322 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1323 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1324 return_address);
1325 }
1326 if (ompt_enabled.ompt_callback_sync_region_wait) {
1327 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1328 barrier_kind, ompt_scope_begin, my_parallel_data, my_task_data,
1329 return_address);
1330 }
1331 #endif
1332 // It is OK to report the barrier state after the barrier begin callback.
1333 // According to the OMPT specification, a compliant implementation may
1334 // even delay reporting this state until the barrier begins to wait.
1335 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1336 }
1337 #endif
1338
1339 if (!team->t.t_serialized) {
1340 #if USE_ITT_BUILD
1341 // This value will be used in itt notify events below.
1342 void *itt_sync_obj = NULL;
1343 #if USE_ITT_NOTIFY
1344 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1345 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1346 #endif
1347 #endif /* USE_ITT_BUILD */
1348 if (__kmp_tasking_mode == tskm_extra_barrier) {
1349 __kmp_tasking_barrier(team, this_thr, gtid);
1350 KA_TRACE(15,
1351 ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1352 __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1353 }
1354
1355 /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1356 access it when the team struct is not guaranteed to exist. */
1357 // See note about the corresponding code in __kmp_join_barrier() being
1358 // performance-critical.
1359 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1360 #if KMP_USE_MONITOR
1361 this_thr->th.th_team_bt_intervals =
1362 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1363 this_thr->th.th_team_bt_set =
1364 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1365 #else
1366 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1367 #endif
1368 }
1369
1370 #if USE_ITT_BUILD
1371 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1372 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1373 #endif /* USE_ITT_BUILD */
1374 #if USE_DEBUGGER
1375 // Let the debugger know: the thread has arrived at the barrier and is waiting.
1376 if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1377 team->t.t_bar[bt].b_master_arrived += 1;
1378 } else {
1379 this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1380 } // if
1381 #endif /* USE_DEBUGGER */
1382 if (reduce != NULL) {
1383 // KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1384 this_thr->th.th_local.reduce_data = reduce_data;
1385 }
1386
1387 if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1388 // use 0 to only set up the current team if nthreads > 1
1389 __kmp_task_team_setup(this_thr, team, 0);
1390
1391 if (cancellable) {
1392 cancelled = __kmp_linear_barrier_gather_cancellable(
1393 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1394 } else {
1395 switch (__kmp_barrier_gather_pattern[bt]) {
1396 case bp_hyper_bar: {
1397 // don't set branch bits to 0; use linear
1398 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1399 __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid,
1400 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1401 break;
1402 }
1403 case bp_hierarchical_bar: {
1404 __kmp_hierarchical_barrier_gather(
1405 bt, this_thr, gtid, tid, reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1406 break;
1407 }
1408 case bp_tree_bar: {
1409 // don't set branch bits to 0; use linear
1410 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]);
1411 __kmp_tree_barrier_gather(bt, this_thr, gtid, tid,
1412 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1413 break;
1414 }
1415 default: {
1416 __kmp_linear_barrier_gather(bt, this_thr, gtid, tid,
1417 reduce USE_ITT_BUILD_ARG(itt_sync_obj));
1418 }
1419 }
1420 }
1421
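// Full memory barrier: make sure all stores from the gather phase are visible
// before the master-side bookkeeping (task-team wait, cancellation reset)
// below.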
1422 KMP_MB();
1423
1424 if (KMP_MASTER_TID(tid)) {
1425 status = 0;
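// Master thread: unless the barrier was cancelled, drain the task team so
// that all pending explicit tasks complete before the threads are released.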
1426 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1427 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1428 }
1429 #if USE_DEBUGGER
1430 // Let the debugger know: all threads have arrived and are starting to
1431 // leave the barrier.
1432 team->t.t_bar[bt].b_team_arrived += 1;
1433 #endif
1434
1435 if (__kmp_omp_cancellation) {
1436 kmp_int32 cancel_request = KMP_ATOMIC_LD_RLX(&team->t.t_cancel_request);
1437 // Reset cancellation flag for worksharing constructs
1438 if (cancel_request == cancel_loop ||
1439 cancel_request == cancel_sections) {
1440 KMP_ATOMIC_ST_RLX(&team->t.t_cancel_request, cancel_noreq);
1441 }
1442 }
1443 #if USE_ITT_BUILD
1444 /* TODO: In case of split reduction barrier, master thread may send
1445 acquired event early, before the final summation into the shared
1446 variable is done (final summation can be a long operation for array
1447 reductions). */
1448 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1449 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1450 #endif /* USE_ITT_BUILD */
1451 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1452 // Barrier - report frame end (only if active_level == 1)
1453 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1454 __kmp_forkjoin_frames_mode &&
1455 (this_thr->th.th_teams_microtask == NULL || // either not in teams
1456 this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1457 team->t.t_active_level == 1) {
1458 ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1459 kmp_uint64 cur_time = __itt_get_timestamp();
1460 kmp_info_t **other_threads = team->t.t_threads;
1461 int nproc = this_thr->th.th_team_nproc;
1462 int i;
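// __kmp_forkjoin_frames_mode selects what is reported to ITT here:
//   1 - submit a frame from th_frame_time (previous frame boundary) to now,
//       then advance th_frame_time;
//   2 - submit a frame starting at th_bar_min_time (barrier arrival);
//   3 - additionally accumulate every thread's wait time and report it as
//       imbalance metadata, then submit the frame as in mode 1.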
1463 switch (__kmp_forkjoin_frames_mode) {
1464 case 1:
1465 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1466 loc, nproc);
1467 this_thr->th.th_frame_time = cur_time;
1468 break;
1469 case 2: // AC 2015-01-19: currently does not work for hierarchical (to
1470 // be fixed)
1471 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time,
1472 1, loc, nproc);
1473 break;
1474 case 3:
1475 if (__itt_metadata_add_ptr) {
1476 // Initialize with master's wait time
1477 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1478 // Set arrive time to zero to be able to check it in
1479 // __kmp_invoke_task(); the same is done inside the loop below
1480 this_thr->th.th_bar_arrive_time = 0;
1481 for (i = 1; i < nproc; ++i) {
1482 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1483 other_threads[i]->th.th_bar_arrive_time = 0;
1484 }
1485 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1486 cur_time, delta,
1487 (kmp_uint64)(reduce != NULL));
1488 }
1489 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1490 loc, nproc);
1491 this_thr->th.th_frame_time = cur_time;
1492 break;
1493 }
1494 }
1495 #endif /* USE_ITT_BUILD */
1496 } else {
1497 status = 1;
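// Worker thread: always enters the release phase below; for a split barrier
// the master skips it and releases the team later from
// __kmp_end_split_barrier().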
1498 #if USE_ITT_BUILD
1499 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1500 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1501 #endif /* USE_ITT_BUILD */
1502 }
1503 if ((status == 1 || !is_split) && !cancelled) {
1504 if (cancellable) {
1505 cancelled = __kmp_linear_barrier_release_cancellable(
1506 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1507 } else {
1508 switch (__kmp_barrier_release_pattern[bt]) {
1509 case bp_hyper_bar: {
1510 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1511 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1512 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1513 break;
1514 }
1515 case bp_hierarchical_bar: {
1516 __kmp_hierarchical_barrier_release(
1517 bt, this_thr, gtid, tid, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1518 break;
1519 }
1520 case bp_tree_bar: {
1521 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1522 __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1523 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1524 break;
1525 }
1526 default: {
1527 __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1528 FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
1529 }
1530 }
1531 }
1532 if (__kmp_tasking_mode != tskm_immediate_exec && !cancelled) {
1533 __kmp_task_team_sync(this_thr, team);
1534 }
1535 }
1536
1537 #if USE_ITT_BUILD
1538 /* GEH: TODO: Move this under if-condition above and also include in
1539 __kmp_end_split_barrier(). This will more accurately represent the actual
1540 release time of the threads for split barriers. */
1541 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1542 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1543 #endif /* USE_ITT_BUILD */
1544 } else { // Team is serialized.
1545 status = 0;
1546 if (__kmp_tasking_mode != tskm_immediate_exec) {
1547 if (this_thr->th.th_task_team != NULL) {
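// Even a serialized team can have a task team with outstanding proxy tasks;
// wait for them to finish, then set up a fresh task team for the next region.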
1548 #if USE_ITT_NOTIFY
1549 void *itt_sync_obj = NULL;
1550 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1551 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1552 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1553 }
1554 #endif
1555
1556 KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks ==
1557 TRUE);
1558 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1559 __kmp_task_team_setup(this_thr, team, 0);
1560
1561 #if USE_ITT_BUILD
1562 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1563 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1564 #endif /* USE_ITT_BUILD */
1565 }
1566 }
1567 }
1568 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1569 gtid, __kmp_team_from_gtid(gtid)->t.t_id,
1570 __kmp_tid_from_gtid(gtid), status));
1571
1572 #if OMPT_SUPPORT
1573 if (ompt_enabled.enabled) {
1574 #if OMPT_OPTIONAL
1575 if (ompt_enabled.ompt_callback_sync_region_wait) {
1576 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1577 barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1578 return_address);
1579 }
1580 if (ompt_enabled.ompt_callback_sync_region) {
1581 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1582 barrier_kind, ompt_scope_end, my_parallel_data, my_task_data,
1583 return_address);
1584 }
1585 #endif
1586 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1587 }
1588 #endif
1589 ANNOTATE_BARRIER_END(&team->t.t_bar);
1590
1591 if (cancellable)
1592 return (int)cancelled;
1593 return status;
1594 }
1595
1596 // Returns 0 if master thread, 1 if worker thread.
1597 int __kmp_barrier(enum barrier_type bt, int gtid, int is_split,
1598 size_t reduce_size, void *reduce_data,
1599 void (*reduce)(void *, void *)) {
1600 return __kmp_barrier_template<>(bt, gtid, is_split, reduce_size, reduce_data,
1601 reduce);
1602 }
1603
1604 #if defined(KMP_GOMP_COMPAT)
1605 // Returns 1 if cancelled, 0 otherwise
1606 int __kmp_barrier_gomp_cancel(int gtid) {
1607 if (__kmp_omp_cancellation) {
1608 int cancelled = __kmp_barrier_template<true>(bs_plain_barrier, gtid, FALSE,
1609 0, NULL, NULL);
1610 if (cancelled) {
1611 int tid = __kmp_tid_from_gtid(gtid);
1612 kmp_info_t *this_thr = __kmp_threads[gtid];
1613 if (KMP_MASTER_TID(tid)) {
1614 // Master does not need to revert anything
1615 } else {
1616 // Workers need to revert their private b_arrived flag
1617 this_thr->th.th_bar[bs_plain_barrier].bb.b_arrived -=
1618 KMP_BARRIER_STATE_BUMP;
1619 }
1620 }
1621 return cancelled;
1622 }
1623 __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
1624 return FALSE;
1625 }
1626 #endif
1627
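// Second half of a split barrier: the master thread, which skipped the release
// phase in __kmp_barrier(..., is_split=TRUE, ...), now releases the workers
// using the configured release pattern.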
1628 void __kmp_end_split_barrier(enum barrier_type bt, int gtid) {
1629 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1630 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1631 int tid = __kmp_tid_from_gtid(gtid);
1632 kmp_info_t *this_thr = __kmp_threads[gtid];
1633 kmp_team_t *team = this_thr->th.th_team;
1634
1635 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1636 if (!team->t.t_serialized) {
1637 if (KMP_MASTER_GTID(gtid)) {
1638 switch (__kmp_barrier_release_pattern[bt]) {
1639 case bp_hyper_bar: {
1640 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1641 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid,
1642 FALSE USE_ITT_BUILD_ARG(NULL));
1643 break;
1644 }
1645 case bp_hierarchical_bar: {
1646 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid,
1647 FALSE USE_ITT_BUILD_ARG(NULL));
1648 break;
1649 }
1650 case bp_tree_bar: {
1651 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1652 __kmp_tree_barrier_release(bt, this_thr, gtid, tid,
1653 FALSE USE_ITT_BUILD_ARG(NULL));
1654 break;
1655 }
1656 default: {
1657 __kmp_linear_barrier_release(bt, this_thr, gtid, tid,
1658 FALSE USE_ITT_BUILD_ARG(NULL));
1659 }
1660 }
1661 if (__kmp_tasking_mode != tskm_immediate_exec) {
1662 __kmp_task_team_sync(this_thr, team);
1663 } // if
1664 }
1665 }
1666 ANNOTATE_BARRIER_END(&team->t.t_bar);
1667 }
1668
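// Implicit barrier at the end of a parallel region. Only the gather phase is
// performed here; the matching release happens at the next fork barrier. The
// master additionally waits for the task team and, with ITT enabled, reports
// frame and imbalance information.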
1669 void __kmp_join_barrier(int gtid) {
1670 KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
1671 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1672 kmp_info_t *this_thr = __kmp_threads[gtid];
1673 kmp_team_t *team;
1674 kmp_uint nproc;
1675 kmp_info_t *master_thread;
1676 int tid;
1677 #ifdef KMP_DEBUG
1678 int team_id;
1679 #endif /* KMP_DEBUG */
1680 #if USE_ITT_BUILD
1681 void *itt_sync_obj = NULL;
1682 #if USE_ITT_NOTIFY
1683 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine unless needed
1684 // Get object created at fork_barrier
1685 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1686 #endif
1687 #endif /* USE_ITT_BUILD */
1688 KMP_MB();
1689
1690 // Get current info
1691 team = this_thr->th.th_team;
1692 nproc = this_thr->th.th_team_nproc;
1693 KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1694 tid = __kmp_tid_from_gtid(gtid);
1695 #ifdef KMP_DEBUG
1696 team_id = team->t.t_id;
1697 #endif /* KMP_DEBUG */
1698 master_thread = this_thr->th.th_team_master;
1699 #ifdef KMP_DEBUG
1700 if (master_thread != team->t.t_threads[0]) {
1701 __kmp_print_structure();
1702 }
1703 #endif /* KMP_DEBUG */
1704 KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1705 KMP_MB();
1706
1707 // Verify state
1708 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1709 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1710 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1711 KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1712 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n",
1713 gtid, team_id, tid));
1714
1715 ANNOTATE_BARRIER_BEGIN(&team->t.t_bar);
1716 #if OMPT_SUPPORT
1717 if (ompt_enabled.enabled) {
1718 #if OMPT_OPTIONAL
1719 ompt_data_t *my_task_data;
1720 ompt_data_t *my_parallel_data;
1721 void *codeptr = NULL;
1722 int ds_tid = this_thr->th.th_info.ds.ds_tid;
1723 if (KMP_MASTER_TID(ds_tid) &&
1724 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
1725 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
1726 codeptr = team->t.ompt_team_info.master_return_address;
1727 my_task_data = OMPT_CUR_TASK_DATA(this_thr);
1728 my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
1729 if (ompt_enabled.ompt_callback_sync_region) {
1730 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
1731 ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1732 my_task_data, codeptr);
1733 }
1734 if (ompt_enabled.ompt_callback_sync_region_wait) {
1735 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
1736 ompt_sync_region_barrier_implicit, ompt_scope_begin, my_parallel_data,
1737 my_task_data, codeptr);
1738 }
1739 if (!KMP_MASTER_TID(ds_tid))
1740 this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr);
1741 #endif
1742 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier_implicit;
1743 }
1744 #endif
1745
1746 if (__kmp_tasking_mode == tskm_extra_barrier) {
1747 __kmp_tasking_barrier(team, this_thr, gtid);
1748 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid,
1749 team_id, tid));
1750 }
1751 #ifdef KMP_DEBUG
1752 if (__kmp_tasking_mode != tskm_immediate_exec) {
1753 KA_TRACE(20, ("__kmp_join_barrier: T#%d, old team = %d, old task_team = "
1754 "%p, th_task_team = %p\n",
1755 __kmp_gtid_from_thread(this_thr), team_id,
1756 team->t.t_task_team[this_thr->th.th_task_state],
1757 this_thr->th.th_task_team));
1758 KMP_DEBUG_ASSERT(this_thr->th.th_task_team ==
1759 team->t.t_task_team[this_thr->th.th_task_state]);
1760 }
1761 #endif /* KMP_DEBUG */
1762
1763 /* Copy the blocktime info to the thread, where __kmp_wait_template() can
1764 access it when the team struct is not guaranteed to exist. Doing these
1765 loads causes a cache miss that slows down EPCC parallel by 2x. As a workaround,
1766 we do not perform the copy if blocktime=infinite, since the values are not
1767 used by __kmp_wait_template() in that case. */
1768 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1769 #if KMP_USE_MONITOR
1770 this_thr->th.th_team_bt_intervals =
1771 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1772 this_thr->th.th_team_bt_set =
1773 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1774 #else
1775 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1776 #endif
1777 }
1778
1779 #if USE_ITT_BUILD
1780 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1781 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1782 #endif /* USE_ITT_BUILD */
1783
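// Gather phase of the join barrier; no reduction is performed, so NULL is
// passed for the reduce callback.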
1784 switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1785 case bp_hyper_bar: {
1786 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1787 __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1788 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1789 break;
1790 }
1791 case bp_hierarchical_bar: {
1792 __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1793 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1794 break;
1795 }
1796 case bp_tree_bar: {
1797 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1798 __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1799 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1800 break;
1801 }
1802 default: {
1803 __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid,
1804 NULL USE_ITT_BUILD_ARG(itt_sync_obj));
1805 }
1806 }
1807
1808 /* From this point on, the team data structure may be deallocated at any time
1809 by the master thread - it is unsafe to reference it in any of the worker
1810 threads. Any per-team data items that need to be referenced before the
1811 end of the barrier should be moved to the kmp_task_team_t structs. */
1812 if (KMP_MASTER_TID(tid)) {
1813 if (__kmp_tasking_mode != tskm_immediate_exec) {
1814 __kmp_task_team_wait(this_thr, team USE_ITT_BUILD_ARG(itt_sync_obj));
1815 }
1816 if (__kmp_display_affinity) {
1817 KMP_CHECK_UPDATE(team->t.t_display_affinity, 0);
1818 }
1819 #if KMP_STATS_ENABLED
1820 // Have the master thread flag the workers to indicate they are now waiting
1821 // for the next parallel region. Also wake them up so they switch their
1822 // timers to idle.
1823 for (int i = 0; i < team->t.t_nproc; ++i) {
1824 kmp_info_t *team_thread = team->t.t_threads[i];
1825 if (team_thread == this_thr)
1826 continue;
1827 team_thread->th.th_stats->setIdleFlag();
1828 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME &&
1829 team_thread->th.th_sleep_loc != NULL)
1830 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread),
1831 team_thread->th.th_sleep_loc);
1832 }
1833 #endif
1834 #if USE_ITT_BUILD
1835 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1836 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1837 #endif /* USE_ITT_BUILD */
1838
1839 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1840 // Join barrier - report frame end
1841 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) &&
1842 __kmp_forkjoin_frames_mode &&
1843 (this_thr->th.th_teams_microtask == NULL || // either not in teams
1844 this_thr->th.th_teams_size.nteams == 1) && // or inside single team
1845 team->t.t_active_level == 1) {
1846 kmp_uint64 cur_time = __itt_get_timestamp();
1847 ident_t *loc = team->t.t_ident;
1848 kmp_info_t **other_threads = team->t.t_threads;
1849 int nproc = this_thr->th.th_team_nproc;
1850 int i;
1851 switch (__kmp_forkjoin_frames_mode) {
1852 case 1:
1853 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1854 loc, nproc);
1855 break;
1856 case 2:
1857 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1,
1858 loc, nproc);
1859 break;
1860 case 3:
1861 if (__itt_metadata_add_ptr) {
1862 // Initialize with master's wait time
1863 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
1864 // Set arrive time to zero to be able to check it in
1865 // __kmp_invoke_task(); the same is done inside the loop below
1866 this_thr->th.th_bar_arrive_time = 0;
1867 for (i = 1; i < nproc; ++i) {
1868 delta += (cur_time - other_threads[i]->th.th_bar_arrive_time);
1869 other_threads[i]->th.th_bar_arrive_time = 0;
1870 }
1871 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time,
1872 cur_time, delta, 0);
1873 }
1874 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0,
1875 loc, nproc);
1876 this_thr->th.th_frame_time = cur_time;
1877 break;
1878 }
1879 }
1880 #endif /* USE_ITT_BUILD */
1881 }
1882 #if USE_ITT_BUILD
1883 else {
1884 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1885 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1886 }
1887 #endif /* USE_ITT_BUILD */
1888
1889 #if KMP_DEBUG
1890 if (KMP_MASTER_TID(tid)) {
1891 KA_TRACE(
1892 15,
1893 ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1894 gtid, team_id, tid, nproc));
1895 }
1896 #endif /* KMP_DEBUG */
1897
1898 // TODO now, mark worker threads as done so they may be disbanded
1899 KMP_MB(); // Flush all pending memory write invalidates.
1900 KA_TRACE(10,
1901 ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
1902
1903 ANNOTATE_BARRIER_END(&team->t.t_bar);
1904 }
1905
1906 // TODO release worker threads' fork barriers as we are ready instead of all at
1907 // once
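// Implicit barrier at the start of a parallel region: workers wait here to be
// released by the master. The master first sets up the task team and copies
// its blocktime settings so that workers can read them safely once released.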
1908 void __kmp_fork_barrier(int gtid, int tid) {
1909 KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
1910 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
1911 kmp_info_t *this_thr = __kmp_threads[gtid];
1912 kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1913 #if USE_ITT_BUILD
1914 void *itt_sync_obj = NULL;
1915 #endif /* USE_ITT_BUILD */
1916 if (team)
1917 ANNOTATE_BARRIER_END(&team->t.t_bar);
1918
1919 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid,
1920 (team != NULL) ? team->t.t_id : -1, tid));
1921
1922 // th_team pointer only valid for master thread here
1923 if (KMP_MASTER_TID(tid)) {
1924 #if USE_ITT_BUILD && USE_ITT_NOTIFY
1925 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1926 // Create itt barrier object
1927 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1928 __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1929 }
1930 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1931
1932 #ifdef KMP_DEBUG
1933 kmp_info_t **other_threads = team->t.t_threads;
1934 int i;
1935
1936 // Verify state
1937 KMP_MB();
1938
1939 for (i = 1; i < team->t.t_nproc; ++i) {
1940 KA_TRACE(500,
1941 ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go "
1942 "== %u.\n",
1943 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1944 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1945 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1946 KMP_DEBUG_ASSERT(
1947 (TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go) &
1948 ~(KMP_BARRIER_SLEEP_STATE)) == KMP_INIT_BARRIER_STATE);
1949 KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1950 }
1951 #endif
1952
1953 if (__kmp_tasking_mode != tskm_immediate_exec) {
1954 // 0 indicates: set up the current task team only if nthreads > 1
1955 __kmp_task_team_setup(this_thr, team, 0);
1956 }
1957
1958 /* The master thread may have changed its blocktime between the join barrier
1959 and the fork barrier. Copy the blocktime info to the thread, where
1960 __kmp_wait_template() can access it when the team struct is not
1961 guaranteed to exist. */
1962 // See note about the corresponding code in __kmp_join_barrier() being
1963 // performance-critical
1964 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1965 #if KMP_USE_MONITOR
1966 this_thr->th.th_team_bt_intervals =
1967 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1968 this_thr->th.th_team_bt_set =
1969 team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1970 #else
1971 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL(team, tid);
1972 #endif
1973 }
1974 } // master
1975
1976 switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1977 case bp_hyper_bar: {
1978 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1979 __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1980 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1981 break;
1982 }
1983 case bp_hierarchical_bar: {
1984 __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1985 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1986 break;
1987 }
1988 case bp_tree_bar: {
1989 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1990 __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1991 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1992 break;
1993 }
1994 default: {
1995 __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid,
1996 TRUE USE_ITT_BUILD_ARG(itt_sync_obj));
1997 }
1998 }
1999
2000 #if OMPT_SUPPORT
2001 if (ompt_enabled.enabled &&
2002 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) {
2003 int ds_tid = this_thr->th.th_info.ds.ds_tid;
2004 ompt_data_t *task_data = (team)
2005 ? OMPT_CUR_TASK_DATA(this_thr)
2006 : &(this_thr->th.ompt_thread_info.task_data);
2007 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
2008 #if OMPT_OPTIONAL
2009 void *codeptr = NULL;
2010 if (KMP_MASTER_TID(ds_tid) &&
2011 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
2012 ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
2013 codeptr = team->t.ompt_team_info.master_return_address;
2014 if (ompt_enabled.ompt_callback_sync_region_wait) {
2015 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
2016 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2017 codeptr);
2018 }
2019 if (ompt_enabled.ompt_callback_sync_region) {
2020 ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
2021 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data,
2022 codeptr);
2023 }
2024 #endif
2025 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
2026 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
2027 ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial?
2028 }
2029 }
2030 #endif
2031
2032 // Early exit for reaping threads releasing forkjoin barrier
2033 if (TCR_4(__kmp_global.g.g_done)) {
2034 this_thr->th.th_task_team = NULL;
2035
2036 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2037 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2038 if (!KMP_MASTER_TID(tid)) {
2039 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2040 if (itt_sync_obj)
2041 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
2042 }
2043 }
2044 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2045 KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
2046 return;
2047 }
2048
2049 /* We can now assume that a valid team structure has been allocated by the
2050 master and propagated to all worker threads. The current thread, however,
2051 may not be part of the team, so we can't blindly assume that the team
2052 pointer is non-null. */
2053 team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
2054 KMP_DEBUG_ASSERT(team != NULL);
2055 tid = __kmp_tid_from_gtid(gtid);
2056
2057 #if KMP_BARRIER_ICV_PULL
2058 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2059 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2060 implicit task has this data before this function is called. We cannot
2061 modify __kmp_fork_call() to look at the fixed ICVs in the master's thread
2062 struct, because it is not always the case that the threads arrays have
2063 been allocated when __kmp_fork_call() is executed. */
2064 {
2065 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
2066 if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
2067 // Copy the initial ICVs from the master's thread struct to the implicit
2068 // task for this tid.
2069 KA_TRACE(10,
2070 ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
2071 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team,
2072 tid, FALSE);
2073 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
2074 &team->t.t_threads[0]
2075 ->th.th_bar[bs_forkjoin_barrier]
2076 .bb.th_fixed_icvs);
2077 }
2078 }
2079 #endif // KMP_BARRIER_ICV_PULL
2080
2081 if (__kmp_tasking_mode != tskm_immediate_exec) {
2082 __kmp_task_team_sync(this_thr, team);
2083 }
2084
2085 #if KMP_AFFINITY_SUPPORTED
2086 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
2087 if (proc_bind == proc_bind_intel) {
2088 // Call dynamic affinity settings
2089 if (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
2090 __kmp_balanced_affinity(this_thr, team->t.t_nproc);
2091 }
2092 } else if (proc_bind != proc_bind_false) {
2093 if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
2094 KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
2095 __kmp_gtid_from_thread(this_thr),
2096 this_thr->th.th_current_place));
2097 } else {
2098 __kmp_affinity_set_place(gtid);
2099 }
2100 }
2101 #endif // KMP_AFFINITY_SUPPORTED
2102 // Perform the display affinity functionality
2103 if (__kmp_display_affinity) {
2104 if (team->t.t_display_affinity
2105 #if KMP_AFFINITY_SUPPORTED
2106 || (__kmp_affinity_type == affinity_balanced && team->t.t_size_changed)
2107 #endif
2108 ) {
2109 // NULL means use the affinity-format-var ICV
2110 __kmp_aux_display_affinity(gtid, NULL);
2111 this_thr->th.th_prev_num_threads = team->t.t_nproc;
2112 this_thr->th.th_prev_level = team->t.t_level;
2113 }
2114 }
2115 if (!KMP_MASTER_TID(tid))
2116 KMP_CHECK_UPDATE(this_thr->th.th_def_allocator, team->t.t_def_allocator);
2117
2118 #if USE_ITT_BUILD && USE_ITT_NOTIFY
2119 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
2120 if (!KMP_MASTER_TID(tid)) {
2121 // Get correct barrier object
2122 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
2123 __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
2124 } // (prepare called inside barrier_release)
2125 }
2126 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
2127 ANNOTATE_BARRIER_END(&team->t.t_bar);
2128 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid,
2129 team->t.t_id, tid));
2130 }
2131
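// Prepare propagation of the internal control variables (ICVs) to the team.
// Depending on configuration this either stashes them on the master thread for
// workers to PULL after the fork barrier, relies on the fork-barrier release to
// PUSH them, or copies them linearly into every implicit task right here.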
2132 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
2133 kmp_internal_control_t *new_icvs, ident_t *loc) {
2134 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
2135
2136 KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
2137 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
2138
2139 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
2140 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's
2141 implicit task has this data before this function is called. */
2142 #if KMP_BARRIER_ICV_PULL
2143 /* Copy the ICVs into th_fixed_icvs in the master thread's fork/join barrier
2144 state (which otherwise remains untouched), where all of the worker threads
2145 can access them and make their own copies after the barrier. */
2146 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2147 // allocated at this point
2148 copy_icvs(
2149 &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs,
2150 new_icvs);
2151 KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n", 0,
2152 team->t.t_threads[0], team));
2153 #elif KMP_BARRIER_ICV_PUSH
2154 // The ICVs will be propagated in the fork barrier, so nothing needs to be
2155 // done here.
2156 KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n", 0,
2157 team->t.t_threads[0], team));
2158 #else
2159 // Copy the ICVs to each of the non-master threads. This takes O(nthreads)
2160 // time.
2161 ngo_load(new_icvs);
2162 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be
2163 // allocated at this point
2164 for (int f = 1; f < new_nproc; ++f) { // Skip the master thread
2165 // TODO: GEH - pass in better source location info since usually NULL here
2166 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2167 f, team->t.t_threads[f], team));
2168 __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
2169 ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
2170 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
2171 f, team->t.t_threads[f], team));
2172 }
2173 ngo_sync();
2174 #endif // KMP_BARRIER_ICV_PULL
2175 }
2176