1 /*
2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3 */
4
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12
/* Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is constant inside any dispatch loop; however, it may
 * change between parallel regions. __kmp_max_nth is the largest value
 * __kmp_nth may take, and 1 is the smallest.
 */
19
20 #include "kmp.h"
21 #include "kmp_error.h"
22 #include "kmp_i18n.h"
23 #include "kmp_itt.h"
24 #include "kmp_stats.h"
25 #include "kmp_str.h"
26 #if KMP_USE_X87CONTROL
27 #include <float.h>
28 #endif
29 #include "kmp_lock.h"
30 #include "kmp_dispatch.h"
31 #if KMP_USE_HIER_SCHED
32 #include "kmp_dispatch_hier.h"
33 #endif
34
35 #if OMPT_SUPPORT
36 #include "ompt-specific.h"
37 #endif
38
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
41
void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43 kmp_info_t *th;
44
45 KMP_DEBUG_ASSERT(gtid_ref);
46
47 if (__kmp_env_consistency_check) {
48 th = __kmp_threads[*gtid_ref];
49 if (th->th.th_root->r.r_active &&
50 (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51 #if KMP_USE_DYNAMIC_LOCK
52 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53 #else
54 __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55 #endif
56 }
57 }
58 }
59
void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61 kmp_info_t *th;
62
63 if (__kmp_env_consistency_check) {
64 th = __kmp_threads[*gtid_ref];
65 if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66 __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67 }
68 }
69 }
70
71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
73 bool use_hier = false) {
74 // Pick up the nonmonotonic/monotonic bits from the scheduling type
75 // TODO: make nonmonotonic when static_steal is fixed
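  // For example, a compiler-encoded schedule(nonmonotonic: dynamic, chunk)
  // carries the nonmonotonic modifier bit and maps to SCHEDULE_NONMONOTONIC
  // below; an ordered clause is handled by the caller, which forces
  // SCHEDULE_MONOTONIC regardless of the modifier.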
76 int monotonicity = SCHEDULE_MONOTONIC;
77
  // Default to monotonic for executables compiled with
  // OpenMP* 4.5 or earlier compilers
80 if (loc->get_openmp_version() < 50)
81 monotonicity = SCHEDULE_MONOTONIC;
82
83 if (use_hier)
84 monotonicity = SCHEDULE_MONOTONIC;
85 else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
86 monotonicity = SCHEDULE_NONMONOTONIC;
87 else if (SCHEDULE_HAS_MONOTONIC(schedule))
88 monotonicity = SCHEDULE_MONOTONIC;
89
90 return monotonicity;
91 }
92
93 // Initialize a dispatch_private_info_template<T> buffer for a particular
// type of schedule and chunk. The loop description is found in lb (lower bound),
95 // ub (upper bound), and st (stride). nproc is the number of threads relevant
96 // to the scheduling (often the number of threads in a team, but not always if
97 // hierarchical scheduling is used). tid is the id of the thread calling
98 // the function within the group of nproc threads. It will have a value
99 // between 0 and nproc - 1. This is often just the thread id within a team, but
100 // is not necessarily the case when using hierarchical scheduling.
101 // loc is the source file location of the corresponding loop
102 // gtid is the global thread id
103 template <typename T>
void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
105 dispatch_private_info_template<T> *pr,
106 enum sched_type schedule, T lb, T ub,
107 typename traits_t<T>::signed_t st,
108 #if USE_ITT_BUILD
109 kmp_uint64 *cur_chunk,
110 #endif
111 typename traits_t<T>::signed_t chunk,
112 T nproc, T tid) {
113 typedef typename traits_t<T>::unsigned_t UT;
114 typedef typename traits_t<T>::floating_t DBL;
115
116 int active;
117 T tc;
118 kmp_info_t *th;
119 kmp_team_t *team;
120 int monotonicity;
121 bool use_hier;
122
123 #ifdef KMP_DEBUG
124 typedef typename traits_t<T>::signed_t ST;
125 {
126 char *buff;
127 // create format specifiers before the debug output
128 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
129 "pr:%%p lb:%%%s ub:%%%s st:%%%s "
130 "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
131 traits_t<T>::spec, traits_t<T>::spec,
132 traits_t<ST>::spec, traits_t<ST>::spec,
133 traits_t<T>::spec, traits_t<T>::spec);
134 KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
135 __kmp_str_free(&buff);
136 }
137 #endif
138 /* setup data */
139 th = __kmp_threads[gtid];
140 team = th->th.th_team;
141 active = !team->t.t_serialized;
142
143 #if USE_ITT_BUILD
144 int itt_need_metadata_reporting =
145 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
146 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
147 team->t.t_active_level == 1;
148 #endif
149
150 #if KMP_USE_HIER_SCHED
151 use_hier = pr->flags.use_hier;
152 #else
153 use_hier = false;
154 #endif
155
156 /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
157 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
158 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
159
160 /* Pick up the nomerge/ordered bits from the scheduling type */
161 if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
162 pr->flags.nomerge = TRUE;
163 schedule =
164 (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
165 } else {
166 pr->flags.nomerge = FALSE;
167 }
168 pr->type_size = traits_t<T>::type_size; // remember the size of variables
169 if (kmp_ord_lower & schedule) {
170 pr->flags.ordered = TRUE;
171 schedule =
172 (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
173 } else {
174 pr->flags.ordered = FALSE;
175 }
176 // Ordered overrides nonmonotonic
177 if (pr->flags.ordered) {
178 monotonicity = SCHEDULE_MONOTONIC;
179 }
180
181 if (schedule == kmp_sch_static) {
182 schedule = __kmp_static;
183 } else {
184 if (schedule == kmp_sch_runtime) {
185 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
186 // not specified)
187 schedule = team->t.t_sched.r_sched_type;
188 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
189 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
190 // Detail the schedule if needed (global controls are differentiated
191 // appropriately)
192 if (schedule == kmp_sch_guided_chunked) {
193 schedule = __kmp_guided;
194 } else if (schedule == kmp_sch_static) {
195 schedule = __kmp_static;
196 }
197 // Use the chunk size specified by OMP_SCHEDULE (or default if not
198 // specified)
199 chunk = team->t.t_sched.chunk;
200 #if USE_ITT_BUILD
201 if (cur_chunk)
202 *cur_chunk = chunk;
203 #endif
204 #ifdef KMP_DEBUG
205 {
206 char *buff;
207 // create format specifiers before the debug output
208 buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
209 "schedule:%%d chunk:%%%s\n",
210 traits_t<ST>::spec);
211 KD_TRACE(10, (buff, gtid, schedule, chunk));
212 __kmp_str_free(&buff);
213 }
214 #endif
215 } else {
216 if (schedule == kmp_sch_guided_chunked) {
217 schedule = __kmp_guided;
218 }
219 if (chunk <= 0) {
220 chunk = KMP_DEFAULT_CHUNK;
221 }
222 }
223
224 if (schedule == kmp_sch_auto) {
      // mapping and differentiation: done in __kmp_do_serial_initialize()
226 schedule = __kmp_auto;
227 #ifdef KMP_DEBUG
228 {
229 char *buff;
230 // create format specifiers before the debug output
231 buff = __kmp_str_format(
232 "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
233 "schedule:%%d chunk:%%%s\n",
234 traits_t<ST>::spec);
235 KD_TRACE(10, (buff, gtid, schedule, chunk));
236 __kmp_str_free(&buff);
237 }
238 #endif
239 }
240 #if KMP_STATIC_STEAL_ENABLED
241 // map nonmonotonic:dynamic to static steal
242 if (schedule == kmp_sch_dynamic_chunked) {
243 if (monotonicity == SCHEDULE_NONMONOTONIC)
244 schedule = kmp_sch_static_steal;
245 }
246 #endif
247 /* guided analytical not safe for too many threads */
248 if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
249 schedule = kmp_sch_guided_iterative_chunked;
250 KMP_WARNING(DispatchManyThreads);
251 }
252 if (schedule == kmp_sch_runtime_simd) {
253 // compiler provides simd_width in the chunk parameter
254 schedule = team->t.t_sched.r_sched_type;
255 monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
256 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
257 // Detail the schedule if needed (global controls are differentiated
258 // appropriately)
259 if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
260 schedule == __kmp_static) {
261 schedule = kmp_sch_static_balanced_chunked;
262 } else {
263 if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
264 schedule = kmp_sch_guided_simd;
265 }
266 chunk = team->t.t_sched.chunk * chunk;
267 }
268 #if USE_ITT_BUILD
269 if (cur_chunk)
270 *cur_chunk = chunk;
271 #endif
272 #ifdef KMP_DEBUG
273 {
274 char *buff;
275 // create format specifiers before the debug output
276 buff = __kmp_str_format(
277 "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
278 " chunk:%%%s\n",
279 traits_t<ST>::spec);
280 KD_TRACE(10, (buff, gtid, schedule, chunk));
281 __kmp_str_free(&buff);
282 }
283 #endif
284 }
285 pr->u.p.parm1 = chunk;
286 }
287 KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
288 "unknown scheduling type");
289
290 pr->u.p.count = 0;
291
292 if (__kmp_env_consistency_check) {
293 if (st == 0) {
294 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
295 (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
296 }
297 }
298 // compute trip count
299 if (st == 1) { // most common case
300 if (ub >= lb) {
301 tc = ub - lb + 1;
302 } else { // ub < lb
303 tc = 0; // zero-trip
304 }
305 } else if (st < 0) {
306 if (lb >= ub) {
307 // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
308 // where the division needs to be unsigned regardless of the result type
309 tc = (UT)(lb - ub) / (-st) + 1;
310 } else { // lb < ub
311 tc = 0; // zero-trip
312 }
313 } else { // st > 0
314 if (ub >= lb) {
315 // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
316 // where the division needs to be unsigned regardless of the result type
317 tc = (UT)(ub - lb) / st + 1;
318 } else { // ub < lb
319 tc = 0; // zero-trip
320 }
321 }
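  // For illustration: lb=0, ub=9, st=3 iterates over 0,3,6,9, giving
  // tc = (9 - 0) / 3 + 1 = 4; the st < 0 branch is the mirror image with
  // the roles of lb and ub swapped and division by -st.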
322
323 #if KMP_STATS_ENABLED
324 if (KMP_MASTER_GTID(gtid)) {
325 KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
326 }
327 #endif
328
329 pr->u.p.lb = lb;
330 pr->u.p.ub = ub;
331 pr->u.p.st = st;
332 pr->u.p.tc = tc;
333
334 #if KMP_OS_WINDOWS
335 pr->u.p.last_upper = ub + st;
336 #endif /* KMP_OS_WINDOWS */
337
338 /* NOTE: only the active parallel region(s) has active ordered sections */
339
340 if (active) {
341 if (pr->flags.ordered) {
342 pr->ordered_bumped = 0;
343 pr->u.p.ordered_lower = 1;
344 pr->u.p.ordered_upper = 0;
345 }
346 }
347
348 switch (schedule) {
349 #if (KMP_STATIC_STEAL_ENABLED)
350 case kmp_sch_static_steal: {
351 T ntc, init;
352
353 KD_TRACE(100,
354 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
355 gtid));
356
357 ntc = (tc % chunk ? 1 : 0) + tc / chunk;
358 if (nproc > 1 && ntc >= nproc) {
359 KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
360 T id = tid;
361 T small_chunk, extras;
362
363 small_chunk = ntc / nproc;
364 extras = ntc % nproc;
365
366 init = id * small_chunk + (id < extras ? id : extras);
367 pr->u.p.count = init;
368 pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
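      // e.g. ntc=7 chunks over nproc=3 threads gives small_chunk=2, extras=1:
      // thread 0 owns chunk indices [0,3), thread 1 owns [3,5), thread 2 owns
      // [5,7); count is the next chunk index to take and ub is the exclusive
      // upper bound of the chunks this thread owns.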
369
370 pr->u.p.parm2 = lb;
      // parm3 is the number of times to attempt stealing, which is
      // proportional to the number of chunks per thread, up to a maximum
      // of nproc attempts.
374 pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
375 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
376 pr->u.p.st = st;
377 if (traits_t<T>::type_size > 4) {
378 // AC: TODO: check if 16-byte CAS available and use it to
379 // improve performance (probably wait for explicit request
380 // before spending time on this).
381 // For now use dynamically allocated per-thread lock,
382 // free memory in __kmp_dispatch_next when status==0.
383 KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL);
384 pr->u.p.th_steal_lock =
385 (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
386 __kmp_init_lock(pr->u.p.th_steal_lock);
387 }
388 break;
389 } else {
390 /* too few chunks: switching to kmp_sch_dynamic_chunked */
391 schedule = kmp_sch_dynamic_chunked;
392 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
393 "kmp_sch_dynamic_chunked\n",
394 gtid));
395 if (pr->u.p.parm1 <= 0)
396 pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
397 break;
398 } // if
399 } // case
400 #endif
401 case kmp_sch_static_balanced: {
402 T init, limit;
403
404 KD_TRACE(
405 100,
406 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
407 gtid));
408
409 if (nproc > 1) {
410 T id = tid;
411
412 if (tc < nproc) {
413 if (id < tc) {
414 init = id;
415 limit = id;
416 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
417 } else {
418 pr->u.p.count = 1; /* means no more chunks to execute */
419 pr->u.p.parm1 = FALSE;
420 break;
421 }
422 } else {
423 T small_chunk = tc / nproc;
424 T extras = tc % nproc;
425 init = id * small_chunk + (id < extras ? id : extras);
426 limit = init + small_chunk - (id < extras ? 0 : 1);
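        // e.g. tc=10 iterations over nproc=4 threads: small_chunk=2, extras=2,
        // so the threads get iteration ranges [0,2], [3,5], [6,7] and [8,9].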
427 pr->u.p.parm1 = (id == nproc - 1);
428 }
429 } else {
430 if (tc > 0) {
431 init = 0;
432 limit = tc - 1;
433 pr->u.p.parm1 = TRUE;
434 } else {
435 // zero trip count
436 pr->u.p.count = 1; /* means no more chunks to execute */
437 pr->u.p.parm1 = FALSE;
438 break;
439 }
440 }
441 #if USE_ITT_BUILD
442 // Calculate chunk for metadata report
443 if (itt_need_metadata_reporting)
444 if (cur_chunk)
445 *cur_chunk = limit - init + 1;
446 #endif
447 if (st == 1) {
448 pr->u.p.lb = lb + init;
449 pr->u.p.ub = lb + limit;
450 } else {
451 // calculated upper bound, "ub" is user-defined upper bound
452 T ub_tmp = lb + limit * st;
453 pr->u.p.lb = lb + init * st;
454 // adjust upper bound to "ub" if needed, so that MS lastprivate will match
455 // it exactly
456 if (st > 0) {
457 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
458 } else {
459 pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
460 }
461 }
462 if (pr->flags.ordered) {
463 pr->u.p.ordered_lower = init;
464 pr->u.p.ordered_upper = limit;
465 }
466 break;
467 } // case
468 case kmp_sch_static_balanced_chunked: {
469 // similar to balanced, but chunk adjusted to multiple of simd width
470 T nth = nproc;
471 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
472 " -> falling-through to static_greedy\n",
473 gtid));
474 schedule = kmp_sch_static_greedy;
475 if (nth > 1)
476 pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
477 else
478 pr->u.p.parm1 = tc;
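    // e.g. tc=100, nth=8, chunk (simd width)=4: ceil(100/8)=13 iterations per
    // thread, rounded up to the next multiple of the simd width gives
    // parm1=16 (the mask arithmetic assumes chunk is a power of two).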
479 break;
480 } // case
481 case kmp_sch_guided_simd:
482 case kmp_sch_guided_iterative_chunked: {
483 KD_TRACE(
484 100,
485 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
486 " case\n",
487 gtid));
488
489 if (nproc > 1) {
490 if ((2L * chunk + 1) * nproc >= tc) {
491 /* chunk size too large, switch to dynamic */
492 schedule = kmp_sch_dynamic_chunked;
493 } else {
494 // when remaining iters become less than parm2 - switch to dynamic
495 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
496 *(double *)&pr->u.p.parm3 =
497 guided_flt_param / nproc; // may occupy parm3 and parm4
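        // Each subsequent grab in __kmp_dispatch_next_algorithm takes roughly
        // remaining * (guided_flt_param / nproc) iterations; e.g. if
        // guided_flt_param were 0.5 with 4 threads, a thread would claim
        // about remaining/8 iterations per request.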
498 }
499 } else {
500 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
501 "kmp_sch_static_greedy\n",
502 gtid));
503 schedule = kmp_sch_static_greedy;
504 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
505 KD_TRACE(
506 100,
507 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
508 gtid));
509 pr->u.p.parm1 = tc;
510 } // if
511 } // case
512 break;
513 case kmp_sch_guided_analytical_chunked: {
514 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
515 "kmp_sch_guided_analytical_chunked case\n",
516 gtid));
517
518 if (nproc > 1) {
519 if ((2L * chunk + 1) * nproc >= tc) {
520 /* chunk size too large, switch to dynamic */
521 schedule = kmp_sch_dynamic_chunked;
522 } else {
523 /* commonly used term: (2 nproc - 1)/(2 nproc) */
524 DBL x;
525
526 #if KMP_USE_X87CONTROL
527 /* Linux* OS already has 64-bit computation by default for long double,
528 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
529 Windows* OS on IA-32 architecture, we need to set precision to 64-bit
530 instead of the default 53-bit. Even though long double doesn't work
531 on Windows* OS on Intel(R) 64, the resulting lack of precision is not
532 expected to impact the correctness of the algorithm, but this has not
533 been mathematically proven. */
534 // save original FPCW and set precision to 64-bit, as
535 // Windows* OS on IA-32 architecture defaults to 53-bit
536 unsigned int oldFpcw = _control87(0, 0);
537 _control87(_PC_64, _MCW_PC); // 0,0x30000
538 #endif
539 /* value used for comparison in solver for cross-over point */
540 long double target = ((long double)chunk * 2 + 1) * nproc / tc;
541
542 /* crossover point--chunk indexes equal to or greater than
543 this point switch to dynamic-style scheduling */
544 UT cross;
545
546 /* commonly used term: (2 nproc - 1)/(2 nproc) */
547 x = (long double)1.0 - (long double)0.5 / nproc;
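        /* In this model each chunk takes roughly remaining/(2*nproc)
           iterations, so about tc * x^i iterations remain after i chunks; the
           solver below finds the smallest index ("cross") at which that
           analytical chunk would shrink to the requested chunk size. */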
548
549 #ifdef KMP_DEBUG
550 { // test natural alignment
551 struct _test_a {
552 char a;
553 union {
554 char b;
555 DBL d;
556 };
557 } t;
558 ptrdiff_t natural_alignment =
559 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
560 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
561 // long)natural_alignment );
562 KMP_DEBUG_ASSERT(
563 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
564 }
565 #endif // KMP_DEBUG
566
567 /* save the term in thread private dispatch structure */
568 *(DBL *)&pr->u.p.parm3 = x;
569
570 /* solve for the crossover point to the nearest integer i for which C_i
571 <= chunk */
572 {
573 UT left, right, mid;
574 long double p;
575
576 /* estimate initial upper and lower bound */
577
578 /* doesn't matter what value right is as long as it is positive, but
579 it affects performance of the solver */
580 right = 229;
581 p = __kmp_pow<UT>(x, right);
582 if (p > target) {
583 do {
584 p *= p;
585 right <<= 1;
586 } while (p > target && right < (1 << 27));
587 /* lower bound is previous (failed) estimate of upper bound */
588 left = right >> 1;
589 } else {
590 left = 0;
591 }
592
593 /* bisection root-finding method */
594 while (left + 1 < right) {
595 mid = (left + right) / 2;
596 if (__kmp_pow<UT>(x, mid) > target) {
597 left = mid;
598 } else {
599 right = mid;
600 }
601 } // while
602 cross = right;
603 }
604 /* assert sanity of computed crossover point */
605 KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
606 __kmp_pow<UT>(x, cross) <= target);
607
608 /* save the crossover point in thread private dispatch structure */
609 pr->u.p.parm2 = cross;
610
611 // C75803
612 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
613 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
614 #else
615 #define GUIDED_ANALYTICAL_WORKAROUND (x)
616 #endif
617 /* dynamic-style scheduling offset */
618 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
619 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
620 cross * chunk;
621 #if KMP_USE_X87CONTROL
622 // restore FPCW
623 _control87(oldFpcw, _MCW_PC);
624 #endif
625 } // if
626 } else {
627 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
628 "kmp_sch_static_greedy\n",
629 gtid));
630 schedule = kmp_sch_static_greedy;
631 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
632 pr->u.p.parm1 = tc;
633 } // if
634 } // case
635 break;
636 case kmp_sch_static_greedy:
637 KD_TRACE(
638 100,
639 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
640 gtid));
641 pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
642 break;
643 case kmp_sch_static_chunked:
644 case kmp_sch_dynamic_chunked:
645 if (pr->u.p.parm1 <= 0) {
646 pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
647 }
648 KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
649 "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
650 gtid));
651 break;
652 case kmp_sch_trapezoidal: {
653 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
654
655 T parm1, parm2, parm3, parm4;
656 KD_TRACE(100,
657 ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
658 gtid));
659
660 parm1 = chunk;
661
662 /* F : size of the first cycle */
663 parm2 = (tc / (2 * nproc));
664
665 if (parm2 < 1) {
666 parm2 = 1;
667 }
668
669 /* L : size of the last cycle. Make sure the last cycle is not larger
670 than the first cycle. */
671 if (parm1 < 1) {
672 parm1 = 1;
673 } else if (parm1 > parm2) {
674 parm1 = parm2;
675 }
676
677 /* N : number of cycles */
678 parm3 = (parm2 + parm1);
679 parm3 = (2 * tc + parm3 - 1) / parm3;
680
681 if (parm3 < 2) {
682 parm3 = 2;
683 }
684
685 /* sigma : decreasing incr of the trapezoid */
686 parm4 = (parm3 - 1);
687 parm4 = (parm2 - parm1) / parm4;
688
689 // pointless check, because parm4 >= 0 always
690 // if ( parm4 < 0 ) {
691 // parm4 = 0;
692 //}
693
694 pr->u.p.parm1 = parm1;
695 pr->u.p.parm2 = parm2;
696 pr->u.p.parm3 = parm3;
697 pr->u.p.parm4 = parm4;
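    // For illustration: tc=1000, nproc=4, chunk=10 gives a first chunk of
    // parm2=125, a minimum last chunk of parm1=10, parm3=15 chunks in total
    // and a per-chunk decrement of parm4=8, so successive chunk sizes are
    // 125, 117, 109, ... down toward parm1.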
698 } // case
699 break;
700
701 default: {
702 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
703 KMP_HNT(GetNewerLibrary), // Hint
704 __kmp_msg_null // Variadic argument list terminator
705 );
706 } break;
707 } // switch
708 pr->schedule = schedule;
709 }
710
711 #if KMP_USE_HIER_SCHED
712 template <typename T>
713 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
714 typename traits_t<T>::signed_t st);
715 template <>
716 inline void
__kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
718 kmp_int32 ub, kmp_int32 st) {
719 __kmp_dispatch_init_hierarchy<kmp_int32>(
720 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
721 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
722 }
723 template <>
724 inline void
__kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
726 kmp_uint32 ub, kmp_int32 st) {
727 __kmp_dispatch_init_hierarchy<kmp_uint32>(
728 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
729 __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
730 }
731 template <>
732 inline void
__kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
734 kmp_int64 ub, kmp_int64 st) {
735 __kmp_dispatch_init_hierarchy<kmp_int64>(
736 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
737 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
738 }
739 template <>
740 inline void
__kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
742 kmp_uint64 ub, kmp_int64 st) {
743 __kmp_dispatch_init_hierarchy<kmp_uint64>(
744 loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
745 __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
746 }
747
748 // free all the hierarchy scheduling memory associated with the team
void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
750 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
751 for (int i = 0; i < num_disp_buff; ++i) {
752 // type does not matter here so use kmp_int32
753 auto sh =
754 reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
755 &team->t.t_disp_buffer[i]);
756 if (sh->hier) {
757 sh->hier->deallocate();
758 __kmp_free(sh->hier);
759 }
760 }
761 }
762 #endif
763
764 // UT - unsigned flavor of T, ST - signed flavor of T,
765 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
766 template <typename T>
767 static void
__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
769 T ub, typename traits_t<T>::signed_t st,
770 typename traits_t<T>::signed_t chunk, int push_ws) {
771 typedef typename traits_t<T>::unsigned_t UT;
772
773 int active;
774 kmp_info_t *th;
775 kmp_team_t *team;
776 kmp_uint32 my_buffer_index;
777 dispatch_private_info_template<T> *pr;
778 dispatch_shared_info_template<T> volatile *sh;
779
780 KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
781 sizeof(dispatch_private_info));
782 KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
783 sizeof(dispatch_shared_info));
784 __kmp_assert_valid_gtid(gtid);
785
786 if (!TCR_4(__kmp_init_parallel))
787 __kmp_parallel_initialize();
788
789 __kmp_resume_if_soft_paused();
790
791 #if INCLUDE_SSC_MARKS
792 SSC_MARK_DISPATCH_INIT();
793 #endif
794 #ifdef KMP_DEBUG
795 typedef typename traits_t<T>::signed_t ST;
796 {
797 char *buff;
798 // create format specifiers before the debug output
799 buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
800 "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
801 traits_t<ST>::spec, traits_t<T>::spec,
802 traits_t<T>::spec, traits_t<ST>::spec);
803 KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
804 __kmp_str_free(&buff);
805 }
806 #endif
807 /* setup data */
808 th = __kmp_threads[gtid];
809 team = th->th.th_team;
810 active = !team->t.t_serialized;
811 th->th.th_ident = loc;
812
  // Any half-decent optimizer will remove this test when the blocks are empty
  // since the macros expand to nothing when statistics are disabled.
816 if (schedule == __kmp_static) {
817 KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
818 } else {
819 KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
820 }
821
822 #if KMP_USE_HIER_SCHED
  // Initialize the scheduling hierarchy if requested by the OMP_SCHEDULE
  // environment variable. Hierarchical scheduling does not work with ordered,
  // so if ordered is detected, revert to threaded scheduling.
826 bool ordered;
827 enum sched_type my_sched = schedule;
828 my_buffer_index = th->th.th_dispatch->th_disp_index;
829 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
830 &th->th.th_dispatch
831 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
832 my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
833 if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
834 my_sched =
835 (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
836 ordered = (kmp_ord_lower & my_sched);
837 if (pr->flags.use_hier) {
838 if (ordered) {
839 KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected. "
840 "Disabling hierarchical scheduling.\n",
841 gtid));
842 pr->flags.use_hier = FALSE;
843 }
844 }
845 if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
846 // Don't use hierarchical for ordered parallel loops and don't
847 // use the runtime hierarchy if one was specified in the program
848 if (!ordered && !pr->flags.use_hier)
849 __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
850 }
851 #endif // KMP_USE_HIER_SCHED
852
853 #if USE_ITT_BUILD
854 kmp_uint64 cur_chunk = chunk;
855 int itt_need_metadata_reporting =
856 __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
857 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
858 team->t.t_active_level == 1;
859 #endif
860 if (!active) {
861 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
862 th->th.th_dispatch->th_disp_buffer); /* top of the stack */
863 } else {
864 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
865 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
866
867 my_buffer_index = th->th.th_dispatch->th_disp_index++;
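    // Each loop started by this thread gets a monotonically increasing buffer
    // index; the actual slot used is my_buffer_index %
    // __kmp_dispatch_num_buffers, so only that many loops can be in flight
    // before the wait below throttles reuse.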
868
869 /* What happens when number of threads changes, need to resize buffer? */
870 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
871 &th->th.th_dispatch
872 ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
873 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
874 &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
875 KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
876 my_buffer_index));
877 }
878
879 __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
880 #if USE_ITT_BUILD
881 &cur_chunk,
882 #endif
883 chunk, (T)th->th.th_team_nproc,
884 (T)th->th.th_info.ds.ds_tid);
885 if (active) {
886 if (pr->flags.ordered == 0) {
887 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
888 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
889 } else {
890 th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
891 th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
892 }
893 }
894
895 if (active) {
    /* Wait until the dispatch buffer slot selected above is free to reuse,
     * i.e. until sh->buffer_index has caught up to my_buffer_index */
898
899 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
900 "sh->buffer_index:%d\n",
901 gtid, my_buffer_index, sh->buffer_index));
902 __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
903 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
904 // Note: KMP_WAIT() cannot be used there: buffer index and
905 // my_buffer_index are *always* 32-bit integers.
906 KMP_MB(); /* is this necessary? */
907 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
908 "sh->buffer_index:%d\n",
909 gtid, my_buffer_index, sh->buffer_index));
910
911 th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
912 th->th.th_dispatch->th_dispatch_sh_current =
913 CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
914 #if USE_ITT_BUILD
915 if (pr->flags.ordered) {
916 __kmp_itt_ordered_init(gtid);
917 }
918 // Report loop metadata
919 if (itt_need_metadata_reporting) {
920 // Only report metadata by master of active team at level 1
921 kmp_uint64 schedtype = 0;
922 switch (schedule) {
923 case kmp_sch_static_chunked:
924 case kmp_sch_static_balanced: // Chunk is calculated in the switch above
925 break;
926 case kmp_sch_static_greedy:
927 cur_chunk = pr->u.p.parm1;
928 break;
929 case kmp_sch_dynamic_chunked:
930 schedtype = 1;
931 break;
932 case kmp_sch_guided_iterative_chunked:
933 case kmp_sch_guided_analytical_chunked:
934 case kmp_sch_guided_simd:
935 schedtype = 2;
936 break;
937 default:
938 // Should we put this case under "static"?
939 // case kmp_sch_static_steal:
940 schedtype = 3;
941 break;
942 }
943 __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
944 }
945 #if KMP_USE_HIER_SCHED
946 if (pr->flags.use_hier) {
947 pr->u.p.count = 0;
948 pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
949 }
#endif // KMP_USE_HIER_SCHED
951 #endif /* USE_ITT_BUILD */
952 }
953
954 #ifdef KMP_DEBUG
955 {
956 char *buff;
957 // create format specifiers before the debug output
958 buff = __kmp_str_format(
959 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
960 "lb:%%%s ub:%%%s"
961 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
962 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
963 traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
964 traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
965 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
966 traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
967 KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
968 pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
969 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
970 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
971 __kmp_str_free(&buff);
972 }
973 #endif
974 #if (KMP_STATIC_STEAL_ENABLED)
  // It cannot be guaranteed that after execution of a loop with some other
  // schedule kind all the parm3 variables will contain the same value. Even if
  // all parm3 values were the same, there would still be a bad case, such as
  // using 0 and 1 rather than a program-lifetime increment. So a dedicated
  // variable is required; 'static_steal_counter' is used.
980 if (pr->schedule == kmp_sch_static_steal) {
    // Other threads will inspect this variable when searching for a victim.
    // This is a flag showing that other threads may steal from this thread
    // from now on.
984 volatile T *p = &pr->u.p.static_steal_counter;
985 *p = *p + 1;
986 }
987 #endif // ( KMP_STATIC_STEAL_ENABLED )
988
989 #if OMPT_SUPPORT && OMPT_OPTIONAL
990 if (ompt_enabled.ompt_callback_work) {
991 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
992 ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
993 ompt_callbacks.ompt_callback(ompt_callback_work)(
994 ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
995 &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
996 }
997 #endif
998 KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
999 }
1000
1001 /* For ordered loops, either __kmp_dispatch_finish() should be called after
1002 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1003 * every chunk of iterations. If the ordered section(s) were not executed
1004 * for this iteration (or every iteration in this chunk), we need to set the
1005 * ordered iteration counters so that the next thread can proceed. */
1006 template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1008 typedef typename traits_t<UT>::signed_t ST;
1009 __kmp_assert_valid_gtid(gtid);
1010 kmp_info_t *th = __kmp_threads[gtid];
1011
1012 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1013 if (!th->th.th_team->t.t_serialized) {
1014
1015 dispatch_private_info_template<UT> *pr =
1016 reinterpret_cast<dispatch_private_info_template<UT> *>(
1017 th->th.th_dispatch->th_dispatch_pr_current);
1018 dispatch_shared_info_template<UT> volatile *sh =
1019 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1020 th->th.th_dispatch->th_dispatch_sh_current);
1021 KMP_DEBUG_ASSERT(pr);
1022 KMP_DEBUG_ASSERT(sh);
1023 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1024 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1025
1026 if (pr->ordered_bumped) {
1027 KD_TRACE(
1028 1000,
1029 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1030 gtid));
1031 pr->ordered_bumped = 0;
1032 } else {
1033 UT lower = pr->u.p.ordered_lower;
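      // The ordered section was not executed for this iteration, so wait for
      // our turn (the shared ordered_iteration reaching this thread's lower
      // bound) and then pass the turn on by bumping the shared counter below.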
1034
1035 #ifdef KMP_DEBUG
1036 {
1037 char *buff;
1038 // create format specifiers before the debug output
1039 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1040 "ordered_iteration:%%%s lower:%%%s\n",
1041 traits_t<UT>::spec, traits_t<UT>::spec);
1042 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1043 __kmp_str_free(&buff);
1044 }
1045 #endif
1046
1047 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1048 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1049 KMP_MB(); /* is this necessary? */
1050 #ifdef KMP_DEBUG
1051 {
1052 char *buff;
1053 // create format specifiers before the debug output
1054 buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1055 "ordered_iteration:%%%s lower:%%%s\n",
1056 traits_t<UT>::spec, traits_t<UT>::spec);
1057 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1058 __kmp_str_free(&buff);
1059 }
1060 #endif
1061
1062 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1063 } // if
1064 } // if
1065 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1066 }
1067
1068 #ifdef KMP_GOMP_COMPAT
1069
1070 template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1072 typedef typename traits_t<UT>::signed_t ST;
1073 __kmp_assert_valid_gtid(gtid);
1074 kmp_info_t *th = __kmp_threads[gtid];
1075
1076 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1077 if (!th->th.th_team->t.t_serialized) {
1078 // int cid;
1079 dispatch_private_info_template<UT> *pr =
1080 reinterpret_cast<dispatch_private_info_template<UT> *>(
1081 th->th.th_dispatch->th_dispatch_pr_current);
1082 dispatch_shared_info_template<UT> volatile *sh =
1083 reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1084 th->th.th_dispatch->th_dispatch_sh_current);
1085 KMP_DEBUG_ASSERT(pr);
1086 KMP_DEBUG_ASSERT(sh);
1087 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1088 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1089
1090 // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1091 UT lower = pr->u.p.ordered_lower;
1092 UT upper = pr->u.p.ordered_upper;
1093 UT inc = upper - lower + 1;
1094
1095 if (pr->ordered_bumped == inc) {
1096 KD_TRACE(
1097 1000,
1098 ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1099 gtid));
1100 pr->ordered_bumped = 0;
1101 } else {
1102 inc -= pr->ordered_bumped;
1103
1104 #ifdef KMP_DEBUG
1105 {
1106 char *buff;
1107 // create format specifiers before the debug output
1108 buff = __kmp_str_format(
1109 "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1110 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1111 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1112 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1113 __kmp_str_free(&buff);
1114 }
1115 #endif
1116
1117 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1118 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1119
1120 KMP_MB(); /* is this necessary? */
1121 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1122 "ordered_bumped to zero\n",
1123 gtid));
1124 pr->ordered_bumped = 0;
1125 //!!!!! TODO check if the inc should be unsigned, or signed???
1126 #ifdef KMP_DEBUG
1127 {
1128 char *buff;
1129 // create format specifiers before the debug output
1130 buff = __kmp_str_format(
1131 "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1132 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1133 traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1134 traits_t<UT>::spec);
1135 KD_TRACE(1000,
1136 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1137 __kmp_str_free(&buff);
1138 }
1139 #endif
1140
1141 test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1142 }
1143 // }
1144 }
1145 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1146 }
1147
1148 #endif /* KMP_GOMP_COMPAT */
1149
1150 template <typename T>
int __kmp_dispatch_next_algorithm(int gtid,
1152 dispatch_private_info_template<T> *pr,
1153 dispatch_shared_info_template<T> volatile *sh,
1154 kmp_int32 *p_last, T *p_lb, T *p_ub,
1155 typename traits_t<T>::signed_t *p_st, T nproc,
1156 T tid) {
1157 typedef typename traits_t<T>::unsigned_t UT;
1158 typedef typename traits_t<T>::signed_t ST;
1159 typedef typename traits_t<T>::floating_t DBL;
1160 int status = 0;
1161 kmp_int32 last = 0;
1162 T start;
1163 ST incr;
1164 UT limit, trip, init;
1165 kmp_info_t *th = __kmp_threads[gtid];
1166 kmp_team_t *team = th->th.th_team;
1167
1168 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1169 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1170 KMP_DEBUG_ASSERT(pr);
1171 KMP_DEBUG_ASSERT(sh);
1172 KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1173 #ifdef KMP_DEBUG
1174 {
1175 char *buff;
1176 // create format specifiers before the debug output
1177 buff =
1178 __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1179 "sh:%%p nproc:%%%s tid:%%%s\n",
1180 traits_t<T>::spec, traits_t<T>::spec);
1181 KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1182 __kmp_str_free(&buff);
1183 }
1184 #endif
1185
1186 // zero trip count
1187 if (pr->u.p.tc == 0) {
1188 KD_TRACE(10,
1189 ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1190 "zero status:%d\n",
1191 gtid, status));
1192 return 0;
1193 }
1194
1195 switch (pr->schedule) {
1196 #if (KMP_STATIC_STEAL_ENABLED)
1197 case kmp_sch_static_steal: {
1198 T chunk = pr->u.p.parm1;
1199
1200 KD_TRACE(100,
1201 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1202 gtid));
1203
1204 trip = pr->u.p.tc - 1;
1205
1206 if (traits_t<T>::type_size > 4) {
1207 // use lock for 8-byte and CAS for 4-byte induction
1208 // variable. TODO (optional): check and use 16-byte CAS
1209 kmp_lock_t *lck = pr->u.p.th_steal_lock;
1210 KMP_DEBUG_ASSERT(lck != NULL);
1211 if (pr->u.p.count < (UT)pr->u.p.ub) {
1212 __kmp_acquire_lock(lck, gtid);
1213 // try to get own chunk of iterations
1214 init = (pr->u.p.count)++;
1215 status = (init < (UT)pr->u.p.ub);
1216 __kmp_release_lock(lck, gtid);
1217 } else {
1218 status = 0; // no own chunks
1219 }
1220 if (!status) { // try to steal
1221 kmp_info_t **other_threads = team->t.t_threads;
1222 int while_limit = pr->u.p.parm3;
1223 int while_index = 0;
1224 T id = pr->u.p.static_steal_counter; // loop id
1225 int idx = (th->th.th_dispatch->th_disp_index - 1) %
1226 __kmp_dispatch_num_buffers; // current loop index
1227 // note: victim thread can potentially execute another loop
1228 // TODO: algorithm of searching for a victim
1229 // should be cleaned up and measured
1230 while ((!status) && (while_limit != ++while_index)) {
1231 dispatch_private_info_template<T> *victim;
1232 T remaining;
1233 T victimIdx = pr->u.p.parm4;
1234 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1235 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1236 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1237 KMP_DEBUG_ASSERT(victim);
1238 while ((victim == pr || id != victim->u.p.static_steal_counter) &&
1239 oldVictimIdx != victimIdx) {
1240 victimIdx = (victimIdx + 1) % nproc;
1241 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1242 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1243 KMP_DEBUG_ASSERT(victim);
1244 }
1245 if (victim == pr || id != victim->u.p.static_steal_counter) {
1246 continue; // try once more (nproc attempts in total)
1247 // no victim is ready yet to participate in stealing
1248 // because no victim passed kmp_init_dispatch yet
1249 }
1250 if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1251 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1252 continue; // not enough chunks to steal, goto next victim
1253 }
1254
1255 lck = victim->u.p.th_steal_lock;
1256 KMP_ASSERT(lck != NULL);
1257 __kmp_acquire_lock(lck, gtid);
1258 limit = victim->u.p.ub; // keep initial ub
1259 if (victim->u.p.count >= limit ||
1260 (remaining = limit - victim->u.p.count) < 2) {
1261 __kmp_release_lock(lck, gtid);
1262 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1263 continue; // not enough chunks to steal
1264 }
1265 // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
1266 // by 1
1267 if (remaining > 3) {
1268 // steal 1/4 of remaining
1269 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1270 init = (victim->u.p.ub -= (remaining >> 2));
1271 } else {
1272 // steal 1 chunk of 2 or 3 remaining
1273 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1274 init = (victim->u.p.ub -= 1);
1275 }
1276 __kmp_release_lock(lck, gtid);
1277
1278 KMP_DEBUG_ASSERT(init + 1 <= limit);
1279 pr->u.p.parm4 = victimIdx; // remember victim to steal from
1280 status = 1;
1281 while_index = 0;
          // now update own count and ub with the stolen range,
          // excluding the init chunk
1283 __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid);
1284 pr->u.p.count = init + 1;
1285 pr->u.p.ub = limit;
1286 __kmp_release_lock(pr->u.p.th_steal_lock, gtid);
1287 } // while (search for victim)
1288 } // if (try to find victim and steal)
1289 } else {
1290 // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1291 typedef union {
1292 struct {
1293 UT count;
1294 T ub;
1295 } p;
1296 kmp_int64 b;
1297 } union_i4;
1298 // All operations on 'count' or 'ub' must be combined atomically
1299 // together.
1300 {
1301 union_i4 vold, vnew;
1302 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1303 vnew = vold;
1304 vnew.p.count++;
1305 while (!KMP_COMPARE_AND_STORE_ACQ64(
1306 (volatile kmp_int64 *)&pr->u.p.count,
1307 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1308 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1309 KMP_CPU_PAUSE();
1310 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1311 vnew = vold;
1312 vnew.p.count++;
1313 }
1314 vnew = vold;
1315 init = vnew.p.count;
1316 status = (init < (UT)vnew.p.ub);
1317 }
1318
1319 if (!status) {
1320 kmp_info_t **other_threads = team->t.t_threads;
1321 int while_limit = pr->u.p.parm3;
1322 int while_index = 0;
1323 T id = pr->u.p.static_steal_counter; // loop id
1324 int idx = (th->th.th_dispatch->th_disp_index - 1) %
1325 __kmp_dispatch_num_buffers; // current loop index
1326 // note: victim thread can potentially execute another loop
1327 // TODO: algorithm of searching for a victim
1328 // should be cleaned up and measured
1329 while ((!status) && (while_limit != ++while_index)) {
1330 dispatch_private_info_template<T> *victim;
1331 union_i4 vold, vnew;
1332 kmp_int32 remaining;
1333 T victimIdx = pr->u.p.parm4;
1334 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1335 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1336 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1337 KMP_DEBUG_ASSERT(victim);
1338 while ((victim == pr || id != victim->u.p.static_steal_counter) &&
1339 oldVictimIdx != victimIdx) {
1340 victimIdx = (victimIdx + 1) % nproc;
1341 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1342 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1343 KMP_DEBUG_ASSERT(victim);
1344 }
1345 if (victim == pr || id != victim->u.p.static_steal_counter) {
1346 continue; // try once more (nproc attempts in total)
1347 // no victim is ready yet to participate in stealing
1348 // because no victim passed kmp_init_dispatch yet
1349 }
1350 pr->u.p.parm4 = victimIdx; // new victim found
1351 while (1) { // CAS loop if victim has enough chunks to steal
1352 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1353 vnew = vold;
1354
1355 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1356 if (vnew.p.count >= (UT)vnew.p.ub ||
1357 (remaining = vnew.p.ub - vnew.p.count) < 2) {
1358 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1359 break; // not enough chunks to steal, goto next victim
1360 }
1361 if (remaining > 3) {
1362 vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
1363 } else {
1364 vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1365 }
1366 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1367 // TODO: Should this be acquire or release?
1368 if (KMP_COMPARE_AND_STORE_ACQ64(
1369 (volatile kmp_int64 *)&victim->u.p.count,
1370 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1371 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1372 // stealing succeeded
1373 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1374 vold.p.ub - vnew.p.ub);
1375 status = 1;
1376 while_index = 0;
1377 // now update own count and ub
1378 init = vnew.p.ub;
1379 vold.p.count = init + 1;
1380 #if KMP_ARCH_X86
1381 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1382 #else
1383 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1384 #endif
1385 break;
1386 } // if (check CAS result)
1387 KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt
1388 } // while (try to steal from particular victim)
1389 } // while (search for victim)
1390 } // if (try to find victim and steal)
1391 } // if (4-byte induction variable)
1392 if (!status) {
1393 *p_lb = 0;
1394 *p_ub = 0;
1395 if (p_st != NULL)
1396 *p_st = 0;
1397 } else {
1398 start = pr->u.p.parm2;
1399 init *= chunk;
1400 limit = chunk + init - 1;
1401 incr = pr->u.p.st;
1402 KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1403
1404 KMP_DEBUG_ASSERT(init <= trip);
1405 if ((last = (limit >= trip)) != 0)
1406 limit = trip;
1407 if (p_st != NULL)
1408 *p_st = incr;
1409
1410 if (incr == 1) {
1411 *p_lb = start + init;
1412 *p_ub = start + limit;
1413 } else {
1414 *p_lb = start + init * incr;
1415 *p_ub = start + limit * incr;
1416 }
1417
1418 if (pr->flags.ordered) {
1419 pr->u.p.ordered_lower = init;
1420 pr->u.p.ordered_upper = limit;
1421 } // if
1422 } // if
1423 break;
1424 } // case
1425 #endif // ( KMP_STATIC_STEAL_ENABLED )
1426 case kmp_sch_static_balanced: {
1427 KD_TRACE(
1428 10,
1429 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1430 gtid));
1431 /* check if thread has any iteration to do */
1432 if ((status = !pr->u.p.count) != 0) {
1433 pr->u.p.count = 1;
1434 *p_lb = pr->u.p.lb;
1435 *p_ub = pr->u.p.ub;
1436 last = pr->u.p.parm1;
1437 if (p_st != NULL)
1438 *p_st = pr->u.p.st;
1439 } else { /* no iterations to do */
1440 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1441 }
1442 } // case
1443 break;
1444 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1445 merged here */
1446 case kmp_sch_static_chunked: {
1447 T parm1;
1448
1449 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1450 "kmp_sch_static_[affinity|chunked] case\n",
1451 gtid));
1452 parm1 = pr->u.p.parm1;
1453
1454 trip = pr->u.p.tc - 1;
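    // Chunks of parm1 iterations are handed out round-robin: thread tid
    // executes chunk indices tid, tid+nproc, tid+2*nproc, ...; pr->u.p.count
    // advances by nproc after each chunk this thread completes.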
1455 init = parm1 * (pr->u.p.count + tid);
1456
1457 if ((status = (init <= trip)) != 0) {
1458 start = pr->u.p.lb;
1459 incr = pr->u.p.st;
1460 limit = parm1 + init - 1;
1461
1462 if ((last = (limit >= trip)) != 0)
1463 limit = trip;
1464
1465 if (p_st != NULL)
1466 *p_st = incr;
1467
1468 pr->u.p.count += nproc;
1469
1470 if (incr == 1) {
1471 *p_lb = start + init;
1472 *p_ub = start + limit;
1473 } else {
1474 *p_lb = start + init * incr;
1475 *p_ub = start + limit * incr;
1476 }
1477
1478 if (pr->flags.ordered) {
1479 pr->u.p.ordered_lower = init;
1480 pr->u.p.ordered_upper = limit;
1481 } // if
1482 } // if
1483 } // case
1484 break;
1485
1486 case kmp_sch_dynamic_chunked: {
1487 T chunk = pr->u.p.parm1;
1488
1489 KD_TRACE(
1490 100,
1491 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1492 gtid));
1493
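    // sh->u.s.iteration is a shared chunk counter: each fetch-and-increment
    // hands out the next chunk index k, covering iterations
    // [k*chunk, k*chunk + chunk - 1] clipped to the trip count.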
1494 init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1495 trip = pr->u.p.tc - 1;
1496
1497 if ((status = (init <= trip)) == 0) {
1498 *p_lb = 0;
1499 *p_ub = 0;
1500 if (p_st != NULL)
1501 *p_st = 0;
1502 } else {
1503 start = pr->u.p.lb;
1504 limit = chunk + init - 1;
1505 incr = pr->u.p.st;
1506
1507 if ((last = (limit >= trip)) != 0)
1508 limit = trip;
1509
1510 if (p_st != NULL)
1511 *p_st = incr;
1512
1513 if (incr == 1) {
1514 *p_lb = start + init;
1515 *p_ub = start + limit;
1516 } else {
1517 *p_lb = start + init * incr;
1518 *p_ub = start + limit * incr;
1519 }
1520
1521 if (pr->flags.ordered) {
1522 pr->u.p.ordered_lower = init;
1523 pr->u.p.ordered_upper = limit;
1524 } // if
1525 } // if
1526 } // case
1527 break;
1528
1529 case kmp_sch_guided_iterative_chunked: {
1530 T chunkspec = pr->u.p.parm1;
1531 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1532 "iterative case\n",
1533 gtid));
1534 trip = pr->u.p.tc;
1535 // Start atomic part of calculations
1536 while (1) {
1537 ST remaining; // signed, because can be < 0
1538 init = sh->u.s.iteration; // shared value
1539 remaining = trip - init;
1540 if (remaining <= 0) { // AC: need to compare with 0 first
1541 // nothing to do, don't try atomic op
1542 status = 0;
1543 break;
1544 }
1545 if ((T)remaining <
1546 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1547 // use dynamic-style schedule
1548 // atomically increment iterations, get old value
1549 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1550 (ST)chunkspec);
1551 remaining = trip - init;
1552 if (remaining <= 0) {
1553 status = 0; // all iterations got by other threads
1554 } else {
1555 // got some iterations to work on
1556 status = 1;
1557 if ((T)remaining > chunkspec) {
1558 limit = init + chunkspec - 1;
1559 } else {
1560 last = 1; // the last chunk
1561 limit = init + remaining - 1;
1562 } // if
1563 } // if
1564 break;
1565 } // if
1566 limit = init +
1567 (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
1568 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1569 (ST)init, (ST)limit)) {
1570 // CAS was successful, chunk obtained
1571 status = 1;
1572 --limit;
1573 break;
1574 } // if
1575 } // while
1576 if (status != 0) {
1577 start = pr->u.p.lb;
1578 incr = pr->u.p.st;
1579 if (p_st != NULL)
1580 *p_st = incr;
1581 *p_lb = start + init * incr;
1582 *p_ub = start + limit * incr;
1583 if (pr->flags.ordered) {
1584 pr->u.p.ordered_lower = init;
1585 pr->u.p.ordered_upper = limit;
1586 } // if
1587 } else {
1588 *p_lb = 0;
1589 *p_ub = 0;
1590 if (p_st != NULL)
1591 *p_st = 0;
1592 } // if
1593 } // case
1594 break;
1595
1596 case kmp_sch_guided_simd: {
1597 // same as iterative but curr-chunk adjusted to be multiple of given
1598 // chunk
1599 T chunk = pr->u.p.parm1;
1600 KD_TRACE(100,
1601 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1602 gtid));
1603 trip = pr->u.p.tc;
1604 // Start atomic part of calculations
1605 while (1) {
1606 ST remaining; // signed, because can be < 0
1607 init = sh->u.s.iteration; // shared value
1608 remaining = trip - init;
1609 if (remaining <= 0) { // AC: need to compare with 0 first
1610 status = 0; // nothing to do, don't try atomic op
1611 break;
1612 }
1613 KMP_DEBUG_ASSERT(init % chunk == 0);
1614 // compare with K*nproc*(chunk+1), K=2 by default
1615 if ((T)remaining < pr->u.p.parm2) {
1616 // use dynamic-style schedule
1617 // atomically increment iterations, get old value
1618 init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1619 (ST)chunk);
1620 remaining = trip - init;
1621 if (remaining <= 0) {
1622 status = 0; // all iterations got by other threads
1623 } else {
1624 // got some iterations to work on
1625 status = 1;
1626 if ((T)remaining > chunk) {
1627 limit = init + chunk - 1;
1628 } else {
1629 last = 1; // the last chunk
1630 limit = init + remaining - 1;
1631 } // if
1632 } // if
1633 break;
1634 } // if
1635 // divide by K*nproc
1636 UT span = remaining * (*(double *)&pr->u.p.parm3);
1637 UT rem = span % chunk;
1638 if (rem) // adjust so that span%chunk == 0
1639 span += chunk - rem;
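      // e.g. if the guided formula yields span=23 with chunk (simd width)=8,
      // span is rounded up to 24 so each grab is a whole number of simd
      // chunks.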
1640 limit = init + span;
1641 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1642 (ST)init, (ST)limit)) {
1643 // CAS was successful, chunk obtained
1644 status = 1;
1645 --limit;
1646 break;
1647 } // if
1648 } // while
1649 if (status != 0) {
1650 start = pr->u.p.lb;
1651 incr = pr->u.p.st;
1652 if (p_st != NULL)
1653 *p_st = incr;
1654 *p_lb = start + init * incr;
1655 *p_ub = start + limit * incr;
1656 if (pr->flags.ordered) {
1657 pr->u.p.ordered_lower = init;
1658 pr->u.p.ordered_upper = limit;
1659 } // if
1660 } else {
1661 *p_lb = 0;
1662 *p_ub = 0;
1663 if (p_st != NULL)
1664 *p_st = 0;
1665 } // if
1666 } // case
1667 break;
1668
1669 case kmp_sch_guided_analytical_chunked: {
1670 T chunkspec = pr->u.p.parm1;
1671 UT chunkIdx;
1672 #if KMP_USE_X87CONTROL
1673 /* for storing original FPCW value for Windows* OS on
1674 IA-32 architecture 8-byte version */
1675 unsigned int oldFpcw;
1676 unsigned int fpcwSet = 0;
1677 #endif
1678 KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1679 "kmp_sch_guided_analytical_chunked case\n",
1680 gtid));
1681
1682 trip = pr->u.p.tc;
1683
1684 KMP_DEBUG_ASSERT(nproc > 1);
1685 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1686
1687 while (1) { /* this while loop is a safeguard against unexpected zero
1688 chunk sizes */
1689 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1690 if (chunkIdx >= (UT)pr->u.p.parm2) {
1691 --trip;
1692 /* use dynamic-style scheduling */
1693 init = chunkIdx * chunkspec + pr->u.p.count;
1694 /* need to verify init > 0 in case of overflow in the above
1695 * calculation */
1696 if ((status = (init > 0 && init <= trip)) != 0) {
1697 limit = init + chunkspec - 1;
1698
1699 if ((last = (limit >= trip)) != 0)
1700 limit = trip;
1701 }
1702 break;
1703 } else {
1704 /* use exponential-style scheduling */
1705 /* The following check works around the lack of long double precision on
1706 Windows* OS, which can otherwise result in init != 0 for
1707 chunkIdx == 0.
1708 */
1709 #if KMP_USE_X87CONTROL
1710 /* If we haven't already done so, save original
1711 FPCW and set precision to 64-bit, as Windows* OS
1712 on IA-32 architecture defaults to 53-bit */
1713 if (!fpcwSet) {
1714 oldFpcw = _control87(0, 0);
1715 _control87(_PC_64, _MCW_PC);
1716 fpcwSet = 0x30000;
1717 }
1718 #endif
1719 if (chunkIdx) {
1720 init = __kmp_dispatch_guided_remaining<T>(
1721 trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1722 KMP_DEBUG_ASSERT(init);
1723 init = trip - init;
1724 } else
1725 init = 0;
1726 limit = trip - __kmp_dispatch_guided_remaining<T>(
1727 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1728 KMP_ASSERT(init <= limit);
1729 if (init < limit) {
1730 KMP_DEBUG_ASSERT(limit <= trip);
1731 --limit;
1732 status = 1;
1733 break;
1734 } // if
1735 } // if
1736 } // while (1)
1737 #if KMP_USE_X87CONTROL
1738 /* restore FPCW if necessary
1739 AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1740 */
1741 if (fpcwSet && (oldFpcw & fpcwSet))
1742 _control87(oldFpcw, _MCW_PC);
1743 #endif
1744 if (status != 0) {
1745 start = pr->u.p.lb;
1746 incr = pr->u.p.st;
1747 if (p_st != NULL)
1748 *p_st = incr;
1749 *p_lb = start + init * incr;
1750 *p_ub = start + limit * incr;
1751 if (pr->flags.ordered) {
1752 pr->u.p.ordered_lower = init;
1753 pr->u.p.ordered_upper = limit;
1754 }
1755 } else {
1756 *p_lb = 0;
1757 *p_ub = 0;
1758 if (p_st != NULL)
1759 *p_st = 0;
1760 }
1761 } // case
1762 break;
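// Added commentary on the case above: parm2 is the crossover chunk index. The
// first parm2 chunks follow the analytically computed, exponentially
// decreasing guided sizes (chunk boundaries derived from
// __kmp_dispatch_guided_remaining), and once the shared chunk index reaches
// parm2 the remaining tail is handed out dynamically in chunkspec-sized
// pieces, which is why the loop branches on chunkIdx >= parm2.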
1763
1764 case kmp_sch_trapezoidal: {
1765 UT index;
1766 T parm2 = pr->u.p.parm2;
1767 T parm3 = pr->u.p.parm3;
1768 T parm4 = pr->u.p.parm4;
1769 KD_TRACE(100,
1770 ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1771 gtid));
1772
1773 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1774
1775 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1776 trip = pr->u.p.tc - 1;
1777
1778 if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1779 *p_lb = 0;
1780 *p_ub = 0;
1781 if (p_st != NULL)
1782 *p_st = 0;
1783 } else {
1784 start = pr->u.p.lb;
1785 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1786 incr = pr->u.p.st;
1787
1788 if ((last = (limit >= trip)) != 0)
1789 limit = trip;
1790
1791 if (p_st != NULL)
1792 *p_st = incr;
1793
1794 if (incr == 1) {
1795 *p_lb = start + init;
1796 *p_ub = start + limit;
1797 } else {
1798 *p_lb = start + init * incr;
1799 *p_ub = start + limit * incr;
1800 }
1801
1802 if (pr->flags.ordered) {
1803 pr->u.p.ordered_lower = init;
1804 pr->u.p.ordered_upper = limit;
1805 } // if
1806 } // if
1807 } // case
1808 break;
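// Worked example for the trapezoidal case above (illustrative only): chunk
// sizes form a decreasing arithmetic sequence. With parm2 = 10 (first chunk)
// and parm4 = 2 (per-chunk decrement),
//   init  = (index * (2*parm2 - (index-1)*parm4)) / 2
//   limit = ((index+1) * (2*parm2 - index*parm4)) / 2 - 1
// gives chunk 0 -> [0, 9], chunk 1 -> [10, 17], chunk 2 -> [18, 23], i.e.
// sizes 10, 8, 6, ... parm3 is the number of chunks, so index >= parm3 or
// init > trip means no work remains.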
1809 default: {
1810 status = 0; // to avoid complaints on uninitialized variable use
1811 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1812 KMP_HNT(GetNewerLibrary), // Hint
1813 __kmp_msg_null // Variadic argument list terminator
1814 );
1815 } break;
1816 } // switch
1817 if (p_last)
1818 *p_last = last;
1819 #ifdef KMP_DEBUG
1820 if (pr->flags.ordered) {
1821 char *buff;
1822 // create format specifiers before the debug output
1823 buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1824 "ordered_lower:%%%s ordered_upper:%%%s\n",
1825 traits_t<UT>::spec, traits_t<UT>::spec);
1826 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1827 __kmp_str_free(&buff);
1828 }
1829 {
1830 char *buff;
1831 // create format specifiers before the debug output
1832 buff = __kmp_str_format(
1833 "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1834 "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1835 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1836 KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1837 __kmp_str_free(&buff);
1838 }
1839 #endif
1840 return status;
1841 }
1842
1843 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1844 work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1845 is not called. */
1846 #if OMPT_SUPPORT && OMPT_OPTIONAL
1847 #define OMPT_LOOP_END \
1848 if (status == 0) { \
1849 if (ompt_enabled.ompt_callback_work) { \
1850 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1851 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); \
1852 ompt_callbacks.ompt_callback(ompt_callback_work)( \
1853 ompt_work_loop, ompt_scope_end, &(team_info->parallel_data), \
1854 &(task_info->task_data), 0, codeptr); \
1855 } \
1856 }
1857 // TODO: implement count
1858 #else
1859 #define OMPT_LOOP_END // no-op
1860 #endif
1861
1862 #if KMP_STATS_ENABLED
1863 #define KMP_STATS_LOOP_END \
1864 { \
1865 kmp_int64 u, l, t, i; \
1866 l = (kmp_int64)(*p_lb); \
1867 u = (kmp_int64)(*p_ub); \
1868 i = (kmp_int64)(pr->u.p.st); \
1869 if (status == 0) { \
1870 t = 0; \
1871 KMP_POP_PARTITIONED_TIMER(); \
1872 } else if (i == 1) { \
1873 if (u >= l) \
1874 t = u - l + 1; \
1875 else \
1876 t = 0; \
1877 } else if (i < 0) { \
1878 if (l >= u) \
1879 t = (l - u) / (-i) + 1; \
1880 else \
1881 t = 0; \
1882 } else { \
1883 if (u >= l) \
1884 t = (u - l) / i + 1; \
1885 else \
1886 t = 0; \
1887 } \
1888 KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t); \
1889 }
1890 #else
1891 #define KMP_STATS_LOOP_END /* Nothing */
1892 #endif
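// Worked examples for the trip-count computation in KMP_STATS_LOOP_END above
// (illustrative only):
//   i = 1,  l = 3,  u = 7   -> t = 7 - 3 + 1       = 5
//   i = 3,  l = 0,  u = 10  -> t = (10 - 0)/3 + 1  = 4   (0, 3, 6, 9)
//   i = -2, l = 10, u = 2   -> t = (10 - 2)/2 + 1  = 5   (10, 8, 6, 4, 2)
// and t = 0 whenever status == 0 or the bounds describe an empty range.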
1893
1894 template <typename T>
1895 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1896 T *p_lb, T *p_ub,
1897 typename traits_t<T>::signed_t *p_st
1898 #if OMPT_SUPPORT && OMPT_OPTIONAL
1899 ,
1900 void *codeptr
1901 #endif
1902 ) {
1903
1904 typedef typename traits_t<T>::unsigned_t UT;
1905 typedef typename traits_t<T>::signed_t ST;
1906 // This is potentially slightly misleading: schedule(runtime) will appear here
1907 // even if the actual runtime schedule is static. (Which points out a
1908 // disadvantage of schedule(runtime): even when static scheduling is used it
1909 // costs more than a compile time choice to use static scheduling would.)
1910 KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1911
1912 int status;
1913 dispatch_private_info_template<T> *pr;
1914 __kmp_assert_valid_gtid(gtid);
1915 kmp_info_t *th = __kmp_threads[gtid];
1916 kmp_team_t *team = th->th.th_team;
1917
1918 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1919 KD_TRACE(
1920 1000,
1921 ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1922 gtid, p_lb, p_ub, p_st, p_last));
1923
1924 if (team->t.t_serialized) {
1925 /* NOTE: serialize this dispatch because we are not at the active level */
1926 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1927 th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1928 KMP_DEBUG_ASSERT(pr);
1929
1930 if ((status = (pr->u.p.tc != 0)) == 0) {
1931 *p_lb = 0;
1932 *p_ub = 0;
1933 // if ( p_last != NULL )
1934 // *p_last = 0;
1935 if (p_st != NULL)
1936 *p_st = 0;
1937 if (__kmp_env_consistency_check) {
1938 if (pr->pushed_ws != ct_none) {
1939 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1940 }
1941 }
1942 } else if (pr->flags.nomerge) {
1943 kmp_int32 last;
1944 T start;
1945 UT limit, trip, init;
1946 ST incr;
1947 T chunk = pr->u.p.parm1;
1948
1949 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1950 gtid));
1951
1952 init = chunk * pr->u.p.count++;
1953 trip = pr->u.p.tc - 1;
1954
1955 if ((status = (init <= trip)) == 0) {
1956 *p_lb = 0;
1957 *p_ub = 0;
1958 // if ( p_last != NULL )
1959 // *p_last = 0;
1960 if (p_st != NULL)
1961 *p_st = 0;
1962 if (__kmp_env_consistency_check) {
1963 if (pr->pushed_ws != ct_none) {
1964 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1965 }
1966 }
1967 } else {
1968 start = pr->u.p.lb;
1969 limit = chunk + init - 1;
1970 incr = pr->u.p.st;
1971
1972 if ((last = (limit >= trip)) != 0) {
1973 limit = trip;
1974 #if KMP_OS_WINDOWS
1975 pr->u.p.last_upper = pr->u.p.ub;
1976 #endif /* KMP_OS_WINDOWS */
1977 }
1978 if (p_last != NULL)
1979 *p_last = last;
1980 if (p_st != NULL)
1981 *p_st = incr;
1982 if (incr == 1) {
1983 *p_lb = start + init;
1984 *p_ub = start + limit;
1985 } else {
1986 *p_lb = start + init * incr;
1987 *p_ub = start + limit * incr;
1988 }
1989
1990 if (pr->flags.ordered) {
1991 pr->u.p.ordered_lower = init;
1992 pr->u.p.ordered_upper = limit;
1993 #ifdef KMP_DEBUG
1994 {
1995 char *buff;
1996 // create format specifiers before the debug output
1997 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1998 "ordered_lower:%%%s ordered_upper:%%%s\n",
1999 traits_t<UT>::spec, traits_t<UT>::spec);
2000 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2001 pr->u.p.ordered_upper));
2002 __kmp_str_free(&buff);
2003 }
2004 #endif
2005 } // if
2006 } // if
2007 } else {
2008 pr->u.p.tc = 0;
2009 *p_lb = pr->u.p.lb;
2010 *p_ub = pr->u.p.ub;
2011 #if KMP_OS_WINDOWS
2012 pr->u.p.last_upper = *p_ub;
2013 #endif /* KMP_OS_WINDOWS */
2014 if (p_last != NULL)
2015 *p_last = TRUE;
2016 if (p_st != NULL)
2017 *p_st = pr->u.p.st;
2018 } // if
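// Worked example for the serialized nomerge path above (illustrative only):
// with chunk = 4 and tc = 10 (so trip = 9), successive calls return
//   count 0 -> init 0,  limit 3 -> [lb + 0*st, lb + 3*st]
//   count 1 -> init 4,  limit 7 -> [lb + 4*st, lb + 7*st]
//   count 2 -> init 8,  limit 9 -> [lb + 8*st, lb + 9*st]  (last chunk, clamped)
//   count 3 -> init 12 > trip  -> status 0, no more work.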
2019 #ifdef KMP_DEBUG
2020 {
2021 char *buff;
2022 // create format specifiers before the debug output
2023 buff = __kmp_str_format(
2024 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2025 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
2026 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2027 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
2028 (p_last ? *p_last : 0), status));
2029 __kmp_str_free(&buff);
2030 }
2031 #endif
2032 #if INCLUDE_SSC_MARKS
2033 SSC_MARK_DISPATCH_NEXT();
2034 #endif
2035 OMPT_LOOP_END;
2036 KMP_STATS_LOOP_END;
2037 return status;
2038 } else {
2039 kmp_int32 last = 0;
2040 dispatch_shared_info_template<T> volatile *sh;
2041
2042 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2043 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2044
2045 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2046 th->th.th_dispatch->th_dispatch_pr_current);
2047 KMP_DEBUG_ASSERT(pr);
2048 sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2049 th->th.th_dispatch->th_dispatch_sh_current);
2050 KMP_DEBUG_ASSERT(sh);
2051
2052 #if KMP_USE_HIER_SCHED
2053 if (pr->flags.use_hier)
2054 status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2055 else
2056 #endif // KMP_USE_HIER_SCHED
2057 status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2058 p_st, th->th.th_team_nproc,
2059 th->th.th_info.ds.ds_tid);
2060 // status == 0: no more iterations to execute
2061 if (status == 0) {
2062 UT num_done;
2063
2064 num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2065 #ifdef KMP_DEBUG
2066 {
2067 char *buff;
2068 // create format specifiers before the debug output
2069 buff = __kmp_str_format(
2070 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2071 traits_t<UT>::spec);
2072 KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2073 __kmp_str_free(&buff);
2074 }
2075 #endif
2076
2077 #if KMP_USE_HIER_SCHED
2078 pr->flags.use_hier = FALSE;
2079 #endif
2080 if ((ST)num_done == th->th.th_team_nproc - 1) {
2081 #if (KMP_STATIC_STEAL_ENABLED)
2082 if (pr->schedule == kmp_sch_static_steal &&
2083 traits_t<T>::type_size > 4) {
2084 int i;
2085 int idx = (th->th.th_dispatch->th_disp_index - 1) %
2086 __kmp_dispatch_num_buffers; // current loop index
2087 kmp_info_t **other_threads = team->t.t_threads;
2088 // loop complete, safe to destroy locks used for stealing
2089 for (i = 0; i < th->th.th_team_nproc; ++i) {
2090 dispatch_private_info_template<T> *buf =
2091 reinterpret_cast<dispatch_private_info_template<T> *>(
2092 &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]);
2093 kmp_lock_t *lck = buf->u.p.th_steal_lock;
2094 KMP_ASSERT(lck != NULL);
2095 __kmp_destroy_lock(lck);
2096 __kmp_free(lck);
2097 buf->u.p.th_steal_lock = NULL;
2098 }
2099 }
2100 #endif
2101 /* NOTE: release this buffer to be reused */
2102
2103 KMP_MB(); /* Flush all pending memory write invalidates. */
2104
2105 sh->u.s.num_done = 0;
2106 sh->u.s.iteration = 0;
2107
2108 /* TODO replace with general release procedure? */
2109 if (pr->flags.ordered) {
2110 sh->u.s.ordered_iteration = 0;
2111 }
2112
2113 KMP_MB(); /* Flush all pending memory write invalidates. */
2114
2115 sh->buffer_index += __kmp_dispatch_num_buffers;
2116 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2117 gtid, sh->buffer_index));
2118
2119 KMP_MB(); /* Flush all pending memory write invalidates. */
2120
2121 } // if
2122 if (__kmp_env_consistency_check) {
2123 if (pr->pushed_ws != ct_none) {
2124 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2125 }
2126 }
2127
2128 th->th.th_dispatch->th_deo_fcn = NULL;
2129 th->th.th_dispatch->th_dxo_fcn = NULL;
2130 th->th.th_dispatch->th_dispatch_sh_current = NULL;
2131 th->th.th_dispatch->th_dispatch_pr_current = NULL;
2132 } // if (status == 0)
2133 #if KMP_OS_WINDOWS
2134 else if (last) {
2135 pr->u.p.last_upper = pr->u.p.ub;
2136 }
2137 #endif /* KMP_OS_WINDOWS */
2138 if (p_last != NULL && status != 0)
2139 *p_last = last;
2140 } // if
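// Added commentary on the completion protocol above: each thread that sees
// status == 0 atomically increments sh->u.s.num_done; the thread that observes
// the pre-increment value nproc - 1 is the last to finish, so it resets the
// shared fields and advances sh->buffer_index by __kmp_dispatch_num_buffers so
// the slot can be reused by a later loop. Every finishing thread then detaches
// its dispatch function pointers and shared/private buffer pointers.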
2141
2142 #ifdef KMP_DEBUG
2143 {
2144 char *buff;
2145 // create format specifiers before the debug output
2146 buff = __kmp_str_format(
2147 "__kmp_dispatch_next: T#%%d normal case: "
2148 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2149 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2150 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2151 (p_last ? *p_last : 0), status));
2152 __kmp_str_free(&buff);
2153 }
2154 #endif
2155 #if INCLUDE_SSC_MARKS
2156 SSC_MARK_DISPATCH_NEXT();
2157 #endif
2158 OMPT_LOOP_END;
2159 KMP_STATS_LOOP_END;
2160 return status;
2161 }
2162
2163 template <typename T>
2164 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2165 kmp_int32 *plastiter, T *plower, T *pupper,
2166 typename traits_t<T>::signed_t incr) {
2167 typedef typename traits_t<T>::unsigned_t UT;
2168 kmp_uint32 team_id;
2169 kmp_uint32 nteams;
2170 UT trip_count;
2171 kmp_team_t *team;
2172 kmp_info_t *th;
2173
2174 KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2175 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2176 #ifdef KMP_DEBUG
2177 typedef typename traits_t<T>::signed_t ST;
2178 {
2179 char *buff;
2180 // create format specifiers before the debug output
2181 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2182 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2183 traits_t<T>::spec, traits_t<T>::spec,
2184 traits_t<ST>::spec, traits_t<T>::spec);
2185 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2186 __kmp_str_free(&buff);
2187 }
2188 #endif
2189
2190 if (__kmp_env_consistency_check) {
2191 if (incr == 0) {
2192 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2193 loc);
2194 }
2195 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2196 // The loop is illegal.
2197 // Some zero-trip loops are maintained by the compiler, e.g.:
2198 // for(i=10;i<0;++i) // lower >= upper - run-time check
2199 // for(i=0;i>10;--i) // lower <= upper - run-time check
2200 // for(i=0;i>10;++i) // incr > 0 - compile-time check
2201 // for(i=10;i<0;--i) // incr < 0 - compile-time check
2202 // Compiler does not check the following illegal loops:
2203 // for(i=0;i<10;i+=incr) // where incr<0
2204 // for(i=10;i>0;i-=incr) // where incr<0
2205 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2206 }
2207 }
2208 __kmp_assert_valid_gtid(gtid);
2209 th = __kmp_threads[gtid];
2210 team = th->th.th_team;
2211 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2212 nteams = th->th.th_teams_size.nteams;
2213 team_id = team->t.t_master_tid;
2214 KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2215
2216 // compute global trip count
2217 if (incr == 1) {
2218 trip_count = *pupper - *plower + 1;
2219 } else if (incr == -1) {
2220 trip_count = *plower - *pupper + 1;
2221 } else if (incr > 0) {
2222 // upper-lower can exceed the limit of signed type
2223 trip_count = (UT)(*pupper - *plower) / incr + 1;
2224 } else {
2225 trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2226 }
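// Worked examples for the trip count above (illustrative only):
//   lb = 0,  ub = 9,  incr = 1  -> trip_count = 10
//   lb = 9,  ub = 0,  incr = -1 -> trip_count = 10
//   lb = 0,  ub = 10, incr = 3  -> trip_count = (10 - 0)/3 + 1 = 4  (0, 3, 6, 9)
//   lb = 10, ub = 1,  incr = -4 -> trip_count = (10 - 1)/4 + 1 = 3  (10, 6, 2)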
2227
2228 if (trip_count <= nteams) {
2229 KMP_DEBUG_ASSERT(
2230 __kmp_static == kmp_sch_static_greedy ||
2231 __kmp_static ==
2232 kmp_sch_static_balanced); // Unknown static scheduling type.
2233 // only some teams get single iteration, others get nothing
2234 if (team_id < trip_count) {
2235 *pupper = *plower = *plower + team_id * incr;
2236 } else {
2237 *plower = *pupper + incr; // zero-trip loop
2238 }
2239 if (plastiter != NULL)
2240 *plastiter = (team_id == trip_count - 1);
2241 } else {
2242 if (__kmp_static == kmp_sch_static_balanced) {
2243 UT chunk = trip_count / nteams;
2244 UT extras = trip_count % nteams;
2245 *plower +=
2246 incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2247 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2248 if (plastiter != NULL)
2249 *plastiter = (team_id == nteams - 1);
2250 } else {
2251 T chunk_inc_count =
2252 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2253 T upper = *pupper;
2254 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2255 // Unknown static scheduling type.
2256 *plower += team_id * chunk_inc_count;
2257 *pupper = *plower + chunk_inc_count - incr;
2258 // Check/correct bounds if needed
2259 if (incr > 0) {
2260 if (*pupper < *plower)
2261 *pupper = traits_t<T>::max_value;
2262 if (plastiter != NULL)
2263 *plastiter = *plower <= upper && *pupper > upper - incr;
2264 if (*pupper > upper)
2265 *pupper = upper; // tracker C73258
2266 } else {
2267 if (*pupper > *plower)
2268 *pupper = traits_t<T>::min_value;
2269 if (plastiter != NULL)
2270 *plastiter = *plower >= upper && *pupper < upper - incr;
2271 if (*pupper < upper)
2272 *pupper = upper; // tracker C73258
2273 }
2274 }
2275 }
2276 }
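// Worked example for the balanced split above (illustrative only): with
// trip_count = 10, nteams = 4, incr = 1, *plower = 0 and
// __kmp_static == kmp_sch_static_balanced, chunk = 2 and extras = 2, so
//   team 0 -> [0, 2]   team 1 -> [3, 5]   team 2 -> [6, 7]   team 3 -> [8, 9]
// i.e. the first 'extras' teams receive one extra iteration and only the last
// team sets *plastiter.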
2277
2278 //-----------------------------------------------------------------------------
2279 // Dispatch routines
2280 // Transfer call to template< type T >
2281 // __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2282 // T lb, T ub, ST st, ST chunk )
2283 extern "C" {
2284
2285 /*!
2286 @ingroup WORK_SHARING
2287 @{
2288 @param loc Source location
2289 @param gtid Global thread id
2290 @param schedule Schedule type
2291 @param lb Lower bound
2292 @param ub Upper bound
2293 @param st Step (or increment if you prefer)
2294 @param chunk The chunk size to block with
2295
2296 This function prepares the runtime to start a dynamically scheduled for loop,
2297 saving the loop arguments.
2298 These functions are all identical apart from the types of the arguments.
2299 */
2300
2301 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2302 enum sched_type schedule, kmp_int32 lb,
2303 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2304 KMP_DEBUG_ASSERT(__kmp_init_serial);
2305 #if OMPT_SUPPORT && OMPT_OPTIONAL
2306 OMPT_STORE_RETURN_ADDRESS(gtid);
2307 #endif
2308 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2309 }
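/* Illustrative only (assumption, not part of the runtime): a minimal sketch of
   how a compiler might lower "#pragma omp for schedule(dynamic, chunk)" onto
   the dispatch entry points in this file. The wrapper function and its
   parameter names are hypothetical; ordered loops would additionally involve
   __kmpc_dispatch_fini_4. */
#if 0
static void example_dynamic_loop_lowering(ident_t *loc, kmp_int32 gtid,
                                          kmp_int32 n, kmp_int32 chunk) {
  kmp_int32 lb = 0, ub = 0, st = 0, last = 0;
  // Register the loop [0, n-1] with unit stride and the requested chunk size.
  __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1,
                         chunk);
  // Repeatedly grab chunks until the runtime reports that no work remains.
  while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
    for (kmp_int32 i = lb; i <= ub; i += st) {
      /* ... loop body for iteration i ... */
    }
  }
}
#endif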
2310 /*!
2311 See @ref __kmpc_dispatch_init_4
2312 */
2313 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2314 enum sched_type schedule, kmp_uint32 lb,
2315 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2316 KMP_DEBUG_ASSERT(__kmp_init_serial);
2317 #if OMPT_SUPPORT && OMPT_OPTIONAL
2318 OMPT_STORE_RETURN_ADDRESS(gtid);
2319 #endif
2320 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2321 }
2322
2323 /*!
2324 See @ref __kmpc_dispatch_init_4
2325 */
2326 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2327 enum sched_type schedule, kmp_int64 lb,
2328 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2329 KMP_DEBUG_ASSERT(__kmp_init_serial);
2330 #if OMPT_SUPPORT && OMPT_OPTIONAL
2331 OMPT_STORE_RETURN_ADDRESS(gtid);
2332 #endif
2333 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2334 }
2335
2336 /*!
2337 See @ref __kmpc_dispatch_init_4
2338 */
2339 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2340 enum sched_type schedule, kmp_uint64 lb,
2341 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2342 KMP_DEBUG_ASSERT(__kmp_init_serial);
2343 #if OMPT_SUPPORT && OMPT_OPTIONAL
2344 OMPT_STORE_RETURN_ADDRESS(gtid);
2345 #endif
2346 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2347 }
2348
2349 /*!
2350 See @ref __kmpc_dispatch_init_4
2351
2352 These functions differ from the __kmpc_dispatch_init set in that they are
2353 called for the composite distribute parallel for construct. Thus, before
2354 dispatching the regular iterations, the per-team iteration space must be computed.
2355
2356 These functions are all identical apart from the types of the arguments.
2357 */
2358 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2359 enum sched_type schedule, kmp_int32 *p_last,
2360 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2361 kmp_int32 chunk) {
2362 KMP_DEBUG_ASSERT(__kmp_init_serial);
2363 #if OMPT_SUPPORT && OMPT_OPTIONAL
2364 OMPT_STORE_RETURN_ADDRESS(gtid);
2365 #endif
2366 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2367 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2368 }
2369
2370 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2371 enum sched_type schedule, kmp_int32 *p_last,
2372 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2373 kmp_int32 chunk) {
2374 KMP_DEBUG_ASSERT(__kmp_init_serial);
2375 #if OMPT_SUPPORT && OMPT_OPTIONAL
2376 OMPT_STORE_RETURN_ADDRESS(gtid);
2377 #endif
2378 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2379 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2380 }
2381
2382 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2383 enum sched_type schedule, kmp_int32 *p_last,
2384 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2385 kmp_int64 chunk) {
2386 KMP_DEBUG_ASSERT(__kmp_init_serial);
2387 #if OMPT_SUPPORT && OMPT_OPTIONAL
2388 OMPT_STORE_RETURN_ADDRESS(gtid);
2389 #endif
2390 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2391 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2392 }
2393
2394 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2395 enum sched_type schedule, kmp_int32 *p_last,
2396 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2397 kmp_int64 chunk) {
2398 KMP_DEBUG_ASSERT(__kmp_init_serial);
2399 #if OMPT_SUPPORT && OMPT_OPTIONAL
2400 OMPT_STORE_RETURN_ADDRESS(gtid);
2401 #endif
2402 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2403 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2404 }
2405
2406 /*!
2407 @param loc Source code location
2408 @param gtid Global thread id
2409 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2410 otherwise
2411 @param p_lb Pointer to the lower bound for the next chunk of work
2412 @param p_ub Pointer to the upper bound for the next chunk of work
2413 @param p_st Pointer to the stride for the next chunk of work
2414 @return one if there is work to be done, zero otherwise
2415
2416 Get the next dynamically allocated chunk of work for this thread.
2417 If there is no more work, then lb, ub and stride need not be modified.
2418 */
2419 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2420 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2421 #if OMPT_SUPPORT && OMPT_OPTIONAL
2422 OMPT_STORE_RETURN_ADDRESS(gtid);
2423 #endif
2424 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2425 #if OMPT_SUPPORT && OMPT_OPTIONAL
2426 ,
2427 OMPT_LOAD_RETURN_ADDRESS(gtid)
2428 #endif
2429 );
2430 }
2431
2432 /*!
2433 See @ref __kmpc_dispatch_next_4
2434 */
2435 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2436 kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2437 kmp_int32 *p_st) {
2438 #if OMPT_SUPPORT && OMPT_OPTIONAL
2439 OMPT_STORE_RETURN_ADDRESS(gtid);
2440 #endif
2441 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2442 #if OMPT_SUPPORT && OMPT_OPTIONAL
2443 ,
2444 OMPT_LOAD_RETURN_ADDRESS(gtid)
2445 #endif
2446 );
2447 }
2448
2449 /*!
2450 See @ref __kmpc_dispatch_next_4
2451 */
2452 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2453 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2454 #if OMPT_SUPPORT && OMPT_OPTIONAL
2455 OMPT_STORE_RETURN_ADDRESS(gtid);
2456 #endif
2457 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2458 #if OMPT_SUPPORT && OMPT_OPTIONAL
2459 ,
2460 OMPT_LOAD_RETURN_ADDRESS(gtid)
2461 #endif
2462 );
2463 }
2464
2465 /*!
2466 See @ref __kmpc_dispatch_next_4
2467 */
2468 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2469 kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2470 kmp_int64 *p_st) {
2471 #if OMPT_SUPPORT && OMPT_OPTIONAL
2472 OMPT_STORE_RETURN_ADDRESS(gtid);
2473 #endif
2474 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2475 #if OMPT_SUPPORT && OMPT_OPTIONAL
2476 ,
2477 OMPT_LOAD_RETURN_ADDRESS(gtid)
2478 #endif
2479 );
2480 }
2481
2482 /*!
2483 @param loc Source code location
2484 @param gtid Global thread id
2485
2486 Mark the end of a dynamic loop.
2487 */
2488 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2489 __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2490 }
2491
2492 /*!
2493 See @ref __kmpc_dispatch_fini_4
2494 */
2495 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2496 __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2497 }
2498
2499 /*!
2500 See @ref __kmpc_dispatch_fini_4
2501 */
2502 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2503 __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2504 }
2505
2506 /*!
2507 See @ref __kmpc_dispatch_fini_4
2508 */
2509 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2510 __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2511 }
2512 /*! @} */
2513
2514 //-----------------------------------------------------------------------------
2515 // Non-template routines from kmp_dispatch.cpp used in other sources
2516
2517 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2518 return value == checker;
2519 }
2520
2521 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2522 return value != checker;
2523 }
2524
2525 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2526 return value < checker;
2527 }
2528
2529 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2530 return value >= checker;
2531 }
2532
2533 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2534 return value <= checker;
2535 }
2536
2537 kmp_uint32
2538 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2539 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2540 void *obj // Higher-level synchronization object, or NULL.
2541 ) {
2542 // note: we may not belong to a team at this point
2543 volatile kmp_uint32 *spin = spinner;
2544 kmp_uint32 check = checker;
2545 kmp_uint32 spins;
2546 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2547 kmp_uint32 r;
2548
2549 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2550 KMP_INIT_YIELD(spins);
2551 // main wait spin loop
2552 while (!f(r = TCR_4(*spin), check)) {
2553 KMP_FSYNC_SPIN_PREPARE(obj);
2554 /* GEH - remove this since it was accidentally introduced when kmp_wait was
2555 split. It causes problems with infinite recursion because of exit lock */
2556 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2557 __kmp_abort_thread(); */
2558 KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2559 }
2560 KMP_FSYNC_SPIN_ACQUIRED(obj);
2561 return r;
2562 }
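/* Illustrative only (assumption): __kmp_wait_4 is typically paired with one of
   the 4-byte predicates defined above, e.g. spinning until a shared counter
   reaches a value. The helper below is hypothetical and not compiled. */
#if 0
static kmp_uint32 example_wait_for_value(volatile kmp_uint32 *counter,
                                         kmp_uint32 value) {
  // Spin (yielding under oversubscription) until *counter == value, then
  // return the observed value.
  return __kmp_wait_4(counter, value, __kmp_eq_4, NULL);
}
#endif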
2563
2564 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2565 kmp_uint32 (*pred)(void *, kmp_uint32),
2566 void *obj // Higher-level synchronization object, or NULL.
2567 ) {
2568 // note: we may not belong to a team at this point
2569 void *spin = spinner;
2570 kmp_uint32 check = checker;
2571 kmp_uint32 spins;
2572 kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2573
2574 KMP_FSYNC_SPIN_INIT(obj, spin);
2575 KMP_INIT_YIELD(spins);
2576 // main wait spin loop
2577 while (!f(spin, check)) {
2578 KMP_FSYNC_SPIN_PREPARE(obj);
2579 /* if we have waited a bit, or are oversubscribed, yield */
2580 /* pause is in the following code */
2581 KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2582 }
2583 KMP_FSYNC_SPIN_ACQUIRED(obj);
2584 }
2585
2586 } // extern "C"
2587
2588 #ifdef KMP_GOMP_COMPAT
2589
2590 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2591 enum sched_type schedule, kmp_int32 lb,
2592 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2593 int push_ws) {
2594 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2595 push_ws);
2596 }
2597
2598 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2599 enum sched_type schedule, kmp_uint32 lb,
2600 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2601 int push_ws) {
2602 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2603 push_ws);
2604 }
2605
2606 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2607 enum sched_type schedule, kmp_int64 lb,
2608 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2609 int push_ws) {
2610 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2611 push_ws);
2612 }
2613
2614 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2615 enum sched_type schedule, kmp_uint64 lb,
2616 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2617 int push_ws) {
2618 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2619 push_ws);
2620 }
2621
2622 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2623 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2624 }
2625
2626 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2627 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2628 }
2629
2630 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2631 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2632 }
2633
2634 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2635 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2636 }
2637
2638 #endif /* KMP_GOMP_COMPAT */
2639
2640 /* ------------------------------------------------------------------------ */
2641