1 /*
2  * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3  */
4 
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12 
13 /* Dynamic scheduling initialization and dispatch.
14  *
15  * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
16  *       it may change values between parallel regions.  __kmp_max_nth
17  *       is the largest value __kmp_nth may take, 1 is the smallest.
18  */
19 
20 #include "kmp.h"
21 #include "kmp_error.h"
22 #include "kmp_i18n.h"
23 #include "kmp_itt.h"
24 #include "kmp_stats.h"
25 #include "kmp_str.h"
26 #if KMP_USE_X87CONTROL
27 #include <float.h>
28 #endif
29 #include "kmp_lock.h"
30 #include "kmp_dispatch.h"
31 #if KMP_USE_HIER_SCHED
32 #include "kmp_dispatch_hier.h"
33 #endif
34 
35 #if OMPT_SUPPORT
36 #include "ompt-specific.h"
37 #endif
38 
39 /* ------------------------------------------------------------------------ */
40 /* ------------------------------------------------------------------------ */
41 
42 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
43   kmp_info_t *th;
44 
45   KMP_DEBUG_ASSERT(gtid_ref);
46 
47   if (__kmp_env_consistency_check) {
48     th = __kmp_threads[*gtid_ref];
49     if (th->th.th_root->r.r_active &&
50         (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
51 #if KMP_USE_DYNAMIC_LOCK
52       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
53 #else
54       __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
55 #endif
56     }
57   }
58 }
59 
60 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
61   kmp_info_t *th;
62 
63   if (__kmp_env_consistency_check) {
64     th = __kmp_threads[*gtid_ref];
65     if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
66       __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
67     }
68   }
69 }
70 
71 // Returns either SCHEDULE_MONOTONIC or SCHEDULE_NONMONOTONIC
72 static inline int __kmp_get_monotonicity(ident_t *loc, enum sched_type schedule,
73                                          bool use_hier = false) {
74   // Pick up the nonmonotonic/monotonic bits from the scheduling type
75   // TODO: make nonmonotonic when static_steal is fixed
76   int monotonicity = SCHEDULE_MONOTONIC;
77 
78   // Let default be monotonic for executables
79   // compiled with OpenMP* 4.5 or less compilers
80   if (loc->get_openmp_version() < 50)
81     monotonicity = SCHEDULE_MONOTONIC;
82 
83   if (use_hier)
84     monotonicity = SCHEDULE_MONOTONIC;
85   else if (SCHEDULE_HAS_NONMONOTONIC(schedule))
86     monotonicity = SCHEDULE_NONMONOTONIC;
87   else if (SCHEDULE_HAS_MONOTONIC(schedule))
88     monotonicity = SCHEDULE_MONOTONIC;
89 
90   return monotonicity;
91 }
92 
93 // Initialize a dispatch_private_info_template<T> buffer for a particular
94 // type of schedule,chunk.  The loop description is found in lb (lower bound),
95 // ub (upper bound), and st (stride).  nproc is the number of threads relevant
96 // to the scheduling (often the number of threads in a team, but not always if
97 // hierarchical scheduling is used).  tid is the id of the thread calling
98 // the function within the group of nproc threads.  It will have a value
99 // between 0 and nproc - 1.  This is often just the thread id within a team, but
100 // is not necessarily the case when using hierarchical scheduling.
101 // loc is the source file location of the corresponding loop
102 // gtid is the global thread id
103 template <typename T>
104 void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
105                                    dispatch_private_info_template<T> *pr,
106                                    enum sched_type schedule, T lb, T ub,
107                                    typename traits_t<T>::signed_t st,
108 #if USE_ITT_BUILD
109                                    kmp_uint64 *cur_chunk,
110 #endif
111                                    typename traits_t<T>::signed_t chunk,
112                                    T nproc, T tid) {
113   typedef typename traits_t<T>::unsigned_t UT;
114   typedef typename traits_t<T>::floating_t DBL;
115 
116   int active;
117   T tc;
118   kmp_info_t *th;
119   kmp_team_t *team;
120   int monotonicity;
121   bool use_hier;
122 
123 #ifdef KMP_DEBUG
124   typedef typename traits_t<T>::signed_t ST;
125   {
126     char *buff;
127     // create format specifiers before the debug output
128     buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d called "
129                             "pr:%%p lb:%%%s ub:%%%s st:%%%s "
130                             "schedule:%%d chunk:%%%s nproc:%%%s tid:%%%s\n",
131                             traits_t<T>::spec, traits_t<T>::spec,
132                             traits_t<ST>::spec, traits_t<ST>::spec,
133                             traits_t<T>::spec, traits_t<T>::spec);
134     KD_TRACE(10, (buff, gtid, pr, lb, ub, st, schedule, chunk, nproc, tid));
135     __kmp_str_free(&buff);
136   }
137 #endif
138   /* setup data */
139   th = __kmp_threads[gtid];
140   team = th->th.th_team;
141   active = !team->t.t_serialized;
142 
143 #if USE_ITT_BUILD
144   int itt_need_metadata_reporting =
145       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
146       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
147       team->t.t_active_level == 1;
148 #endif
149 
150 #if KMP_USE_HIER_SCHED
151   use_hier = pr->flags.use_hier;
152 #else
153   use_hier = false;
154 #endif
155 
156   /* Pick up the nonmonotonic/monotonic bits from the scheduling type */
157   monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
158   schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
159 
160   /* Pick up the nomerge/ordered bits from the scheduling type */
161   if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
162     pr->flags.nomerge = TRUE;
163     schedule =
164         (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
165   } else {
166     pr->flags.nomerge = FALSE;
167   }
168   pr->type_size = traits_t<T>::type_size; // remember the size of variables
169   if (kmp_ord_lower & schedule) {
170     pr->flags.ordered = TRUE;
171     schedule =
172         (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
173   } else {
174     pr->flags.ordered = FALSE;
175   }
176   // Ordered overrides nonmonotonic
177   if (pr->flags.ordered) {
178     monotonicity = SCHEDULE_MONOTONIC;
179   }
180 
181   if (schedule == kmp_sch_static) {
182     schedule = __kmp_static;
183   } else {
184     if (schedule == kmp_sch_runtime) {
185       // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
186       // not specified)
187       schedule = team->t.t_sched.r_sched_type;
188       monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
189       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
190       // Detail the schedule if needed (global controls are differentiated
191       // appropriately)
192       if (schedule == kmp_sch_guided_chunked) {
193         schedule = __kmp_guided;
194       } else if (schedule == kmp_sch_static) {
195         schedule = __kmp_static;
196       }
197       // Use the chunk size specified by OMP_SCHEDULE (or default if not
198       // specified)
199       chunk = team->t.t_sched.chunk;
200 #if USE_ITT_BUILD
201       if (cur_chunk)
202         *cur_chunk = chunk;
203 #endif
204 #ifdef KMP_DEBUG
205       {
206         char *buff;
207         // create format specifiers before the debug output
208         buff = __kmp_str_format("__kmp_dispatch_init_algorithm: T#%%d new: "
209                                 "schedule:%%d chunk:%%%s\n",
210                                 traits_t<ST>::spec);
211         KD_TRACE(10, (buff, gtid, schedule, chunk));
212         __kmp_str_free(&buff);
213       }
214 #endif
215     } else {
216       if (schedule == kmp_sch_guided_chunked) {
217         schedule = __kmp_guided;
218       }
219       if (chunk <= 0) {
220         chunk = KMP_DEFAULT_CHUNK;
221       }
222     }
223 
224     if (schedule == kmp_sch_auto) {
225       // mapping and differentiation: in the __kmp_do_serial_initialize()
226       schedule = __kmp_auto;
227 #ifdef KMP_DEBUG
228       {
229         char *buff;
230         // create format specifiers before the debug output
231         buff = __kmp_str_format(
232             "__kmp_dispatch_init_algorithm: kmp_sch_auto: T#%%d new: "
233             "schedule:%%d chunk:%%%s\n",
234             traits_t<ST>::spec);
235         KD_TRACE(10, (buff, gtid, schedule, chunk));
236         __kmp_str_free(&buff);
237       }
238 #endif
239     }
240 #if KMP_STATIC_STEAL_ENABLED
241     // map nonmonotonic:dynamic to static steal
242     if (schedule == kmp_sch_dynamic_chunked) {
243       if (monotonicity == SCHEDULE_NONMONOTONIC)
244         schedule = kmp_sch_static_steal;
245     }
246 #endif
247     /* guided analytical not safe for too many threads */
248     if (schedule == kmp_sch_guided_analytical_chunked && nproc > 1 << 20) {
249       schedule = kmp_sch_guided_iterative_chunked;
250       KMP_WARNING(DispatchManyThreads);
251     }
252     if (schedule == kmp_sch_runtime_simd) {
253       // compiler provides simd_width in the chunk parameter
254       schedule = team->t.t_sched.r_sched_type;
255       monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier);
256       schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
257       // Detail the schedule if needed (global controls are differentiated
258       // appropriately)
259       if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
260           schedule == __kmp_static) {
261         schedule = kmp_sch_static_balanced_chunked;
262       } else {
263         if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
264           schedule = kmp_sch_guided_simd;
265         }
266         chunk = team->t.t_sched.chunk * chunk;
267       }
268 #if USE_ITT_BUILD
269       if (cur_chunk)
270         *cur_chunk = chunk;
271 #endif
272 #ifdef KMP_DEBUG
273       {
274         char *buff;
275         // create format specifiers before the debug output
276         buff = __kmp_str_format(
277             "__kmp_dispatch_init_algorithm: T#%%d new: schedule:%%d"
278             " chunk:%%%s\n",
279             traits_t<ST>::spec);
280         KD_TRACE(10, (buff, gtid, schedule, chunk));
281         __kmp_str_free(&buff);
282       }
283 #endif
284     }
285     pr->u.p.parm1 = chunk;
286   }
287   KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
288               "unknown scheduling type");
289 
290   pr->u.p.count = 0;
291 
292   if (__kmp_env_consistency_check) {
293     if (st == 0) {
294       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
295                             (pr->flags.ordered ? ct_pdo_ordered : ct_pdo), loc);
296     }
297   }
298   // compute trip count
299   if (st == 1) { // most common case
300     if (ub >= lb) {
301       tc = ub - lb + 1;
302     } else { // ub < lb
303       tc = 0; // zero-trip
304     }
305   } else if (st < 0) {
306     if (lb >= ub) {
307       // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
308       // where the division needs to be unsigned regardless of the result type
309       tc = (UT)(lb - ub) / (-st) + 1;
310     } else { // lb < ub
311       tc = 0; // zero-trip
312     }
313   } else { // st > 0
314     if (ub >= lb) {
315       // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
316       // where the division needs to be unsigned regardless of the result type
317       tc = (UT)(ub - lb) / st + 1;
318     } else { // ub < lb
319       tc = 0; // zero-trip
320     }
321   }
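  // For example: lb=0, ub=9, st=3 gives tc = 9/3 + 1 = 4 iterations
  // (0, 3, 6, 9); lb=10, ub=1, st=-4 gives tc = 9/4 + 1 = 3 (10, 6, 2).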
322 
323 #if KMP_STATS_ENABLED
324   if (KMP_MASTER_GTID(gtid)) {
325     KMP_COUNT_VALUE(OMP_loop_dynamic_total_iterations, tc);
326   }
327 #endif
328 
329   pr->u.p.lb = lb;
330   pr->u.p.ub = ub;
331   pr->u.p.st = st;
332   pr->u.p.tc = tc;
333 
334 #if KMP_OS_WINDOWS
335   pr->u.p.last_upper = ub + st;
336 #endif /* KMP_OS_WINDOWS */
337 
338   /* NOTE: only the active parallel region(s) has active ordered sections */
339 
340   if (active) {
341     if (pr->flags.ordered) {
342       pr->ordered_bumped = 0;
343       pr->u.p.ordered_lower = 1;
344       pr->u.p.ordered_upper = 0;
345     }
346   }
347 
348   switch (schedule) {
349 #if (KMP_STATIC_STEAL_ENABLED)
350   case kmp_sch_static_steal: {
351     T ntc, init;
352 
353     KD_TRACE(100,
354              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_steal case\n",
355               gtid));
356 
357     ntc = (tc % chunk ? 1 : 0) + tc / chunk;
358     if (nproc > 1 && ntc >= nproc) {
359       KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL);
360       T id = tid;
361       T small_chunk, extras;
362 
363       small_chunk = ntc / nproc;
364       extras = ntc % nproc;
365 
366       init = id * small_chunk + (id < extras ? id : extras);
367       pr->u.p.count = init;
368       pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
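      // Illustrative example: ntc=10 chunks and nproc=4 give small_chunk=2,
      // extras=2, so threads 0..3 start with chunk ranges [0,3), [3,6),
      // [6,8), [8,10) in (count, ub) before any stealing takes place.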
369 
370       pr->u.p.parm2 = lb;
371       // parm3 is the number of times to attempt stealing which is
372       // proportional to the number of chunks per thread up until
373       // the maximum value of nproc.
374       pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc);
375       pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
376       pr->u.p.st = st;
377       if (traits_t<T>::type_size > 4) {
378         // AC: TODO: check if 16-byte CAS available and use it to
379         // improve performance (probably wait for explicit request
380         // before spending time on this).
381         // For now use dynamically allocated per-thread lock,
382         // free memory in __kmp_dispatch_next when status==0.
383         KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL);
384         pr->u.p.th_steal_lock =
385             (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
386         __kmp_init_lock(pr->u.p.th_steal_lock);
387       }
388       break;
389     } else {
390       /* too few chunks: switching to kmp_sch_dynamic_chunked */
391       schedule = kmp_sch_dynamic_chunked;
392       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d switching to "
393                      "kmp_sch_dynamic_chunked\n",
394                       gtid));
395       if (pr->u.p.parm1 <= 0)
396         pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
397       break;
398     } // if
399   } // case
400 #endif
401   case kmp_sch_static_balanced: {
402     T init, limit;
403 
404     KD_TRACE(
405         100,
406         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_balanced case\n",
407          gtid));
408 
409     if (nproc > 1) {
410       T id = tid;
411 
412       if (tc < nproc) {
413         if (id < tc) {
414           init = id;
415           limit = id;
416           pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
417         } else {
418           pr->u.p.count = 1; /* means no more chunks to execute */
419           pr->u.p.parm1 = FALSE;
420           break;
421         }
422       } else {
423         T small_chunk = tc / nproc;
424         T extras = tc % nproc;
425         init = id * small_chunk + (id < extras ? id : extras);
426         limit = init + small_chunk - (id < extras ? 0 : 1);
427         pr->u.p.parm1 = (id == nproc - 1);
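        // e.g. tc=10, nproc=4: small_chunk=2, extras=2, so threads 0..3 get
        // iteration offsets [0,2], [3,5], [6,7], [8,9] respectively.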
428       }
429     } else {
430       if (tc > 0) {
431         init = 0;
432         limit = tc - 1;
433         pr->u.p.parm1 = TRUE;
434       } else {
435         // zero trip count
436         pr->u.p.count = 1; /* means no more chunks to execute */
437         pr->u.p.parm1 = FALSE;
438         break;
439       }
440     }
441 #if USE_ITT_BUILD
442     // Calculate chunk for metadata report
443     if (itt_need_metadata_reporting)
444       if (cur_chunk)
445         *cur_chunk = limit - init + 1;
446 #endif
447     if (st == 1) {
448       pr->u.p.lb = lb + init;
449       pr->u.p.ub = lb + limit;
450     } else {
451       // calculated upper bound, "ub" is user-defined upper bound
452       T ub_tmp = lb + limit * st;
453       pr->u.p.lb = lb + init * st;
454       // adjust upper bound to "ub" if needed, so that MS lastprivate will match
455       // it exactly
456       if (st > 0) {
457         pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
458       } else {
459         pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
460       }
461     }
462     if (pr->flags.ordered) {
463       pr->u.p.ordered_lower = init;
464       pr->u.p.ordered_upper = limit;
465     }
466     break;
467   } // case
468   case kmp_sch_static_balanced_chunked: {
469     // similar to balanced, but chunk adjusted to multiple of simd width
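    // The mask below rounds the per-thread iteration count up to a multiple of
    // chunk (the simd width), which assumes chunk is a power of two. E.g.
    // tc=100, nth=8, chunk=8: ceil(100/8)=13, rounded up to parm1=16.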
470     T nth = nproc;
471     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d runtime(simd:static)"
472                    " -> falling-through to static_greedy\n",
473                    gtid));
474     schedule = kmp_sch_static_greedy;
475     if (nth > 1)
476       pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
477     else
478       pr->u.p.parm1 = tc;
479     break;
480   } // case
481   case kmp_sch_guided_simd:
482   case kmp_sch_guided_iterative_chunked: {
483     KD_TRACE(
484         100,
485         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_guided_iterative_chunked"
486          " case\n",
487          gtid));
488 
489     if (nproc > 1) {
490       if ((2L * chunk + 1) * nproc >= tc) {
491         /* chunk size too large, switch to dynamic */
492         schedule = kmp_sch_dynamic_chunked;
493       } else {
494         // when remaining iters become less than parm2 - switch to dynamic
495         pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
496         *(double *)&pr->u.p.parm3 =
497             guided_flt_param / nproc; // may occupy parm3 and parm4
498       }
499     } else {
500       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
501                      "kmp_sch_static_greedy\n",
502                      gtid));
503       schedule = kmp_sch_static_greedy;
504       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
505       KD_TRACE(
506           100,
507           ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
508            gtid));
509       pr->u.p.parm1 = tc;
510     } // if
511   } // case
512   break;
513   case kmp_sch_guided_analytical_chunked: {
514     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
515                    "kmp_sch_guided_analytical_chunked case\n",
516                    gtid));
517 
518     if (nproc > 1) {
519       if ((2L * chunk + 1) * nproc >= tc) {
520         /* chunk size too large, switch to dynamic */
521         schedule = kmp_sch_dynamic_chunked;
522       } else {
523         /* commonly used term: (2 nproc - 1)/(2 nproc) */
524         DBL x;
525 
526 #if KMP_USE_X87CONTROL
527         /* Linux* OS already has 64-bit computation by default for long double,
528            and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
529            Windows* OS on IA-32 architecture, we need to set precision to 64-bit
530            instead of the default 53-bit. Even though long double doesn't work
531            on Windows* OS on Intel(R) 64, the resulting lack of precision is not
532            expected to impact the correctness of the algorithm, but this has not
533            been mathematically proven. */
534         // save original FPCW and set precision to 64-bit, as
535         // Windows* OS on IA-32 architecture defaults to 53-bit
536         unsigned int oldFpcw = _control87(0, 0);
537         _control87(_PC_64, _MCW_PC); // 0,0x30000
538 #endif
539         /* value used for comparison in solver for cross-over point */
540         long double target = ((long double)chunk * 2 + 1) * nproc / tc;
541 
542         /* crossover point--chunk indexes equal to or greater than
543            this point switch to dynamic-style scheduling */
544         UT cross;
545 
546         /* commonly used term: (2 nproc - 1)/(2 nproc) */
547         x = (long double)1.0 - (long double)0.5 / nproc;
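        /* Informal sketch of the math: each guided chunk takes roughly
           remaining/(2*nproc) iterations, so about tc * x^i iterations remain
           after i chunks.  The solver below finds the smallest 'cross' with
           x^cross <= target, i.e. the first chunk index whose guided size
           would shrink to about 'chunk' or less; from that index on,
           dynamic-style chunks of size 'chunk' are used. */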
548 
549 #ifdef KMP_DEBUG
550         { // test natural alignment
551           struct _test_a {
552             char a;
553             union {
554               char b;
555               DBL d;
556             };
557           } t;
558           ptrdiff_t natural_alignment =
559               (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
560           //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
561           // long)natural_alignment );
562           KMP_DEBUG_ASSERT(
563               (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
564         }
565 #endif // KMP_DEBUG
566 
567         /* save the term in thread private dispatch structure */
568         *(DBL *)&pr->u.p.parm3 = x;
569 
570         /* solve for the crossover point to the nearest integer i for which C_i
571            <= chunk */
572         {
573           UT left, right, mid;
574           long double p;
575 
576           /* estimate initial upper and lower bound */
577 
578           /* doesn't matter what value right is as long as it is positive, but
579              it affects performance of the solver */
580           right = 229;
581           p = __kmp_pow<UT>(x, right);
582           if (p > target) {
583             do {
584               p *= p;
585               right <<= 1;
586             } while (p > target && right < (1 << 27));
587             /* lower bound is previous (failed) estimate of upper bound */
588             left = right >> 1;
589           } else {
590             left = 0;
591           }
592 
593           /* bisection root-finding method */
594           while (left + 1 < right) {
595             mid = (left + right) / 2;
596             if (__kmp_pow<UT>(x, mid) > target) {
597               left = mid;
598             } else {
599               right = mid;
600             }
601           } // while
602           cross = right;
603         }
604         /* assert sanity of computed crossover point */
605         KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
606                    __kmp_pow<UT>(x, cross) <= target);
607 
608         /* save the crossover point in thread private dispatch structure */
609         pr->u.p.parm2 = cross;
610 
611 // C75803
612 #if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
613 #define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
614 #else
615 #define GUIDED_ANALYTICAL_WORKAROUND (x)
616 #endif
617         /* dynamic-style scheduling offset */
618         pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
619                                  tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
620                         cross * chunk;
621 #if KMP_USE_X87CONTROL
622         // restore FPCW
623         _control87(oldFpcw, _MCW_PC);
624 #endif
625       } // if
626     } else {
627       KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d falling-through to "
628                      "kmp_sch_static_greedy\n",
629                      gtid));
630       schedule = kmp_sch_static_greedy;
631       /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
632       pr->u.p.parm1 = tc;
633     } // if
634   } // case
635   break;
636   case kmp_sch_static_greedy:
637     KD_TRACE(
638         100,
639         ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_static_greedy case\n",
640          gtid));
641     pr->u.p.parm1 = (nproc > 1) ? (tc + nproc - 1) / nproc : tc;
642     break;
643   case kmp_sch_static_chunked:
644   case kmp_sch_dynamic_chunked:
645     if (pr->u.p.parm1 <= 0) {
646       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
647     }
648     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
649                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
650                    gtid));
651     break;
652   case kmp_sch_trapezoidal: {
653     /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
654 
655     T parm1, parm2, parm3, parm4;
656     KD_TRACE(100,
657              ("__kmp_dispatch_init_algorithm: T#%d kmp_sch_trapezoidal case\n",
658               gtid));
659 
660     parm1 = chunk;
661 
662     /* F : size of the first cycle */
663     parm2 = (tc / (2 * nproc));
664 
665     if (parm2 < 1) {
666       parm2 = 1;
667     }
668 
669     /* L : size of the last cycle.  Make sure the last cycle is not larger
670        than the first cycle. */
671     if (parm1 < 1) {
672       parm1 = 1;
673     } else if (parm1 > parm2) {
674       parm1 = parm2;
675     }
676 
677     /* N : number of cycles */
678     parm3 = (parm2 + parm1);
679     parm3 = (2 * tc + parm3 - 1) / parm3;
680 
681     if (parm3 < 2) {
682       parm3 = 2;
683     }
684 
685     /* sigma : decreasing incr of the trapezoid */
686     parm4 = (parm3 - 1);
687     parm4 = (parm2 - parm1) / parm4;
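    /* Worked example (illustrative): tc=1000, nproc=4, chunk=1 gives
       parm1=1 (minimum chunk), parm2=125 (first chunk), parm3=16 cycles,
       parm4=(125-1)/15=8, i.e. chunk sizes shrink 125, 117, 109, ... with
       parm1 as the lower bound. */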
688 
689     // pointless check, because parm4 >= 0 always
690     // if ( parm4 < 0 ) {
691     //    parm4 = 0;
692     //}
693 
694     pr->u.p.parm1 = parm1;
695     pr->u.p.parm2 = parm2;
696     pr->u.p.parm3 = parm3;
697     pr->u.p.parm4 = parm4;
698   } // case
699   break;
700 
701   default: {
702     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
703                 KMP_HNT(GetNewerLibrary), // Hint
704                 __kmp_msg_null // Variadic argument list terminator
705                 );
706   } break;
707   } // switch
708   pr->schedule = schedule;
709 }
710 
711 #if KMP_USE_HIER_SCHED
712 template <typename T>
713 inline void __kmp_dispatch_init_hier_runtime(ident_t *loc, T lb, T ub,
714                                              typename traits_t<T>::signed_t st);
715 template <>
716 inline void
717 __kmp_dispatch_init_hier_runtime<kmp_int32>(ident_t *loc, kmp_int32 lb,
718                                             kmp_int32 ub, kmp_int32 st) {
719   __kmp_dispatch_init_hierarchy<kmp_int32>(
720       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
721       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
722 }
723 template <>
724 inline void
725 __kmp_dispatch_init_hier_runtime<kmp_uint32>(ident_t *loc, kmp_uint32 lb,
726                                              kmp_uint32 ub, kmp_int32 st) {
727   __kmp_dispatch_init_hierarchy<kmp_uint32>(
728       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
729       __kmp_hier_scheds.scheds, __kmp_hier_scheds.small_chunks, lb, ub, st);
730 }
731 template <>
732 inline void
733 __kmp_dispatch_init_hier_runtime<kmp_int64>(ident_t *loc, kmp_int64 lb,
734                                             kmp_int64 ub, kmp_int64 st) {
735   __kmp_dispatch_init_hierarchy<kmp_int64>(
736       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
737       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
738 }
739 template <>
740 inline void
741 __kmp_dispatch_init_hier_runtime<kmp_uint64>(ident_t *loc, kmp_uint64 lb,
742                                              kmp_uint64 ub, kmp_int64 st) {
743   __kmp_dispatch_init_hierarchy<kmp_uint64>(
744       loc, __kmp_hier_scheds.size, __kmp_hier_scheds.layers,
745       __kmp_hier_scheds.scheds, __kmp_hier_scheds.large_chunks, lb, ub, st);
746 }
747 
748 // free all the hierarchy scheduling memory associated with the team
749 void __kmp_dispatch_free_hierarchies(kmp_team_t *team) {
750   int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
751   for (int i = 0; i < num_disp_buff; ++i) {
752     // type does not matter here so use kmp_int32
753     auto sh =
754         reinterpret_cast<dispatch_shared_info_template<kmp_int32> volatile *>(
755             &team->t.t_disp_buffer[i]);
756     if (sh->hier) {
757       sh->hier->deallocate();
758       __kmp_free(sh->hier);
759     }
760   }
761 }
762 #endif
763 
764 // UT - unsigned flavor of T, ST - signed flavor of T,
765 // DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
766 template <typename T>
767 static void
768 __kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
769                     T ub, typename traits_t<T>::signed_t st,
770                     typename traits_t<T>::signed_t chunk, int push_ws) {
771   typedef typename traits_t<T>::unsigned_t UT;
772 
773   int active;
774   kmp_info_t *th;
775   kmp_team_t *team;
776   kmp_uint32 my_buffer_index;
777   dispatch_private_info_template<T> *pr;
778   dispatch_shared_info_template<T> volatile *sh;
779 
780   KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
781                    sizeof(dispatch_private_info));
782   KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
783                    sizeof(dispatch_shared_info));
784   __kmp_assert_valid_gtid(gtid);
785 
786   if (!TCR_4(__kmp_init_parallel))
787     __kmp_parallel_initialize();
788 
789   __kmp_resume_if_soft_paused();
790 
791 #if INCLUDE_SSC_MARKS
792   SSC_MARK_DISPATCH_INIT();
793 #endif
794 #ifdef KMP_DEBUG
795   typedef typename traits_t<T>::signed_t ST;
796   {
797     char *buff;
798     // create format specifiers before the debug output
799     buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
800                             "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
801                             traits_t<ST>::spec, traits_t<T>::spec,
802                             traits_t<T>::spec, traits_t<ST>::spec);
803     KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
804     __kmp_str_free(&buff);
805   }
806 #endif
807   /* setup data */
808   th = __kmp_threads[gtid];
809   team = th->th.th_team;
810   active = !team->t.t_serialized;
811   th->th.th_ident = loc;
812 
813   // Any half-decent optimizer will remove this test when the blocks are empty
814   // since the macros expand to nothing
815   // when statistics are disabled.
816   if (schedule == __kmp_static) {
817     KMP_COUNT_BLOCK(OMP_LOOP_STATIC);
818   } else {
819     KMP_COUNT_BLOCK(OMP_LOOP_DYNAMIC);
820   }
821 
822 #if KMP_USE_HIER_SCHED
823   // Initialize the scheduling hierarchy if requested in OMP_SCHEDULE envirable
824   // Hierarchical scheduling does not work with ordered, so if ordered is
825   // detected, then revert back to threaded scheduling.
826   bool ordered;
827   enum sched_type my_sched = schedule;
828   my_buffer_index = th->th.th_dispatch->th_disp_index;
829   pr = reinterpret_cast<dispatch_private_info_template<T> *>(
830       &th->th.th_dispatch
831            ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
832   my_sched = SCHEDULE_WITHOUT_MODIFIERS(my_sched);
833   if ((my_sched >= kmp_nm_lower) && (my_sched < kmp_nm_upper))
834     my_sched =
835         (enum sched_type)(((int)my_sched) - (kmp_nm_lower - kmp_sch_lower));
836   ordered = (kmp_ord_lower & my_sched);
837   if (pr->flags.use_hier) {
838     if (ordered) {
839       KD_TRACE(100, ("__kmp_dispatch_init: T#%d ordered loop detected.  "
840                      "Disabling hierarchical scheduling.\n",
841                      gtid));
842       pr->flags.use_hier = FALSE;
843     }
844   }
845   if (schedule == kmp_sch_runtime && __kmp_hier_scheds.size > 0) {
846     // Don't use hierarchical for ordered parallel loops and don't
847     // use the runtime hierarchy if one was specified in the program
848     if (!ordered && !pr->flags.use_hier)
849       __kmp_dispatch_init_hier_runtime<T>(loc, lb, ub, st);
850   }
851 #endif // KMP_USE_HIER_SCHED
852 
853 #if USE_ITT_BUILD
854   kmp_uint64 cur_chunk = chunk;
855   int itt_need_metadata_reporting =
856       __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
857       KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
858       team->t.t_active_level == 1;
859 #endif
860   if (!active) {
861     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
862         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
863   } else {
864     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
865                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
866 
867     my_buffer_index = th->th.th_dispatch->th_disp_index++;
868 
869     /* What happens when number of threads changes, need to resize buffer? */
870     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
871         &th->th.th_dispatch
872              ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
873     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
874         &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
875     KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid,
876                   my_buffer_index));
877   }
878 
879   __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st,
880 #if USE_ITT_BUILD
881                                 &cur_chunk,
882 #endif
883                                 chunk, (T)th->th.th_team_nproc,
884                                 (T)th->th.th_info.ds.ds_tid);
885   if (active) {
886     if (pr->flags.ordered == 0) {
887       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
888       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
889     } else {
890       th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
891       th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
892     }
893   }
894 
895   if (active) {
896     /* This buffer becomes free to use once its shared index
897      * (sh->buffer_index) reaches my_buffer_index */
898 
899     KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
900                    "sh->buffer_index:%d\n",
901                    gtid, my_buffer_index, sh->buffer_index));
902     __kmp_wait<kmp_uint32>(&sh->buffer_index, my_buffer_index,
903                            __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
904     // Note: KMP_WAIT() cannot be used there: buffer index and
905     // my_buffer_index are *always* 32-bit integers.
906     KMP_MB(); /* is this necessary? */
907     KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
908                    "sh->buffer_index:%d\n",
909                    gtid, my_buffer_index, sh->buffer_index));
910 
911     th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
912     th->th.th_dispatch->th_dispatch_sh_current =
913         CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
914 #if USE_ITT_BUILD
915     if (pr->flags.ordered) {
916       __kmp_itt_ordered_init(gtid);
917     }
918     // Report loop metadata
919     if (itt_need_metadata_reporting) {
920       // Only report metadata by master of active team at level 1
921       kmp_uint64 schedtype = 0;
922       switch (schedule) {
923       case kmp_sch_static_chunked:
924       case kmp_sch_static_balanced: // Chunk is calculated in the switch above
925         break;
926       case kmp_sch_static_greedy:
927         cur_chunk = pr->u.p.parm1;
928         break;
929       case kmp_sch_dynamic_chunked:
930         schedtype = 1;
931         break;
932       case kmp_sch_guided_iterative_chunked:
933       case kmp_sch_guided_analytical_chunked:
934       case kmp_sch_guided_simd:
935         schedtype = 2;
936         break;
937       default:
938         // Should we put this case under "static"?
939         // case kmp_sch_static_steal:
940         schedtype = 3;
941         break;
942       }
943       __kmp_itt_metadata_loop(loc, schedtype, pr->u.p.tc, cur_chunk);
944     }
945 #if KMP_USE_HIER_SCHED
946     if (pr->flags.use_hier) {
947       pr->u.p.count = 0;
948       pr->u.p.ub = pr->u.p.lb = pr->u.p.st = pr->u.p.tc = 0;
949     }
950 #endif // KMP_USE_HIER_SCHED
951 #endif /* USE_ITT_BUILD */
952   }
953 
954 #ifdef KMP_DEBUG
955   {
956     char *buff;
957     // create format specifiers before the debug output
958     buff = __kmp_str_format(
959         "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
960         "lb:%%%s ub:%%%s"
961         " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
962         " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
963         traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
964         traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
965         traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
966         traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
967     KD_TRACE(10, (buff, gtid, pr->schedule, pr->flags.ordered, pr->u.p.lb,
968                   pr->u.p.ub, pr->u.p.st, pr->u.p.tc, pr->u.p.count,
969                   pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
970                   pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4));
971     __kmp_str_free(&buff);
972   }
973 #endif
974 #if (KMP_STATIC_STEAL_ENABLED)
975   // After a loop with some other schedule kind, it cannot be guaranteed that
976   // all the parm3 variables contain the same value. Even if they did, a bad
977   // case would remain (e.g. toggling between 0 and 1 rather than using a
978   // program-lifetime increment), so a dedicated variable is required:
979   // 'static_steal_counter' is used.
980   if (pr->schedule == kmp_sch_static_steal) {
981     // Other threads will inspect this variable when searching for a victim.
982     // It also serves as a flag that, from this point on, other threads may
983     // steal from this thread.
984     volatile T *p = &pr->u.p.static_steal_counter;
985     *p = *p + 1;
986   }
987 #endif // ( KMP_STATIC_STEAL_ENABLED )
988 
989 #if OMPT_SUPPORT && OMPT_OPTIONAL
990   if (ompt_enabled.ompt_callback_work) {
991     ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
992     ompt_task_info_t *task_info = __ompt_get_task_info_object(0);
993     ompt_callbacks.ompt_callback(ompt_callback_work)(
994         ompt_work_loop, ompt_scope_begin, &(team_info->parallel_data),
995         &(task_info->task_data), pr->u.p.tc, OMPT_LOAD_RETURN_ADDRESS(gtid));
996   }
997 #endif
998   KMP_PUSH_PARTITIONED_TIMER(OMP_loop_dynamic);
999 }
1000 
1001 /* For ordered loops, either __kmp_dispatch_finish() should be called after
1002  * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1003  * every chunk of iterations.  If the ordered section(s) were not executed
1004  * for this iteration (or every iteration in this chunk), we need to set the
1005  * ordered iteration counters so that the next thread can proceed. */
1006 template <typename UT>
1007 static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
1008   typedef typename traits_t<UT>::signed_t ST;
1009   __kmp_assert_valid_gtid(gtid);
1010   kmp_info_t *th = __kmp_threads[gtid];
1011 
1012   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
1013   if (!th->th.th_team->t.t_serialized) {
1014 
1015     dispatch_private_info_template<UT> *pr =
1016         reinterpret_cast<dispatch_private_info_template<UT> *>(
1017             th->th.th_dispatch->th_dispatch_pr_current);
1018     dispatch_shared_info_template<UT> volatile *sh =
1019         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1020             th->th.th_dispatch->th_dispatch_sh_current);
1021     KMP_DEBUG_ASSERT(pr);
1022     KMP_DEBUG_ASSERT(sh);
1023     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1024                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1025 
1026     if (pr->ordered_bumped) {
1027       KD_TRACE(
1028           1000,
1029           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1030            gtid));
1031       pr->ordered_bumped = 0;
1032     } else {
1033       UT lower = pr->u.p.ordered_lower;
1034 
1035 #ifdef KMP_DEBUG
1036       {
1037         char *buff;
1038         // create format specifiers before the debug output
1039         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
1040                                 "ordered_iteration:%%%s lower:%%%s\n",
1041                                 traits_t<UT>::spec, traits_t<UT>::spec);
1042         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1043         __kmp_str_free(&buff);
1044       }
1045 #endif
1046 
1047       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1048                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1049       KMP_MB(); /* is this necessary? */
1050 #ifdef KMP_DEBUG
1051       {
1052         char *buff;
1053         // create format specifiers before the debug output
1054         buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
1055                                 "ordered_iteration:%%%s lower:%%%s\n",
1056                                 traits_t<UT>::spec, traits_t<UT>::spec);
1057         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
1058         __kmp_str_free(&buff);
1059       }
1060 #endif
1061 
1062       test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
1063     } // if
1064   } // if
1065   KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
1066 }
1067 
1068 #ifdef KMP_GOMP_COMPAT
1069 
1070 template <typename UT>
1071 static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
1072   typedef typename traits_t<UT>::signed_t ST;
1073   __kmp_assert_valid_gtid(gtid);
1074   kmp_info_t *th = __kmp_threads[gtid];
1075 
1076   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
1077   if (!th->th.th_team->t.t_serialized) {
1078     //        int cid;
1079     dispatch_private_info_template<UT> *pr =
1080         reinterpret_cast<dispatch_private_info_template<UT> *>(
1081             th->th.th_dispatch->th_dispatch_pr_current);
1082     dispatch_shared_info_template<UT> volatile *sh =
1083         reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
1084             th->th.th_dispatch->th_dispatch_sh_current);
1085     KMP_DEBUG_ASSERT(pr);
1086     KMP_DEBUG_ASSERT(sh);
1087     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1088                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1089 
1090     //        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1091     UT lower = pr->u.p.ordered_lower;
1092     UT upper = pr->u.p.ordered_upper;
1093     UT inc = upper - lower + 1;
1094 
1095     if (pr->ordered_bumped == inc) {
1096       KD_TRACE(
1097           1000,
1098           ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1099            gtid));
1100       pr->ordered_bumped = 0;
1101     } else {
1102       inc -= pr->ordered_bumped;
1103 
1104 #ifdef KMP_DEBUG
1105       {
1106         char *buff;
1107         // create format specifiers before the debug output
1108         buff = __kmp_str_format(
1109             "__kmp_dispatch_finish_chunk: T#%%d before wait: "
1110             "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1111             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
1112         KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
1113         __kmp_str_free(&buff);
1114       }
1115 #endif
1116 
1117       __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
1118                      __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
1119 
1120       KMP_MB(); /* is this necessary? */
1121       KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
1122                       "ordered_bumped to zero\n",
1123                       gtid));
1124       pr->ordered_bumped = 0;
1125 //!!!!! TODO check if the inc should be unsigned, or signed???
1126 #ifdef KMP_DEBUG
1127       {
1128         char *buff;
1129         // create format specifiers before the debug output
1130         buff = __kmp_str_format(
1131             "__kmp_dispatch_finish_chunk: T#%%d after wait: "
1132             "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1133             traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
1134             traits_t<UT>::spec);
1135         KD_TRACE(1000,
1136                  (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
1137         __kmp_str_free(&buff);
1138       }
1139 #endif
1140 
1141       test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
1142     }
1143     //        }
1144   }
1145   KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
1146 }
1147 
1148 #endif /* KMP_GOMP_COMPAT */
1149 
1150 template <typename T>
1151 int __kmp_dispatch_next_algorithm(int gtid,
1152                                   dispatch_private_info_template<T> *pr,
1153                                   dispatch_shared_info_template<T> volatile *sh,
1154                                   kmp_int32 *p_last, T *p_lb, T *p_ub,
1155                                   typename traits_t<T>::signed_t *p_st, T nproc,
1156                                   T tid) {
1157   typedef typename traits_t<T>::unsigned_t UT;
1158   typedef typename traits_t<T>::signed_t ST;
1159   typedef typename traits_t<T>::floating_t DBL;
1160   int status = 0;
1161   kmp_int32 last = 0;
1162   T start;
1163   ST incr;
1164   UT limit, trip, init;
1165   kmp_info_t *th = __kmp_threads[gtid];
1166   kmp_team_t *team = th->th.th_team;
1167 
1168   KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1169                    &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
1170   KMP_DEBUG_ASSERT(pr);
1171   KMP_DEBUG_ASSERT(sh);
1172   KMP_DEBUG_ASSERT(tid >= 0 && tid < nproc);
1173 #ifdef KMP_DEBUG
1174   {
1175     char *buff;
1176     // create format specifiers before the debug output
1177     buff =
1178         __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d called pr:%%p "
1179                          "sh:%%p nproc:%%%s tid:%%%s\n",
1180                          traits_t<T>::spec, traits_t<T>::spec);
1181     KD_TRACE(10, (buff, gtid, pr, sh, nproc, tid));
1182     __kmp_str_free(&buff);
1183   }
1184 #endif
1185 
1186   // zero trip count
1187   if (pr->u.p.tc == 0) {
1188     KD_TRACE(10,
1189              ("__kmp_dispatch_next_algorithm: T#%d early exit trip count is "
1190               "zero status:%d\n",
1191               gtid, status));
1192     return 0;
1193   }
1194 
1195   switch (pr->schedule) {
1196 #if (KMP_STATIC_STEAL_ENABLED)
1197   case kmp_sch_static_steal: {
1198     T chunk = pr->u.p.parm1;
1199 
1200     KD_TRACE(100,
1201              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n",
1202               gtid));
1203 
1204     trip = pr->u.p.tc - 1;
1205 
1206     if (traits_t<T>::type_size > 4) {
1207       // use lock for 8-byte and CAS for 4-byte induction
1208       // variable. TODO (optional): check and use 16-byte CAS
1209       kmp_lock_t *lck = pr->u.p.th_steal_lock;
1210       KMP_DEBUG_ASSERT(lck != NULL);
1211       if (pr->u.p.count < (UT)pr->u.p.ub) {
1212         __kmp_acquire_lock(lck, gtid);
1213         // try to get own chunk of iterations
1214         init = (pr->u.p.count)++;
1215         status = (init < (UT)pr->u.p.ub);
1216         __kmp_release_lock(lck, gtid);
1217       } else {
1218         status = 0; // no own chunks
1219       }
1220       if (!status) { // try to steal
1221         kmp_info_t **other_threads = team->t.t_threads;
1222         int while_limit = pr->u.p.parm3;
1223         int while_index = 0;
1224         T id = pr->u.p.static_steal_counter; // loop id
1225         int idx = (th->th.th_dispatch->th_disp_index - 1) %
1226                   __kmp_dispatch_num_buffers; // current loop index
1227         // note: victim thread can potentially execute another loop
1228         // TODO: algorithm of searching for a victim
1229         // should be cleaned up and measured
1230         while ((!status) && (while_limit != ++while_index)) {
1231           dispatch_private_info_template<T> *victim;
1232           T remaining;
1233           T victimIdx = pr->u.p.parm4;
1234           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1235           victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1236               &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1237           KMP_DEBUG_ASSERT(victim);
1238           while ((victim == pr || id != victim->u.p.static_steal_counter) &&
1239                  oldVictimIdx != victimIdx) {
1240             victimIdx = (victimIdx + 1) % nproc;
1241             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1242                 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1243             KMP_DEBUG_ASSERT(victim);
1244           }
1245           if (victim == pr || id != victim->u.p.static_steal_counter) {
1246             continue; // try once more (nproc attempts in total)
1247             // no victim is ready yet to participate in stealing
1248             // because no victim passed kmp_init_dispatch yet
1249           }
1250           if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1251             pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1252             continue; // not enough chunks to steal, goto next victim
1253           }
1254 
1255           lck = victim->u.p.th_steal_lock;
1256           KMP_ASSERT(lck != NULL);
1257           __kmp_acquire_lock(lck, gtid);
1258           limit = victim->u.p.ub; // keep initial ub
1259           if (victim->u.p.count >= limit ||
1260               (remaining = limit - victim->u.p.count) < 2) {
1261             __kmp_release_lock(lck, gtid);
1262             pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1263             continue; // not enough chunks to steal
1264           }
1265           // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or
1266           // by 1
1267           if (remaining > 3) {
1268             // steal 1/4 of remaining
1269             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2);
1270             init = (victim->u.p.ub -= (remaining >> 2));
1271           } else {
1272             // steal 1 chunk of 2 or 3 remaining
1273             KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1);
1274             init = (victim->u.p.ub -= 1);
1275           }
1276           __kmp_release_lock(lck, gtid);
1277 
1278           KMP_DEBUG_ASSERT(init + 1 <= limit);
1279           pr->u.p.parm4 = victimIdx; // remember victim to steal from
1280           status = 1;
1281           while_index = 0;
1282           // now update own count and ub with the stolen range, excluding the init chunk returned by this call
1283           __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid);
1284           pr->u.p.count = init + 1;
1285           pr->u.p.ub = limit;
1286           __kmp_release_lock(pr->u.p.th_steal_lock, gtid);
1287         } // while (search for victim)
1288       } // if (try to find victim and steal)
1289     } else {
1290       // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1291       typedef union {
1292         struct {
1293           UT count;
1294           T ub;
1295         } p;
1296         kmp_int64 b;
1297       } union_i4;
1298       // All operations on 'count' or 'ub' must be combined atomically
1299       // together.
1300       {
1301         union_i4 vold, vnew;
1302         vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1303         vnew = vold;
1304         vnew.p.count++;
1305         while (!KMP_COMPARE_AND_STORE_ACQ64(
1306             (volatile kmp_int64 *)&pr->u.p.count,
1307             *VOLATILE_CAST(kmp_int64 *) & vold.b,
1308             *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1309           KMP_CPU_PAUSE();
1310           vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1311           vnew = vold;
1312           vnew.p.count++;
1313         }
1314         vnew = vold;
1315         init = vnew.p.count;
1316         status = (init < (UT)vnew.p.ub);
1317       }
1318 
1319       if (!status) {
1320         kmp_info_t **other_threads = team->t.t_threads;
1321         int while_limit = pr->u.p.parm3;
1322         int while_index = 0;
1323         T id = pr->u.p.static_steal_counter; // loop id
1324         int idx = (th->th.th_dispatch->th_disp_index - 1) %
1325                   __kmp_dispatch_num_buffers; // current loop index
1326         // note: victim thread can potentially execute another loop
1327         // TODO: algorithm of searching for a victim
1328         // should be cleaned up and measured
1329         while ((!status) && (while_limit != ++while_index)) {
1330           dispatch_private_info_template<T> *victim;
1331           union_i4 vold, vnew;
1332           kmp_int32 remaining;
1333           T victimIdx = pr->u.p.parm4;
1334           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1335           victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1336               &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1337           KMP_DEBUG_ASSERT(victim);
1338           while ((victim == pr || id != victim->u.p.static_steal_counter) &&
1339                  oldVictimIdx != victimIdx) {
1340             victimIdx = (victimIdx + 1) % nproc;
1341             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1342                 &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
1343             KMP_DEBUG_ASSERT(victim);
1344           }
1345           if (victim == pr || id != victim->u.p.static_steal_counter) {
1346             continue; // try once more (nproc attempts in total)
1347             // no victim is ready yet to participate in stealing
1348             // because no victim passed kmp_init_dispatch yet
1349           }
1350           pr->u.p.parm4 = victimIdx; // new victim found
1351           while (1) { // CAS loop if victim has enough chunks to steal
1352             vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1353             vnew = vold;
1354 
1355             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1356             if (vnew.p.count >= (UT)vnew.p.ub ||
1357                 (remaining = vnew.p.ub - vnew.p.count) < 2) {
1358               pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1359               break; // not enough chunks to steal, goto next victim
1360             }
1361             if (remaining > 3) {
1362               vnew.p.ub -= (remaining >> 2); // try to steal 1/4 of remaining
1363             } else {
1364               vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1365             }
1366             KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1367             // TODO: Should this be acquire or release?
1368             if (KMP_COMPARE_AND_STORE_ACQ64(
1369                     (volatile kmp_int64 *)&victim->u.p.count,
1370                     *VOLATILE_CAST(kmp_int64 *) & vold.b,
1371                     *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1372               // stealing succeeded
1373               KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen,
1374                                         vold.p.ub - vnew.p.ub);
1375               status = 1;
1376               while_index = 0;
1377               // now update own count and ub
1378               init = vnew.p.ub;
1379               vold.p.count = init + 1;
1380 #if KMP_ARCH_X86
1381               KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vold.b);
1382 #else
1383               *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1384 #endif
1385               break;
1386             } // if (check CAS result)
1387             KMP_CPU_PAUSE(); // CAS failed, retry
1388           } // while (try to steal from particular victim)
1389         } // while (search for victim)
1390       } // if (try to find victim and steal)
1391     } // if (4-byte induction variable)
1392     if (!status) {
1393       *p_lb = 0;
1394       *p_ub = 0;
1395       if (p_st != NULL)
1396         *p_st = 0;
1397     } else {
1398       start = pr->u.p.parm2;
1399       init *= chunk;
1400       limit = chunk + init - 1;
1401       incr = pr->u.p.st;
1402       KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1);
1403 
1404       KMP_DEBUG_ASSERT(init <= trip);
1405       if ((last = (limit >= trip)) != 0)
1406         limit = trip;
1407       if (p_st != NULL)
1408         *p_st = incr;
1409 
1410       if (incr == 1) {
1411         *p_lb = start + init;
1412         *p_ub = start + limit;
1413       } else {
1414         *p_lb = start + init * incr;
1415         *p_ub = start + limit * incr;
1416       }
1417 
1418       if (pr->flags.ordered) {
1419         pr->u.p.ordered_lower = init;
1420         pr->u.p.ordered_upper = limit;
1421       } // if
1422     } // if
1423     break;
1424   } // case
1425 #endif // ( KMP_STATIC_STEAL_ENABLED )
1426   case kmp_sch_static_balanced: {
1427     KD_TRACE(
1428         10,
1429         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_balanced case\n",
1430          gtid));
1431     /* check if thread has any iteration to do */
1432     if ((status = !pr->u.p.count) != 0) {
1433       pr->u.p.count = 1;
1434       *p_lb = pr->u.p.lb;
1435       *p_ub = pr->u.p.ub;
1436       last = pr->u.p.parm1;
1437       if (p_st != NULL)
1438         *p_st = pr->u.p.st;
1439     } else { /* no iterations to do */
1440       pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1441     }
1442   } // case
1443   break;
1444   case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1445                                  merged here */
1446   case kmp_sch_static_chunked: {
1447     T parm1;
1448 
1449     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1450                    "kmp_sch_static_[greedy|chunked] case\n",
1451                    gtid));
1452     parm1 = pr->u.p.parm1;
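    // Chunks of parm1 iterations are assigned round-robin by thread id:
    // thread tid takes chunk numbers tid, tid + nproc, tid + 2*nproc, ...
    // (pr->u.p.count advances by nproc per call).  For example, with
    // kmp_sch_static_chunked, parm1 = 4 and nproc = 2, thread 0 gets
    // iterations 0-3, 8-11, ... and thread 1 gets 4-7, 12-15, ...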
1453 
1454     trip = pr->u.p.tc - 1;
1455     init = parm1 * (pr->u.p.count + tid);
1456 
1457     if ((status = (init <= trip)) != 0) {
1458       start = pr->u.p.lb;
1459       incr = pr->u.p.st;
1460       limit = parm1 + init - 1;
1461 
1462       if ((last = (limit >= trip)) != 0)
1463         limit = trip;
1464 
1465       if (p_st != NULL)
1466         *p_st = incr;
1467 
1468       pr->u.p.count += nproc;
1469 
1470       if (incr == 1) {
1471         *p_lb = start + init;
1472         *p_ub = start + limit;
1473       } else {
1474         *p_lb = start + init * incr;
1475         *p_ub = start + limit * incr;
1476       }
1477 
1478       if (pr->flags.ordered) {
1479         pr->u.p.ordered_lower = init;
1480         pr->u.p.ordered_upper = limit;
1481       } // if
1482     } // if
1483   } // case
1484   break;
1485 
1486   case kmp_sch_dynamic_chunked: {
1487     T chunk = pr->u.p.parm1;
1488 
1489     KD_TRACE(
1490         100,
1491         ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_dynamic_chunked case\n",
1492          gtid));
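    // sh->u.s.iteration is a shared chunk counter: each atomic
    // fetch-and-increment claims the next chunk of `chunk` iterations,
    // so chunks are handed out first-come, first-served.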
1493 
1494     init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1495     trip = pr->u.p.tc - 1;
1496 
1497     if ((status = (init <= trip)) == 0) {
1498       *p_lb = 0;
1499       *p_ub = 0;
1500       if (p_st != NULL)
1501         *p_st = 0;
1502     } else {
1503       start = pr->u.p.lb;
1504       limit = chunk + init - 1;
1505       incr = pr->u.p.st;
1506 
1507       if ((last = (limit >= trip)) != 0)
1508         limit = trip;
1509 
1510       if (p_st != NULL)
1511         *p_st = incr;
1512 
1513       if (incr == 1) {
1514         *p_lb = start + init;
1515         *p_ub = start + limit;
1516       } else {
1517         *p_lb = start + init * incr;
1518         *p_ub = start + limit * incr;
1519       }
1520 
1521       if (pr->flags.ordered) {
1522         pr->u.p.ordered_lower = init;
1523         pr->u.p.ordered_upper = limit;
1524       } // if
1525     } // if
1526   } // case
1527   break;
1528 
1529   case kmp_sch_guided_iterative_chunked: {
1530     T chunkspec = pr->u.p.parm1;
1531     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_chunked "
1532                    "iterative case\n",
1533                    gtid));
1534     trip = pr->u.p.tc;
1535     // Start atomic part of calculations
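    // Guided scheme: while many iterations remain, a thread claims about
    // remaining/(K*nproc) iterations (the scale factor is cached as a
    // double in parm3, K = 2 by default) by CAS-ing the shared iteration
    // counter forward.  Once fewer than parm2 iterations remain, it falls
    // back to plain dynamic chunks of chunkspec iterations.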
1536     while (1) {
1537       ST remaining; // signed, because can be < 0
1538       init = sh->u.s.iteration; // shared value
1539       remaining = trip - init;
1540       if (remaining <= 0) { // AC: need to compare with 0 first
1541         // nothing to do, don't try atomic op
1542         status = 0;
1543         break;
1544       }
1545       if ((T)remaining <
1546           pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1547         // use dynamic-style schedule
1548         // atomically increment iterations, get old value
1549         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1550                                  (ST)chunkspec);
1551         remaining = trip - init;
1552         if (remaining <= 0) {
1553           status = 0; // all iterations got by other threads
1554         } else {
1555           // got some iterations to work on
1556           status = 1;
1557           if ((T)remaining > chunkspec) {
1558             limit = init + chunkspec - 1;
1559           } else {
1560             last = 1; // the last chunk
1561             limit = init + remaining - 1;
1562           } // if
1563         } // if
1564         break;
1565       } // if
1566       limit = init +
1567               (UT)(remaining * *(double *)&pr->u.p.parm3); // divide by K*nproc
1568       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1569                                (ST)init, (ST)limit)) {
1570         // CAS was successful, chunk obtained
1571         status = 1;
1572         --limit;
1573         break;
1574       } // if
1575     } // while
1576     if (status != 0) {
1577       start = pr->u.p.lb;
1578       incr = pr->u.p.st;
1579       if (p_st != NULL)
1580         *p_st = incr;
1581       *p_lb = start + init * incr;
1582       *p_ub = start + limit * incr;
1583       if (pr->flags.ordered) {
1584         pr->u.p.ordered_lower = init;
1585         pr->u.p.ordered_upper = limit;
1586       } // if
1587     } else {
1588       *p_lb = 0;
1589       *p_ub = 0;
1590       if (p_st != NULL)
1591         *p_st = 0;
1592     } // if
1593   } // case
1594   break;
1595 
1596   case kmp_sch_guided_simd: {
1597     // same as iterative but curr-chunk adjusted to be multiple of given
1598     // chunk
1599     T chunk = pr->u.p.parm1;
1600     KD_TRACE(100,
1601              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_guided_simd case\n",
1602               gtid));
1603     trip = pr->u.p.tc;
1604     // Start atomic part of calculations
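    // Same guided scheme as the iterative case above, except the claimed
    // span is rounded up to a multiple of `chunk` so that every dispatched
    // range stays aligned for SIMD execution.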
1605     while (1) {
1606       ST remaining; // signed, because can be < 0
1607       init = sh->u.s.iteration; // shared value
1608       remaining = trip - init;
1609       if (remaining <= 0) { // AC: need to compare with 0 first
1610         status = 0; // nothing to do, don't try atomic op
1611         break;
1612       }
1613       KMP_DEBUG_ASSERT(init % chunk == 0);
1614       // compare with K*nproc*(chunk+1), K=2 by default
1615       if ((T)remaining < pr->u.p.parm2) {
1616         // use dynamic-style schedule
1617         // atomically increment iterations, get old value
1618         init = test_then_add<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1619                                  (ST)chunk);
1620         remaining = trip - init;
1621         if (remaining <= 0) {
1622           status = 0; // all iterations got by other threads
1623         } else {
1624           // got some iterations to work on
1625           status = 1;
1626           if ((T)remaining > chunk) {
1627             limit = init + chunk - 1;
1628           } else {
1629             last = 1; // the last chunk
1630             limit = init + remaining - 1;
1631           } // if
1632         } // if
1633         break;
1634       } // if
1635       // divide by K*nproc
1636       UT span = remaining * (*(double *)&pr->u.p.parm3);
1637       UT rem = span % chunk;
1638       if (rem) // adjust so that span%chunk == 0
1639         span += chunk - rem;
1640       limit = init + span;
1641       if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
1642                                (ST)init, (ST)limit)) {
1643         // CAS was successful, chunk obtained
1644         status = 1;
1645         --limit;
1646         break;
1647       } // if
1648     } // while
1649     if (status != 0) {
1650       start = pr->u.p.lb;
1651       incr = pr->u.p.st;
1652       if (p_st != NULL)
1653         *p_st = incr;
1654       *p_lb = start + init * incr;
1655       *p_ub = start + limit * incr;
1656       if (pr->flags.ordered) {
1657         pr->u.p.ordered_lower = init;
1658         pr->u.p.ordered_upper = limit;
1659       } // if
1660     } else {
1661       *p_lb = 0;
1662       *p_ub = 0;
1663       if (p_st != NULL)
1664         *p_st = 0;
1665     } // if
1666   } // case
1667   break;
1668 
1669   case kmp_sch_guided_analytical_chunked: {
1670     T chunkspec = pr->u.p.parm1;
1671     UT chunkIdx;
1672 #if KMP_USE_X87CONTROL
1673     /* for storing original FPCW value for Windows* OS on
1674        IA-32 architecture 8-byte version */
1675     unsigned int oldFpcw;
1676     unsigned int fpcwSet = 0;
1677 #endif
1678     KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d "
1679                    "kmp_sch_guided_analytical_chunked case\n",
1680                    gtid));
1681 
1682     trip = pr->u.p.tc;
1683 
1684     KMP_DEBUG_ASSERT(nproc > 1);
1685     KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)nproc < trip);
1686 
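    // Analytical guided scheme: for chunk indices below the cross-over
    // point parm2, chunk boundaries come from the remaining-iterations
    // estimate in __kmp_dispatch_guided_remaining() (using the factor
    // cached in parm3); once chunkIdx reaches parm2, the code switches to
    // plain dynamic-style chunks of chunkspec iterations.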
1687     while (1) { /* this while loop is a safeguard against unexpected zero
1688                    chunk sizes */
1689       chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1690       if (chunkIdx >= (UT)pr->u.p.parm2) {
1691         --trip;
1692         /* use dynamic-style scheduling */
1693         init = chunkIdx * chunkspec + pr->u.p.count;
1694         /* need to verify init > 0 in case of overflow in the above
1695          * calculation */
1696         if ((status = (init > 0 && init <= trip)) != 0) {
1697           limit = init + chunkspec - 1;
1698 
1699           if ((last = (limit >= trip)) != 0)
1700             limit = trip;
1701         }
1702         break;
1703       } else {
1704 /* use exponential-style scheduling */
1705 /* The following check works around the lack of long double precision on
1706    Windows* OS.
1707    That lack can have the effect that init != 0 for chunkIdx == 0.
1708  */
1709 #if KMP_USE_X87CONTROL
1710         /* If we haven't already done so, save original
1711            FPCW and set precision to 64-bit, as Windows* OS
1712            on IA-32 architecture defaults to 53-bit */
1713         if (!fpcwSet) {
1714           oldFpcw = _control87(0, 0);
1715           _control87(_PC_64, _MCW_PC);
1716           fpcwSet = 0x30000;
1717         }
1718 #endif
1719         if (chunkIdx) {
1720           init = __kmp_dispatch_guided_remaining<T>(
1721               trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
1722           KMP_DEBUG_ASSERT(init);
1723           init = trip - init;
1724         } else
1725           init = 0;
1726         limit = trip - __kmp_dispatch_guided_remaining<T>(
1727                            trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
1728         KMP_ASSERT(init <= limit);
1729         if (init < limit) {
1730           KMP_DEBUG_ASSERT(limit <= trip);
1731           --limit;
1732           status = 1;
1733           break;
1734         } // if
1735       } // if
1736     } // while (1)
1737 #if KMP_USE_X87CONTROL
1738     /* restore FPCW if necessary
1739        AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1740     */
1741     if (fpcwSet && (oldFpcw & fpcwSet))
1742       _control87(oldFpcw, _MCW_PC);
1743 #endif
1744     if (status != 0) {
1745       start = pr->u.p.lb;
1746       incr = pr->u.p.st;
1747       if (p_st != NULL)
1748         *p_st = incr;
1749       *p_lb = start + init * incr;
1750       *p_ub = start + limit * incr;
1751       if (pr->flags.ordered) {
1752         pr->u.p.ordered_lower = init;
1753         pr->u.p.ordered_upper = limit;
1754       }
1755     } else {
1756       *p_lb = 0;
1757       *p_ub = 0;
1758       if (p_st != NULL)
1759         *p_st = 0;
1760     }
1761   } // case
1762   break;
1763 
1764   case kmp_sch_trapezoidal: {
1765     UT index;
1766     T parm2 = pr->u.p.parm2;
1767     T parm3 = pr->u.p.parm3;
1768     T parm4 = pr->u.p.parm4;
1769     KD_TRACE(100,
1770              ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_trapezoidal case\n",
1771               gtid));
1772 
1773     index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
1774 
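    // Trapezoid schedule: chunk i has size parm2 - i*parm4 (chunk sizes
    // shrink linearly over parm3 chunks), so the first iteration of the
    // atomically claimed chunk `index` is the partial sum
    //   index * (2*parm2 - (index - 1)*parm4) / 2
    // computed below.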
1775     init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
1776     trip = pr->u.p.tc - 1;
1777 
1778     if ((status = ((T)index < parm3 && init <= trip)) == 0) {
1779       *p_lb = 0;
1780       *p_ub = 0;
1781       if (p_st != NULL)
1782         *p_st = 0;
1783     } else {
1784       start = pr->u.p.lb;
1785       limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
1786       incr = pr->u.p.st;
1787 
1788       if ((last = (limit >= trip)) != 0)
1789         limit = trip;
1790 
1791       if (p_st != NULL)
1792         *p_st = incr;
1793 
1794       if (incr == 1) {
1795         *p_lb = start + init;
1796         *p_ub = start + limit;
1797       } else {
1798         *p_lb = start + init * incr;
1799         *p_ub = start + limit * incr;
1800       }
1801 
1802       if (pr->flags.ordered) {
1803         pr->u.p.ordered_lower = init;
1804         pr->u.p.ordered_upper = limit;
1805       } // if
1806     } // if
1807   } // case
1808   break;
1809   default: {
1810     status = 0; // to avoid complaints on uninitialized variable use
1811     __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
1812                 KMP_HNT(GetNewerLibrary), // Hint
1813                 __kmp_msg_null // Variadic argument list terminator
1814                 );
1815   } break;
1816   } // switch
1817   if (p_last)
1818     *p_last = last;
1819 #ifdef KMP_DEBUG
1820   if (pr->flags.ordered) {
1821     char *buff;
1822     // create format specifiers before the debug output
1823     buff = __kmp_str_format("__kmp_dispatch_next_algorithm: T#%%d "
1824                             "ordered_lower:%%%s ordered_upper:%%%s\n",
1825                             traits_t<UT>::spec, traits_t<UT>::spec);
1826     KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper));
1827     __kmp_str_free(&buff);
1828   }
1829   {
1830     char *buff;
1831     // create format specifiers before the debug output
1832     buff = __kmp_str_format(
1833         "__kmp_dispatch_next_algorithm: T#%%d exit status:%%d p_last:%%d "
1834         "p_lb:%%%s p_ub:%%%s p_st:%%%s\n",
1835         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1836     KD_TRACE(10, (buff, gtid, status, *p_last, *p_lb, *p_ub, *p_st));
1837     __kmp_str_free(&buff);
1838   }
1839 #endif
1840   return status;
1841 }
1842 
1843 /* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
1844    work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
1845    is not called. */
1846 #if OMPT_SUPPORT && OMPT_OPTIONAL
1847 #define OMPT_LOOP_END                                                          \
1848   if (status == 0) {                                                           \
1849     if (ompt_enabled.ompt_callback_work) {                                     \
1850       ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
1851       ompt_task_info_t *task_info = __ompt_get_task_info_object(0);            \
1852       ompt_callbacks.ompt_callback(ompt_callback_work)(                        \
1853           ompt_work_loop, ompt_scope_end, &(team_info->parallel_data),         \
1854           &(task_info->task_data), 0, codeptr);                                \
1855     }                                                                          \
1856   }
1857 // TODO: implement count
1858 #else
1859 #define OMPT_LOOP_END // no-op
1860 #endif
1861 
1862 #if KMP_STATS_ENABLED
1863 #define KMP_STATS_LOOP_END                                                     \
1864   {                                                                            \
1865     kmp_int64 u, l, t, i;                                                      \
1866     l = (kmp_int64)(*p_lb);                                                    \
1867     u = (kmp_int64)(*p_ub);                                                    \
1868     i = (kmp_int64)(pr->u.p.st);                                               \
1869     if (status == 0) {                                                         \
1870       t = 0;                                                                   \
1871       KMP_POP_PARTITIONED_TIMER();                                             \
1872     } else if (i == 1) {                                                       \
1873       if (u >= l)                                                              \
1874         t = u - l + 1;                                                         \
1875       else                                                                     \
1876         t = 0;                                                                 \
1877     } else if (i < 0) {                                                        \
1878       if (l >= u)                                                              \
1879         t = (l - u) / (-i) + 1;                                                \
1880       else                                                                     \
1881         t = 0;                                                                 \
1882     } else {                                                                   \
1883       if (u >= l)                                                              \
1884         t = (u - l) / i + 1;                                                   \
1885       else                                                                     \
1886         t = 0;                                                                 \
1887     }                                                                          \
1888     KMP_COUNT_VALUE(OMP_loop_dynamic_iterations, t);                           \
1889   }
1890 #else
1891 #define KMP_STATS_LOOP_END /* Nothing */
1892 #endif
1893 
1894 template <typename T>
1895 static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1896                                T *p_lb, T *p_ub,
1897                                typename traits_t<T>::signed_t *p_st
1898 #if OMPT_SUPPORT && OMPT_OPTIONAL
1899                                ,
1900                                void *codeptr
1901 #endif
1902                                ) {
1903 
1904   typedef typename traits_t<T>::unsigned_t UT;
1905   typedef typename traits_t<T>::signed_t ST;
1906   // This is potentially slightly misleading, schedule(runtime) will appear here
1907   // even if the actual runtime schedule is static. (Which points out a
1908   // disadvantage of schedule(runtime): even when static scheduling is used it
1909   // costs more than a compile time choice to use static scheduling would.)
1910   KMP_TIME_PARTITIONED_BLOCK(OMP_loop_dynamic_scheduling);
1911 
1912   int status;
1913   dispatch_private_info_template<T> *pr;
1914   __kmp_assert_valid_gtid(gtid);
1915   kmp_info_t *th = __kmp_threads[gtid];
1916   kmp_team_t *team = th->th.th_team;
1917 
1918   KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1919   KD_TRACE(
1920       1000,
1921       ("__kmp_dispatch_next: T#%d called p_lb:%p p_ub:%p p_st:%p p_last: %p\n",
1922        gtid, p_lb, p_ub, p_st, p_last));
1923 
1924   if (team->t.t_serialized) {
1925     /* NOTE: serialize this dispatch because we are not at the active level */
1926     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1927         th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1928     KMP_DEBUG_ASSERT(pr);
1929 
1930     if ((status = (pr->u.p.tc != 0)) == 0) {
1931       *p_lb = 0;
1932       *p_ub = 0;
1933       //            if ( p_last != NULL )
1934       //                *p_last = 0;
1935       if (p_st != NULL)
1936         *p_st = 0;
1937       if (__kmp_env_consistency_check) {
1938         if (pr->pushed_ws != ct_none) {
1939           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1940         }
1941       }
1942     } else if (pr->flags.nomerge) {
1943       kmp_int32 last;
1944       T start;
1945       UT limit, trip, init;
1946       ST incr;
1947       T chunk = pr->u.p.parm1;
1948 
1949       KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1950                      gtid));
1951 
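      // Serialized nomerge case: even with a single thread the loop is
      // still handed out one chunk per call, using the private count as
      // the chunk index; no shared dispatch buffer is involved here.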
1952       init = chunk * pr->u.p.count++;
1953       trip = pr->u.p.tc - 1;
1954 
1955       if ((status = (init <= trip)) == 0) {
1956         *p_lb = 0;
1957         *p_ub = 0;
1958         //                if ( p_last != NULL )
1959         //                    *p_last = 0;
1960         if (p_st != NULL)
1961           *p_st = 0;
1962         if (__kmp_env_consistency_check) {
1963           if (pr->pushed_ws != ct_none) {
1964             pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1965           }
1966         }
1967       } else {
1968         start = pr->u.p.lb;
1969         limit = chunk + init - 1;
1970         incr = pr->u.p.st;
1971 
1972         if ((last = (limit >= trip)) != 0) {
1973           limit = trip;
1974 #if KMP_OS_WINDOWS
1975           pr->u.p.last_upper = pr->u.p.ub;
1976 #endif /* KMP_OS_WINDOWS */
1977         }
1978         if (p_last != NULL)
1979           *p_last = last;
1980         if (p_st != NULL)
1981           *p_st = incr;
1982         if (incr == 1) {
1983           *p_lb = start + init;
1984           *p_ub = start + limit;
1985         } else {
1986           *p_lb = start + init * incr;
1987           *p_ub = start + limit * incr;
1988         }
1989 
1990         if (pr->flags.ordered) {
1991           pr->u.p.ordered_lower = init;
1992           pr->u.p.ordered_upper = limit;
1993 #ifdef KMP_DEBUG
1994           {
1995             char *buff;
1996             // create format specifiers before the debug output
1997             buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1998                                     "ordered_lower:%%%s ordered_upper:%%%s\n",
1999                                     traits_t<UT>::spec, traits_t<UT>::spec);
2000             KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2001                             pr->u.p.ordered_upper));
2002             __kmp_str_free(&buff);
2003           }
2004 #endif
2005         } // if
2006       } // if
2007     } else {
2008       pr->u.p.tc = 0;
2009       *p_lb = pr->u.p.lb;
2010       *p_ub = pr->u.p.ub;
2011 #if KMP_OS_WINDOWS
2012       pr->u.p.last_upper = *p_ub;
2013 #endif /* KMP_OS_WINDOWS */
2014       if (p_last != NULL)
2015         *p_last = TRUE;
2016       if (p_st != NULL)
2017         *p_st = pr->u.p.st;
2018     } // if
2019 #ifdef KMP_DEBUG
2020     {
2021       char *buff;
2022       // create format specifiers before the debug output
2023       buff = __kmp_str_format(
2024           "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
2025           "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
2026           traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2027       KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last,
2028                     (p_last ? *p_last : 0), status));
2029       __kmp_str_free(&buff);
2030     }
2031 #endif
2032 #if INCLUDE_SSC_MARKS
2033     SSC_MARK_DISPATCH_NEXT();
2034 #endif
2035     OMPT_LOOP_END;
2036     KMP_STATS_LOOP_END;
2037     return status;
2038   } else {
2039     kmp_int32 last = 0;
2040     dispatch_shared_info_template<T> volatile *sh;
2041 
2042     KMP_DEBUG_ASSERT(th->th.th_dispatch ==
2043                      &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
2044 
2045     pr = reinterpret_cast<dispatch_private_info_template<T> *>(
2046         th->th.th_dispatch->th_dispatch_pr_current);
2047     KMP_DEBUG_ASSERT(pr);
2048     sh = reinterpret_cast<dispatch_shared_info_template<T> volatile *>(
2049         th->th.th_dispatch->th_dispatch_sh_current);
2050     KMP_DEBUG_ASSERT(sh);
2051 
2052 #if KMP_USE_HIER_SCHED
2053     if (pr->flags.use_hier)
2054       status = sh->hier->next(loc, gtid, pr, &last, p_lb, p_ub, p_st);
2055     else
2056 #endif // KMP_USE_HIER_SCHED
2057       status = __kmp_dispatch_next_algorithm<T>(gtid, pr, sh, &last, p_lb, p_ub,
2058                                                 p_st, th->th.th_team_nproc,
2059                                                 th->th.th_info.ds.ds_tid);
2060     // status == 0: no more iterations to execute
2061     if (status == 0) {
2062       UT num_done;
2063 
2064       num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2065 #ifdef KMP_DEBUG
2066       {
2067         char *buff;
2068         // create format specifiers before the debug output
2069         buff = __kmp_str_format(
2070             "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2071             traits_t<UT>::spec);
2072         KD_TRACE(10, (buff, gtid, sh->u.s.num_done));
2073         __kmp_str_free(&buff);
2074       }
2075 #endif
2076 
2077 #if KMP_USE_HIER_SCHED
2078       pr->flags.use_hier = FALSE;
2079 #endif
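      // num_done counts threads that have exhausted this loop; the last
      // thread to finish resets the shared state and bumps buffer_index so
      // the dispatch buffer can be reused by a later loop.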
2080       if ((ST)num_done == th->th.th_team_nproc - 1) {
2081 #if (KMP_STATIC_STEAL_ENABLED)
2082         if (pr->schedule == kmp_sch_static_steal &&
2083             traits_t<T>::type_size > 4) {
2084           int i;
2085           int idx = (th->th.th_dispatch->th_disp_index - 1) %
2086                     __kmp_dispatch_num_buffers; // current loop index
2087           kmp_info_t **other_threads = team->t.t_threads;
2088           // loop complete, safe to destroy locks used for stealing
2089           for (i = 0; i < th->th.th_team_nproc; ++i) {
2090             dispatch_private_info_template<T> *buf =
2091                 reinterpret_cast<dispatch_private_info_template<T> *>(
2092                     &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]);
2093             kmp_lock_t *lck = buf->u.p.th_steal_lock;
2094             KMP_ASSERT(lck != NULL);
2095             __kmp_destroy_lock(lck);
2096             __kmp_free(lck);
2097             buf->u.p.th_steal_lock = NULL;
2098           }
2099         }
2100 #endif
2101         /* NOTE: release this buffer to be reused */
2102 
2103         KMP_MB(); /* Flush all pending memory write invalidates.  */
2104 
2105         sh->u.s.num_done = 0;
2106         sh->u.s.iteration = 0;
2107 
2108         /* TODO replace with general release procedure? */
2109         if (pr->flags.ordered) {
2110           sh->u.s.ordered_iteration = 0;
2111         }
2112 
2113         KMP_MB(); /* Flush all pending memory write invalidates.  */
2114 
2115         sh->buffer_index += __kmp_dispatch_num_buffers;
2116         KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2117                        gtid, sh->buffer_index));
2118 
2119         KMP_MB(); /* Flush all pending memory write invalidates.  */
2120 
2121       } // if
2122       if (__kmp_env_consistency_check) {
2123         if (pr->pushed_ws != ct_none) {
2124           pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2125         }
2126       }
2127 
2128       th->th.th_dispatch->th_deo_fcn = NULL;
2129       th->th.th_dispatch->th_dxo_fcn = NULL;
2130       th->th.th_dispatch->th_dispatch_sh_current = NULL;
2131       th->th.th_dispatch->th_dispatch_pr_current = NULL;
2132     } // if (status == 0)
2133 #if KMP_OS_WINDOWS
2134     else if (last) {
2135       pr->u.p.last_upper = pr->u.p.ub;
2136     }
2137 #endif /* KMP_OS_WINDOWS */
2138     if (p_last != NULL && status != 0)
2139       *p_last = last;
2140   } // if
2141 
2142 #ifdef KMP_DEBUG
2143   {
2144     char *buff;
2145     // create format specifiers before the debug output
2146     buff = __kmp_str_format(
2147         "__kmp_dispatch_next: T#%%d normal case: "
2148         "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p (%%d) returning:%%d\n",
2149         traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2150     KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last,
2151                   (p_last ? *p_last : 0), status));
2152     __kmp_str_free(&buff);
2153   }
2154 #endif
2155 #if INCLUDE_SSC_MARKS
2156   SSC_MARK_DISPATCH_NEXT();
2157 #endif
2158   OMPT_LOOP_END;
2159   KMP_STATS_LOOP_END;
2160   return status;
2161 }
2162 
2163 template <typename T>
2164 static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2165                                   kmp_int32 *plastiter, T *plower, T *pupper,
2166                                   typename traits_t<T>::signed_t incr) {
2167   typedef typename traits_t<T>::unsigned_t UT;
2168   kmp_uint32 team_id;
2169   kmp_uint32 nteams;
2170   UT trip_count;
2171   kmp_team_t *team;
2172   kmp_info_t *th;
2173 
2174   KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2175   KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2176 #ifdef KMP_DEBUG
2177   typedef typename traits_t<T>::signed_t ST;
2178   {
2179     char *buff;
2180     // create format specifiers before the debug output
2181     buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2182                             "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2183                             traits_t<T>::spec, traits_t<T>::spec,
2184                             traits_t<ST>::spec, traits_t<T>::spec);
2185     KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2186     __kmp_str_free(&buff);
2187   }
2188 #endif
2189 
2190   if (__kmp_env_consistency_check) {
2191     if (incr == 0) {
2192       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2193                             loc);
2194     }
2195     if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2196       // The loop is illegal.
2197       // Some zero-trip loops maintained by compiler, e.g.:
2198       //   for(i=10;i<0;++i) // lower >= upper - run-time check
2199       //   for(i=0;i>10;--i) // lower <= upper - run-time check
2200       //   for(i=0;i>10;++i) // incr > 0       - compile-time check
2201       //   for(i=10;i<0;--i) // incr < 0       - compile-time check
2202       // Compiler does not check the following illegal loops:
2203       //   for(i=0;i<10;i+=incr) // where incr<0
2204       //   for(i=10;i>0;i-=incr) // where incr<0
2205       __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2206     }
2207   }
2208   __kmp_assert_valid_gtid(gtid);
2209   th = __kmp_threads[gtid];
2210   team = th->th.th_team;
2211   KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2212   nteams = th->th.th_teams_size.nteams;
2213   team_id = team->t.t_master_tid;
2214   KMP_DEBUG_ASSERT(nteams == (kmp_uint32)team->t.t_parent->t.t_nproc);
2215 
2216   // compute global trip count
2217   if (incr == 1) {
2218     trip_count = *pupper - *plower + 1;
2219   } else if (incr == -1) {
2220     trip_count = *plower - *pupper + 1;
2221   } else if (incr > 0) {
2222     // upper-lower can exceed the limit of signed type
2223     trip_count = (UT)(*pupper - *plower) / incr + 1;
2224   } else {
2225     trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2226   }
2227 
2228   if (trip_count <= nteams) {
2229     KMP_DEBUG_ASSERT(
2230         __kmp_static == kmp_sch_static_greedy ||
2231         __kmp_static ==
2232             kmp_sch_static_balanced); // Unknown static scheduling type.
2233     // only some teams get single iteration, others get nothing
2234     if (team_id < trip_count) {
2235       *pupper = *plower = *plower + team_id * incr;
2236     } else {
2237       *plower = *pupper + incr; // zero-trip loop
2238     }
2239     if (plastiter != NULL)
2240       *plastiter = (team_id == trip_count - 1);
2241   } else {
2242     if (__kmp_static == kmp_sch_static_balanced) {
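      // Balanced split: each team gets either floor(trip_count/nteams) or
      // one extra iteration, with the first `extras` teams taking the
      // larger share.  E.g., trip_count = 10, nteams = 4 gives per-team
      // counts 3, 3, 2, 2.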
2243       UT chunk = trip_count / nteams;
2244       UT extras = trip_count % nteams;
2245       *plower +=
2246           incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2247       *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2248       if (plastiter != NULL)
2249         *plastiter = (team_id == nteams - 1);
2250     } else {
2251       T chunk_inc_count =
2252           (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2253       T upper = *pupper;
2254       KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2255       // Unknown static scheduling type.
2256       *plower += team_id * chunk_inc_count;
2257       *pupper = *plower + chunk_inc_count - incr;
2258       // Check/correct bounds if needed
2259       if (incr > 0) {
2260         if (*pupper < *plower)
2261           *pupper = traits_t<T>::max_value;
2262         if (plastiter != NULL)
2263           *plastiter = *plower <= upper && *pupper > upper - incr;
2264         if (*pupper > upper)
2265           *pupper = upper; // tracker C73258
2266       } else {
2267         if (*pupper > *plower)
2268           *pupper = traits_t<T>::min_value;
2269         if (plastiter != NULL)
2270           *plastiter = *plower >= upper && *pupper < upper - incr;
2271         if (*pupper < upper)
2272           *pupper = upper; // tracker C73258
2273       }
2274     }
2275   }
2276 }
2277 
2278 //-----------------------------------------------------------------------------
2279 // Dispatch routines
2280 //    Transfer call to template< type T >
2281 //    __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2282 //                         T lb, T ub, ST st, ST chunk )
2283 extern "C" {
2284 
2285 /*!
2286 @ingroup WORK_SHARING
2287 @{
2288 @param loc Source location
2289 @param gtid Global thread id
2290 @param schedule Schedule type
2291 @param lb  Lower bound
2292 @param ub  Upper bound
2293 @param st  Step (or increment if you prefer)
2294 @param chunk The chunk size to block with
2295 
2296 This function prepares the runtime to start a dynamically scheduled for loop,
2297 saving the loop arguments.
2298 These functions are all identical apart from the types of the arguments.
2299 */
2300 
2301 void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2302                             enum sched_type schedule, kmp_int32 lb,
2303                             kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2304   KMP_DEBUG_ASSERT(__kmp_init_serial);
2305 #if OMPT_SUPPORT && OMPT_OPTIONAL
2306   OMPT_STORE_RETURN_ADDRESS(gtid);
2307 #endif
2308   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2309 }
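/* A rough sketch of how a compiler might lower
     #pragma omp for schedule(dynamic, 4)
   onto these entry points.  The loop body, `n` and `body()` are
   illustrative only (not part of this runtime); `loc` and `gtid` stand for
   the usual ident_t* / global thread id arguments:

     kmp_int32 lb = 0, ub = n - 1, st = 1, last = 0;
     __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, lb, ub, st, 4);
     while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
       for (kmp_int32 i = lb; i <= ub; i += st)
         body(i);
     }
*/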
2310 /*!
2311 See @ref __kmpc_dispatch_init_4
2312 */
2313 void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2314                              enum sched_type schedule, kmp_uint32 lb,
2315                              kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2316   KMP_DEBUG_ASSERT(__kmp_init_serial);
2317 #if OMPT_SUPPORT && OMPT_OPTIONAL
2318   OMPT_STORE_RETURN_ADDRESS(gtid);
2319 #endif
2320   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2321 }
2322 
2323 /*!
2324 See @ref __kmpc_dispatch_init_4
2325 */
2326 void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2327                             enum sched_type schedule, kmp_int64 lb,
2328                             kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2329   KMP_DEBUG_ASSERT(__kmp_init_serial);
2330 #if OMPT_SUPPORT && OMPT_OPTIONAL
2331   OMPT_STORE_RETURN_ADDRESS(gtid);
2332 #endif
2333   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2334 }
2335 
2336 /*!
2337 See @ref __kmpc_dispatch_init_4
2338 */
2339 void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2340                              enum sched_type schedule, kmp_uint64 lb,
2341                              kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2342   KMP_DEBUG_ASSERT(__kmp_init_serial);
2343 #if OMPT_SUPPORT && OMPT_OPTIONAL
2344   OMPT_STORE_RETURN_ADDRESS(gtid);
2345 #endif
2346   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2347 }
2348 
2349 /*!
2350 See @ref __kmpc_dispatch_init_4
2351 
2352 These functions differ from the __kmpc_dispatch_init set in that they are
2353 called for the composite distribute parallel for construct, so before
2354 dispatching the regular iterations we must compute the per-team iteration space.
2355 
2356 These functions are all identical apart from the types of the arguments.
2357 */
2358 void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2359                                  enum sched_type schedule, kmp_int32 *p_last,
2360                                  kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2361                                  kmp_int32 chunk) {
2362   KMP_DEBUG_ASSERT(__kmp_init_serial);
2363 #if OMPT_SUPPORT && OMPT_OPTIONAL
2364   OMPT_STORE_RETURN_ADDRESS(gtid);
2365 #endif
2366   __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2367   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2368 }
2369 
2370 void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2371                                   enum sched_type schedule, kmp_int32 *p_last,
2372                                   kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2373                                   kmp_int32 chunk) {
2374   KMP_DEBUG_ASSERT(__kmp_init_serial);
2375 #if OMPT_SUPPORT && OMPT_OPTIONAL
2376   OMPT_STORE_RETURN_ADDRESS(gtid);
2377 #endif
2378   __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2379   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
2380 }
2381 
2382 void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2383                                  enum sched_type schedule, kmp_int32 *p_last,
2384                                  kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2385                                  kmp_int64 chunk) {
2386   KMP_DEBUG_ASSERT(__kmp_init_serial);
2387 #if OMPT_SUPPORT && OMPT_OPTIONAL
2388   OMPT_STORE_RETURN_ADDRESS(gtid);
2389 #endif
2390   __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2391   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2392 }
2393 
2394 void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2395                                   enum sched_type schedule, kmp_int32 *p_last,
2396                                   kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2397                                   kmp_int64 chunk) {
2398   KMP_DEBUG_ASSERT(__kmp_init_serial);
2399 #if OMPT_SUPPORT && OMPT_OPTIONAL
2400   OMPT_STORE_RETURN_ADDRESS(gtid);
2401 #endif
2402   __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2403   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
2404 }
2405 
2406 /*!
2407 @param loc Source code location
2408 @param gtid Global thread id
2409 @param p_last Pointer to a flag set to one if this is the last chunk or zero
2410 otherwise
2411 @param p_lb   Pointer to the lower bound for the next chunk of work
2412 @param p_ub   Pointer to the upper bound for the next chunk of work
2413 @param p_st   Pointer to the stride for the next chunk of work
2414 @return one if there is work to be done, zero otherwise
2415 
2416 Get the next dynamically allocated chunk of work for this thread.
2417 If there is no more work, then lb, ub and stride need not be modified.
2418 */
2419 int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2420                            kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2421 #if OMPT_SUPPORT && OMPT_OPTIONAL
2422   OMPT_STORE_RETURN_ADDRESS(gtid);
2423 #endif
2424   return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st
2425 #if OMPT_SUPPORT && OMPT_OPTIONAL
2426                                         ,
2427                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2428 #endif
2429                                             );
2430 }
2431 
2432 /*!
2433 See @ref __kmpc_dispatch_next_4
2434 */
2435 int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2436                             kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2437                             kmp_int32 *p_st) {
2438 #if OMPT_SUPPORT && OMPT_OPTIONAL
2439   OMPT_STORE_RETURN_ADDRESS(gtid);
2440 #endif
2441   return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st
2442 #if OMPT_SUPPORT && OMPT_OPTIONAL
2443                                          ,
2444                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2445 #endif
2446                                              );
2447 }
2448 
2449 /*!
2450 See @ref __kmpc_dispatch_next_4
2451 */
2452 int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2453                            kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2454 #if OMPT_SUPPORT && OMPT_OPTIONAL
2455   OMPT_STORE_RETURN_ADDRESS(gtid);
2456 #endif
2457   return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st
2458 #if OMPT_SUPPORT && OMPT_OPTIONAL
2459                                         ,
2460                                         OMPT_LOAD_RETURN_ADDRESS(gtid)
2461 #endif
2462                                             );
2463 }
2464 
2465 /*!
2466 See @ref __kmpc_dispatch_next_4
2467 */
2468 int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2469                             kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2470                             kmp_int64 *p_st) {
2471 #if OMPT_SUPPORT && OMPT_OPTIONAL
2472   OMPT_STORE_RETURN_ADDRESS(gtid);
2473 #endif
2474   return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st
2475 #if OMPT_SUPPORT && OMPT_OPTIONAL
2476                                          ,
2477                                          OMPT_LOAD_RETURN_ADDRESS(gtid)
2478 #endif
2479                                              );
2480 }
2481 
2482 /*!
2483 @param loc Source code location
2484 @param gtid Global thread id
2485 
2486 Mark the end of a dynamic loop.
2487 */
2488 void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2489   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2490 }
2491 
2492 /*!
2493 See @ref __kmpc_dispatch_fini_4
2494 */
2495 void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2496   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2497 }
2498 
2499 /*!
2500 See @ref __kmpc_dispatch_fini_4
2501 */
2502 void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2503   __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
2504 }
2505 
2506 /*!
2507 See @ref __kmpc_dispatch_fini_4
2508 */
2509 void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2510   __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
2511 }
2512 /*! @} */
2513 
2514 //-----------------------------------------------------------------------------
2515 // Non-template routines from kmp_dispatch.cpp used in other sources
2516 
2517 kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2518   return value == checker;
2519 }
2520 
2521 kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2522   return value != checker;
2523 }
2524 
2525 kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2526   return value < checker;
2527 }
2528 
2529 kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2530   return value >= checker;
2531 }
2532 
2533 kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2534   return value <= checker;
2535 }
2536 
2537 kmp_uint32
2538 __kmp_wait_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2539              kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2540              void *obj // Higher-level synchronization object, or NULL.
2541              ) {
2542   // note: we may not belong to a team at this point
2543   volatile kmp_uint32 *spin = spinner;
2544   kmp_uint32 check = checker;
2545   kmp_uint32 spins;
2546   kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2547   kmp_uint32 r;
2548 
2549   KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
2550   KMP_INIT_YIELD(spins);
2551   // main wait spin loop
2552   while (!f(r = TCR_4(*spin), check)) {
2553     KMP_FSYNC_SPIN_PREPARE(obj);
2554     /* GEH - remove this since it was accidentally introduced when kmp_wait was
2555        split. It causes problems with infinite recursion because of exit lock */
2556     /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2557         __kmp_abort_thread(); */
2558     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2559   }
2560   KMP_FSYNC_SPIN_ACQUIRED(obj);
2561   return r;
2562 }
2563 
2564 void __kmp_wait_4_ptr(void *spinner, kmp_uint32 checker,
2565                       kmp_uint32 (*pred)(void *, kmp_uint32),
2566                       void *obj // Higher-level synchronization object, or NULL.
2567                       ) {
2568   // note: we may not belong to a team at this point
2569   void *spin = spinner;
2570   kmp_uint32 check = checker;
2571   kmp_uint32 spins;
2572   kmp_uint32 (*f)(void *, kmp_uint32) = pred;
2573 
2574   KMP_FSYNC_SPIN_INIT(obj, spin);
2575   KMP_INIT_YIELD(spins);
2576   // main wait spin loop
2577   while (!f(spin, check)) {
2578     KMP_FSYNC_SPIN_PREPARE(obj);
2579     /* if we have waited a bit, or are oversubscribed, yield */
2580     /* pause is in the following code */
2581     KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
2582   }
2583   KMP_FSYNC_SPIN_ACQUIRED(obj);
2584 }
2585 
2586 } // extern "C"
2587 
2588 #ifdef KMP_GOMP_COMPAT
2589 
2590 void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2591                                enum sched_type schedule, kmp_int32 lb,
2592                                kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2593                                int push_ws) {
2594   __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2595                                  push_ws);
2596 }
2597 
2598 void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2599                                 enum sched_type schedule, kmp_uint32 lb,
2600                                 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2601                                 int push_ws) {
2602   __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2603                                   push_ws);
2604 }
2605 
2606 void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2607                                enum sched_type schedule, kmp_int64 lb,
2608                                kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2609                                int push_ws) {
2610   __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2611                                  push_ws);
2612 }
2613 
2614 void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2615                                 enum sched_type schedule, kmp_uint64 lb,
2616                                 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2617                                 int push_ws) {
2618   __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2619                                   push_ws);
2620 }
2621 
2622 void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2623   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2624 }
2625 
2626 void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2627   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2628 }
2629 
2630 void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2631   __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
2632 }
2633 
2634 void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2635   __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
2636 }
2637 
2638 #endif /* KMP_GOMP_COMPAT */
2639 
2640 /* ------------------------------------------------------------------------ */
2641