1 /*
2 * kmp_dispatch.h: dynamic scheduling - iteration initialization and dispatch.
3 */
4
5 //===----------------------------------------------------------------------===//
6 //
7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8 // See https://llvm.org/LICENSE.txt for license information.
9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10 //
11 //===----------------------------------------------------------------------===//
12
13 #ifndef KMP_DISPATCH_H
14 #define KMP_DISPATCH_H
15
16 /* ------------------------------------------------------------------------ */
17 /* ------------------------------------------------------------------------ */
18
19 #include "kmp.h"
20 #include "kmp_error.h"
21 #include "kmp_i18n.h"
22 #include "kmp_itt.h"
23 #include "kmp_stats.h"
24 #include "kmp_str.h"
25 #if KMP_OS_WINDOWS && KMP_ARCH_X86
26 #include <float.h>
27 #endif
28
29 #if OMPT_SUPPORT
30 #include "ompt-internal.h"
31 #include "ompt-specific.h"
32 #endif
33
34 /* ------------------------------------------------------------------------ */
35 /* ------------------------------------------------------------------------ */
36 #if KMP_USE_HIER_SCHED
37 // Forward declarations of some hierarchical scheduling data structures
38 template <typename T> struct kmp_hier_t;
39 template <typename T> struct kmp_hier_top_unit_t;
40 #endif // KMP_USE_HIER_SCHED
41
42 template <typename T> struct dispatch_shared_info_template;
43 template <typename T> struct dispatch_private_info_template;
44
45 template <typename T>
46 extern void __kmp_dispatch_init_algorithm(ident_t *loc, int gtid,
47 dispatch_private_info_template<T> *pr,
48 enum sched_type schedule, T lb, T ub,
49 typename traits_t<T>::signed_t st,
50 #if USE_ITT_BUILD
51 kmp_uint64 *cur_chunk,
52 #endif
53 typename traits_t<T>::signed_t chunk,
54 T nproc, T unit_id);
55 template <typename T>
56 extern int __kmp_dispatch_next_algorithm(
57 int gtid, dispatch_private_info_template<T> *pr,
58 dispatch_shared_info_template<T> volatile *sh, kmp_int32 *p_last, T *p_lb,
59 T *p_ub, typename traits_t<T>::signed_t *p_st, T nproc, T unit_id);
60
61 void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
62 void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref, ident_t *loc_ref);
63
64 #if KMP_STATIC_STEAL_ENABLED
65
66 // replaces dispatch_private_info{32,64} structures and
67 // dispatch_private_info{32,64}_t types
68 template <typename T> struct dispatch_private_infoXX_template {
69 typedef typename traits_t<T>::unsigned_t UT;
70 typedef typename traits_t<T>::signed_t ST;
71 UT count; // unsigned
72 T ub;
73 /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
74 T lb;
75 ST st; // signed
76 UT tc; // unsigned
77 T static_steal_counter; // for static_steal only; maybe better to put after ub
78 kmp_lock_t *th_steal_lock; // lock used for chunk stealing
79 /* parm[1-4] are used in different ways by different scheduling algorithms */
80
81 // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
82 // a) parm3 is properly aligned and
83 // b) all parm1-4 are in the same cache line.
84 // Because of parm1-4 are used together, performance seems to be better
85 // if they are in the same line (not measured though).
86
87 struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
88 T parm1;
89 T parm2;
90 T parm3;
91 T parm4;
92 };
93
94 UT ordered_lower; // unsigned
95 UT ordered_upper; // unsigned
96 #if KMP_OS_WINDOWS
97 T last_upper;
98 #endif /* KMP_OS_WINDOWS */
99 };
100
101 #else /* KMP_STATIC_STEAL_ENABLED */
102
103 // replaces dispatch_private_info{32,64} structures and
104 // dispatch_private_info{32,64}_t types
105 template <typename T> struct dispatch_private_infoXX_template {
106 typedef typename traits_t<T>::unsigned_t UT;
107 typedef typename traits_t<T>::signed_t ST;
108 T lb;
109 T ub;
110 ST st; // signed
111 UT tc; // unsigned
112
113 T parm1;
114 T parm2;
115 T parm3;
116 T parm4;
117
118 UT count; // unsigned
119
120 UT ordered_lower; // unsigned
121 UT ordered_upper; // unsigned
122 #if KMP_OS_WINDOWS
123 T last_upper;
124 #endif /* KMP_OS_WINDOWS */
125 };
126 #endif /* KMP_STATIC_STEAL_ENABLED */
127
128 template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
129 // duplicate alignment here, otherwise size of structure is not correct in our
130 // compiler
131 union KMP_ALIGN_CACHE private_info_tmpl {
132 dispatch_private_infoXX_template<T> p;
133 dispatch_private_info64_t p64;
134 } u;
135 enum sched_type schedule; /* scheduling algorithm */
136 kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) */
137 kmp_uint32 ordered_bumped;
138 // to retain the structure size after making order
139 kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
140 dispatch_private_info *next; /* stack of buffers for nest of serial regions */
141 kmp_uint32 type_size;
142 #if KMP_USE_HIER_SCHED
143 kmp_int32 hier_id;
144 kmp_hier_top_unit_t<T> *hier_parent;
145 // member functions
get_hier_iddispatch_private_info_template146 kmp_int32 get_hier_id() const { return hier_id; }
get_parentdispatch_private_info_template147 kmp_hier_top_unit_t<T> *get_parent() { return hier_parent; }
148 #endif
149 enum cons_type pushed_ws;
150 };
151
152 // replaces dispatch_shared_info{32,64} structures and
153 // dispatch_shared_info{32,64}_t types
154 template <typename T> struct dispatch_shared_infoXX_template {
155 typedef typename traits_t<T>::unsigned_t UT;
156 /* chunk index under dynamic, number of idle threads under static-steal;
157 iteration index otherwise */
158 volatile UT iteration;
159 volatile UT num_done;
160 volatile UT ordered_iteration;
161 // to retain the structure size making ordered_iteration scalar
162 UT ordered_dummy[KMP_MAX_ORDERED - 3];
163 };
164
165 // replaces dispatch_shared_info structure and dispatch_shared_info_t type
166 template <typename T> struct dispatch_shared_info_template {
167 typedef typename traits_t<T>::unsigned_t UT;
168 // we need union here to keep the structure size
169 union shared_info_tmpl {
170 dispatch_shared_infoXX_template<UT> s;
171 dispatch_shared_info64_t s64;
172 } u;
173 volatile kmp_uint32 buffer_index;
174 volatile kmp_int32 doacross_buf_idx; // teamwise index
175 kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
176 kmp_int32 doacross_num_done; // count finished threads
177 #if KMP_USE_HIER_SCHED
178 kmp_hier_t<T> *hier;
179 #endif
180 #if KMP_USE_HWLOC
181 // When linking with libhwloc, the ORDERED EPCC test slowsdown on big
182 // machines (> 48 cores). Performance analysis showed that a cache thrash
183 // was occurring and this padding helps alleviate the problem.
184 char padding[64];
185 #endif
186 };
187
188 /* ------------------------------------------------------------------------ */
189 /* ------------------------------------------------------------------------ */
190
191 #undef USE_TEST_LOCKS
192
193 // test_then_add template (general template should NOT be used)
194 template <typename T> static __forceinline T test_then_add(volatile T *p, T d);
195
196 template <>
197 __forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
198 kmp_int32 d) {
199 kmp_int32 r;
200 r = KMP_TEST_THEN_ADD32(p, d);
201 return r;
202 }
203
204 template <>
205 __forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
206 kmp_int64 d) {
207 kmp_int64 r;
208 r = KMP_TEST_THEN_ADD64(p, d);
209 return r;
210 }
211
212 // test_then_inc_acq template (general template should NOT be used)
213 template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);
214
215 template <>
216 __forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
217 kmp_int32 r;
218 r = KMP_TEST_THEN_INC_ACQ32(p);
219 return r;
220 }
221
222 template <>
223 __forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
224 kmp_int64 r;
225 r = KMP_TEST_THEN_INC_ACQ64(p);
226 return r;
227 }
228
229 // test_then_inc template (general template should NOT be used)
230 template <typename T> static __forceinline T test_then_inc(volatile T *p);
231
232 template <>
233 __forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
234 kmp_int32 r;
235 r = KMP_TEST_THEN_INC32(p);
236 return r;
237 }
238
239 template <>
240 __forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
241 kmp_int64 r;
242 r = KMP_TEST_THEN_INC64(p);
243 return r;
244 }
245
246 // compare_and_swap template (general template should NOT be used)
247 template <typename T>
248 static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);
249
250 template <>
251 __forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
252 kmp_int32 c, kmp_int32 s) {
253 return KMP_COMPARE_AND_STORE_REL32(p, c, s);
254 }
255
256 template <>
257 __forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
258 kmp_int64 c, kmp_int64 s) {
259 return KMP_COMPARE_AND_STORE_REL64(p, c, s);
260 }
261
__kmp_ge(T value,T checker)262 template <typename T> kmp_uint32 __kmp_ge(T value, T checker) {
263 return value >= checker;
264 }
__kmp_eq(T value,T checker)265 template <typename T> kmp_uint32 __kmp_eq(T value, T checker) {
266 return value == checker;
267 }
268
269 /*
270 Spin wait loop that pauses between checks.
271 Waits until function returns non-zero when called with *spinner and check.
272 Does NOT put threads to sleep.
273 Arguments:
274 UT is unsigned 4- or 8-byte type
275 spinner - memory location to check value
276 checker - value which spinner is >, <, ==, etc.
277 pred - predicate function to perform binary comparison of some sort
278 #if USE_ITT_BUILD
279 obj -- is higher-level synchronization object to report to ittnotify. It
280 is used to report locks consistently. For example, if lock is acquired
281 immediately, its address is reported to ittnotify via
282 KMP_FSYNC_ACQUIRED(). However, it lock cannot be acquired immediately
283 and lock routine calls to KMP_WAIT(), the later should report the
284 same address, not an address of low-level spinner.
285 #endif // USE_ITT_BUILD
286 TODO: make inline function (move to header file for icl)
287 */
288 template <typename UT>
__kmp_wait(volatile UT * spinner,UT checker,kmp_uint32 (* pred)(UT,UT)USE_ITT_BUILD_ARG (void * obj))289 static UT __kmp_wait(volatile UT *spinner, UT checker,
290 kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(void *obj)) {
291 // note: we may not belong to a team at this point
292 volatile UT *spin = spinner;
293 UT check = checker;
294 kmp_uint32 spins;
295 kmp_uint32 (*f)(UT, UT) = pred;
296 UT r;
297
298 KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
299 KMP_INIT_YIELD(spins);
300 // main wait spin loop
301 while (!f(r = *spin, check)) {
302 KMP_FSYNC_SPIN_PREPARE(obj);
303 /* GEH - remove this since it was accidentally introduced when kmp_wait was
304 split.
305 It causes problems with infinite recursion because of exit lock */
306 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
307 __kmp_abort_thread(); */
308 // If oversubscribed, or have waited a bit then yield.
309 KMP_YIELD_OVERSUB_ELSE_SPIN(spins);
310 }
311 KMP_FSYNC_SPIN_ACQUIRED(obj);
312 return r;
313 }
314
315 /* ------------------------------------------------------------------------ */
316 /* ------------------------------------------------------------------------ */
317
318 template <typename UT>
__kmp_dispatch_deo(int * gtid_ref,int * cid_ref,ident_t * loc_ref)319 void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
320 dispatch_private_info_template<UT> *pr;
321
322 int gtid = *gtid_ref;
323 // int cid = *cid_ref;
324 kmp_info_t *th = __kmp_threads[gtid];
325 KMP_DEBUG_ASSERT(th->th.th_dispatch);
326
327 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
328 if (__kmp_env_consistency_check) {
329 pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
330 th->th.th_dispatch->th_dispatch_pr_current);
331 if (pr->pushed_ws != ct_none) {
332 #if KMP_USE_DYNAMIC_LOCK
333 __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
334 #else
335 __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
336 #endif
337 }
338 }
339
340 if (!th->th.th_team->t.t_serialized) {
341 dispatch_shared_info_template<UT> *sh =
342 reinterpret_cast<dispatch_shared_info_template<UT> *>(
343 th->th.th_dispatch->th_dispatch_sh_current);
344 UT lower;
345
346 if (!__kmp_env_consistency_check) {
347 pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
348 th->th.th_dispatch->th_dispatch_pr_current);
349 }
350 lower = pr->u.p.ordered_lower;
351
352 #if !defined(KMP_GOMP_COMPAT)
353 if (__kmp_env_consistency_check) {
354 if (pr->ordered_bumped) {
355 struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
356 __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
357 ct_ordered_in_pdo, loc_ref,
358 &p->stack_data[p->w_top]);
359 }
360 }
361 #endif /* !defined(KMP_GOMP_COMPAT) */
362
363 KMP_MB();
364 #ifdef KMP_DEBUG
365 {
366 char *buff;
367 // create format specifiers before the debug output
368 buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
369 "ordered_iter:%%%s lower:%%%s\n",
370 traits_t<UT>::spec, traits_t<UT>::spec);
371 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
372 __kmp_str_free(&buff);
373 }
374 #endif
375 __kmp_wait<UT>(&sh->u.s.ordered_iteration, lower,
376 __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
377 KMP_MB(); /* is this necessary? */
378 #ifdef KMP_DEBUG
379 {
380 char *buff;
381 // create format specifiers before the debug output
382 buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
383 "ordered_iter:%%%s lower:%%%s\n",
384 traits_t<UT>::spec, traits_t<UT>::spec);
385 KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
386 __kmp_str_free(&buff);
387 }
388 #endif
389 }
390 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
391 }
392
393 template <typename UT>
__kmp_dispatch_dxo(int * gtid_ref,int * cid_ref,ident_t * loc_ref)394 void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
395 typedef typename traits_t<UT>::signed_t ST;
396 dispatch_private_info_template<UT> *pr;
397
398 int gtid = *gtid_ref;
399 // int cid = *cid_ref;
400 kmp_info_t *th = __kmp_threads[gtid];
401 KMP_DEBUG_ASSERT(th->th.th_dispatch);
402
403 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
404 if (__kmp_env_consistency_check) {
405 pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
406 th->th.th_dispatch->th_dispatch_pr_current);
407 if (pr->pushed_ws != ct_none) {
408 __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
409 }
410 }
411
412 if (!th->th.th_team->t.t_serialized) {
413 dispatch_shared_info_template<UT> *sh =
414 reinterpret_cast<dispatch_shared_info_template<UT> *>(
415 th->th.th_dispatch->th_dispatch_sh_current);
416
417 if (!__kmp_env_consistency_check) {
418 pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
419 th->th.th_dispatch->th_dispatch_pr_current);
420 }
421
422 KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
423 #if !defined(KMP_GOMP_COMPAT)
424 if (__kmp_env_consistency_check) {
425 if (pr->ordered_bumped != 0) {
426 struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
427 /* How to test it? - OM */
428 __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
429 ct_ordered_in_pdo, loc_ref,
430 &p->stack_data[p->w_top]);
431 }
432 }
433 #endif /* !defined(KMP_GOMP_COMPAT) */
434
435 KMP_MB(); /* Flush all pending memory write invalidates. */
436
437 pr->ordered_bumped += 1;
438
439 KD_TRACE(1000,
440 ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
441 gtid, pr->ordered_bumped));
442
443 KMP_MB(); /* Flush all pending memory write invalidates. */
444
445 /* TODO use general release procedure? */
446 test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
447
448 KMP_MB(); /* Flush all pending memory write invalidates. */
449 }
450 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
451 }
452
453 /* Computes and returns x to the power of y, where y must a non-negative integer
454 */
455 template <typename UT>
__kmp_pow(long double x,UT y)456 static __forceinline long double __kmp_pow(long double x, UT y) {
457 long double s = 1.0L;
458
459 KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
460 // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
461 while (y) {
462 if (y & 1)
463 s *= x;
464 x *= x;
465 y >>= 1;
466 }
467 return s;
468 }
469
470 /* Computes and returns the number of unassigned iterations after idx chunks
471 have been assigned
472 (the total number of unassigned iterations in chunks with index greater than
473 or equal to idx).
474 __forceinline seems to be broken so that if we __forceinline this function,
475 the behavior is wrong
476 (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
477 */
478 template <typename T>
479 static __inline typename traits_t<T>::unsigned_t
__kmp_dispatch_guided_remaining(T tc,typename traits_t<T>::floating_t base,typename traits_t<T>::unsigned_t idx)480 __kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
481 typename traits_t<T>::unsigned_t idx) {
482 /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
483 least for ICL 8.1, long double arithmetic may not really have
484 long double precision, even with /Qlong_double. Currently, we
485 workaround that in the caller code, by manipulating the FPCW for
486 Windows* OS on IA-32 architecture. The lack of precision is not
487 expected to be a correctness issue, though.
488 */
489 typedef typename traits_t<T>::unsigned_t UT;
490
491 long double x = tc * __kmp_pow<UT>(base, idx);
492 UT r = (UT)x;
493 if (x == r)
494 return r;
495 return r + 1;
496 }
497
498 // Parameters of the guided-iterative algorithm:
499 // p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
500 // p3 = 1 / ( n * nproc ) // remaining iterations multiplier
501 // by default n = 2. For example with n = 3 the chunks distribution will be more
502 // flat.
503 // With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc.
504 static const int guided_int_param = 2;
505 static const double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
506 #endif // KMP_DISPATCH_H
507