1 #ifndef KMP_STATS_H 2 #define KMP_STATS_H 3 4 /** @file kmp_stats.h 5 * Functions for collecting statistics. 6 */ 7 8 //===----------------------------------------------------------------------===// 9 // 10 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 11 // See https://llvm.org/LICENSE.txt for license information. 12 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 13 // 14 //===----------------------------------------------------------------------===// 15 16 #include "kmp_config.h" 17 #include "kmp_debug.h" 18 19 #if KMP_STATS_ENABLED 20 /* Statistics accumulator. 21 Accumulates number of samples and computes min, max, mean, standard deviation 22 on the fly. 23 24 Online variance calculation algorithm from 25 http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm 26 */ 27 28 #include "kmp_stats_timing.h" 29 #include <limits> 30 #include <math.h> 31 #include <new> // placement new 32 #include <stdint.h> 33 #include <string> 34 #include <vector> 35 36 /* Enable developer statistics here if you want them. They are more detailed 37 than is useful for application characterisation and are intended for the 38 runtime library developer. */ 39 #define KMP_DEVELOPER_STATS 0 40 41 /* Enable/Disable histogram output */ 42 #define KMP_STATS_HIST 0 43 44 /*! 45 * @ingroup STATS_GATHERING 46 * \brief flags to describe the statistic (timer or counter) 47 * 48 */ 49 enum stats_flags_e { 50 noTotal = 1 << 0, //!< do not show a TOTAL_aggregation for this statistic 51 onlyInMaster = 1 << 1, //!< statistic is valid only for master 52 noUnits = 1 << 2, //!< statistic doesn't need units printed next to it 53 notInMaster = 1 << 3, //!< statistic is valid only for non-master threads 54 logEvent = 1 << 4 //!< statistic can be logged on the event timeline when 55 //! KMP_STATS_EVENTS is on (valid only for timers) 56 }; 57 58 /*! 59 * @ingroup STATS_GATHERING 60 * \brief the states which a thread can be in 61 * 62 */ 63 enum stats_state_e { 64 IDLE, 65 SERIAL_REGION, 66 FORK_JOIN_BARRIER, 67 PLAIN_BARRIER, 68 TASKWAIT, 69 TASKYIELD, 70 TASKGROUP, 71 IMPLICIT_TASK, 72 EXPLICIT_TASK, 73 TEAMS_REGION 74 }; 75 76 /*! 77 * \brief Add new counters under KMP_FOREACH_COUNTER() macro in kmp_stats.h 78 * 79 * @param macro a user defined macro that takes three arguments - 80 * macro(COUNTER_NAME, flags, arg) 81 * @param arg a user defined argument to send to the user defined macro 82 * 83 * \details A counter counts the occurrence of some event. Each thread 84 * accumulates its own count, at the end of execution the counts are aggregated 85 * treating each thread as a separate measurement. (Unless onlyInMaster is set, 86 * in which case there's only a single measurement). The min,mean,max are 87 * therefore the values for the threads. Adding the counter here and then 88 * putting a KMP_BLOCK_COUNTER(name) at the point you want to count is all you 89 * need to do. All of the tables and printing is generated from this macro. 90 * Format is "macro(name, flags, arg)" 91 * 92 * @ingroup STATS_GATHERING 93 */ 94 // clang-format off 95 #define KMP_FOREACH_COUNTER(macro, arg) \ 96 macro(OMP_PARALLEL,stats_flags_e::onlyInMaster|stats_flags_e::noTotal,arg) \ 97 macro(OMP_NESTED_PARALLEL, 0, arg) \ 98 macro(OMP_LOOP_STATIC, 0, arg) \ 99 macro(OMP_LOOP_STATIC_STEAL, 0, arg) \ 100 macro(OMP_LOOP_DYNAMIC, 0, arg) \ 101 macro(OMP_DISTRIBUTE, 0, arg) \ 102 macro(OMP_BARRIER, 0, arg) \ 103 macro(OMP_CRITICAL, 0, arg) \ 104 macro(OMP_SINGLE, 0, arg) \ 105 macro(OMP_MASTER, 0, arg) \ 106 macro(OMP_TEAMS, 0, arg) \ 107 macro(OMP_set_lock, 0, arg) \ 108 macro(OMP_test_lock, 0, arg) \ 109 macro(REDUCE_wait, 0, arg) \ 110 macro(REDUCE_nowait, 0, arg) \ 111 macro(OMP_TASKYIELD, 0, arg) \ 112 macro(OMP_TASKLOOP, 0, arg) \ 113 macro(TASK_executed, 0, arg) \ 114 macro(TASK_cancelled, 0, arg) \ 115 macro(TASK_stolen, 0, arg) 116 // clang-format on 117 118 /*! 119 * \brief Add new timers under KMP_FOREACH_TIMER() macro in kmp_stats.h 120 * 121 * @param macro a user defined macro that takes three arguments - 122 * macro(TIMER_NAME, flags, arg) 123 * @param arg a user defined argument to send to the user defined macro 124 * 125 * \details A timer collects multiple samples of some count in each thread and 126 * then finally aggregates all of the samples from all of the threads. For most 127 * timers the printing code also provides an aggregation over the thread totals. 128 * These are printed as TOTAL_foo. The count is normally a time (in ticks), 129 * hence the name "timer". (But can be any value, so we use this for "number of 130 * arguments passed to fork" as well). For timers the threads are not 131 * significant, it's the individual observations that count, so the statistics 132 * are at that level. Format is "macro(name, flags, arg)" 133 * 134 * @ingroup STATS_GATHERING2 135 */ 136 // clang-format off 137 #define KMP_FOREACH_TIMER(macro, arg) \ 138 macro (OMP_worker_thread_life, stats_flags_e::logEvent, arg) \ 139 macro (OMP_parallel, stats_flags_e::logEvent, arg) \ 140 macro (OMP_parallel_overhead, stats_flags_e::logEvent, arg) \ 141 macro (OMP_teams, stats_flags_e::logEvent, arg) \ 142 macro (OMP_teams_overhead, stats_flags_e::logEvent, arg) \ 143 macro (OMP_loop_static, 0, arg) \ 144 macro (OMP_loop_static_scheduling, 0, arg) \ 145 macro (OMP_loop_dynamic, 0, arg) \ 146 macro (OMP_loop_dynamic_scheduling, 0, arg) \ 147 macro (OMP_distribute, 0, arg) \ 148 macro (OMP_distribute_scheduling, 0, arg) \ 149 macro (OMP_critical, 0, arg) \ 150 macro (OMP_critical_wait, 0, arg) \ 151 macro (OMP_single, 0, arg) \ 152 macro (OMP_master, 0, arg) \ 153 macro (OMP_task_immediate, 0, arg) \ 154 macro (OMP_task_taskwait, 0, arg) \ 155 macro (OMP_task_taskyield, 0, arg) \ 156 macro (OMP_task_taskgroup, 0, arg) \ 157 macro (OMP_task_join_bar, 0, arg) \ 158 macro (OMP_task_plain_bar, 0, arg) \ 159 macro (OMP_taskloop_scheduling, 0, arg) \ 160 macro (OMP_plain_barrier, stats_flags_e::logEvent, arg) \ 161 macro (OMP_idle, stats_flags_e::logEvent, arg) \ 162 macro (OMP_fork_barrier, stats_flags_e::logEvent, arg) \ 163 macro (OMP_join_barrier, stats_flags_e::logEvent, arg) \ 164 macro (OMP_serial, stats_flags_e::logEvent, arg) \ 165 macro (OMP_set_numthreads, stats_flags_e::noUnits | stats_flags_e::noTotal, \ 166 arg) \ 167 macro (OMP_PARALLEL_args, stats_flags_e::noUnits | stats_flags_e::noTotal, \ 168 arg) \ 169 macro (OMP_loop_static_iterations, \ 170 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ 171 macro (OMP_loop_static_total_iterations, \ 172 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ 173 macro (OMP_loop_dynamic_iterations, \ 174 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ 175 macro (OMP_loop_dynamic_total_iterations, \ 176 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ 177 macro (OMP_distribute_iterations, \ 178 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ 179 KMP_FOREACH_DEVELOPER_TIMER(macro, arg) 180 // clang-format on 181 182 // OMP_worker_thread_life -- Time from thread becoming an OpenMP thread (either 183 // initializing OpenMP or being created by a master) 184 // until the thread is destroyed 185 // OMP_parallel -- Time thread spends executing work directly 186 // within a #pragma omp parallel 187 // OMP_parallel_overhead -- Time thread spends setting up a parallel region 188 // OMP_loop_static -- Time thread spends executing loop iterations from 189 // a statically scheduled loop 190 // OMP_loop_static_scheduling -- Time thread spends scheduling loop iterations 191 // from a statically scheduled loop 192 // OMP_loop_dynamic -- Time thread spends executing loop iterations from 193 // a dynamically scheduled loop 194 // OMP_loop_dynamic_scheduling -- Time thread spends scheduling loop iterations 195 // from a dynamically scheduled loop 196 // OMP_critical -- Time thread spends executing critical section 197 // OMP_critical_wait -- Time thread spends waiting to enter 198 // a critical section 199 // OMP_single -- Time spent executing a "single" region 200 // OMP_master -- Time spent executing a "master" region 201 // OMP_task_immediate -- Time spent executing non-deferred tasks 202 // OMP_task_taskwait -- Time spent executing tasks inside a taskwait 203 // construct 204 // OMP_task_taskyield -- Time spent executing tasks inside a taskyield 205 // construct 206 // OMP_task_taskgroup -- Time spent executing tasks inside a taskygroup 207 // construct 208 // OMP_task_join_bar -- Time spent executing tasks inside a join barrier 209 // OMP_task_plain_bar -- Time spent executing tasks inside a barrier 210 // construct 211 // OMP_taskloop_scheduling -- Time spent scheduling tasks inside a taskloop 212 // construct 213 // OMP_plain_barrier -- Time spent in a #pragma omp barrier construct or 214 // inside implicit barrier at end of worksharing 215 // construct 216 // OMP_idle -- Time worker threads spend waiting for next 217 // parallel region 218 // OMP_fork_barrier -- Time spent in a the fork barrier surrounding a 219 // parallel region 220 // OMP_join_barrier -- Time spent in a the join barrier surrounding a 221 // parallel region 222 // OMP_serial -- Time thread zero spends executing serial code 223 // OMP_set_numthreads -- Values passed to omp_set_num_threads 224 // OMP_PARALLEL_args -- Number of arguments passed to a parallel region 225 // OMP_loop_static_iterations -- Number of iterations thread is assigned for 226 // statically scheduled loops 227 // OMP_loop_dynamic_iterations -- Number of iterations thread is assigned for 228 // dynamically scheduled loops 229 230 #if (KMP_DEVELOPER_STATS) 231 // Timers which are of interest to runtime library developers, not end users. 232 // These have to be explicitly enabled in addition to the other stats. 233 234 // KMP_fork_barrier -- time in __kmp_fork_barrier 235 // KMP_join_barrier -- time in __kmp_join_barrier 236 // KMP_barrier -- time in __kmp_barrier 237 // KMP_end_split_barrier -- time in __kmp_end_split_barrier 238 // KMP_setup_icv_copy -- time in __kmp_setup_icv_copy 239 // KMP_icv_copy -- start/stop timer for any ICV copying 240 // KMP_linear_gather -- time in __kmp_linear_barrier_gather 241 // KMP_linear_release -- time in __kmp_linear_barrier_release 242 // KMP_tree_gather -- time in __kmp_tree_barrier_gather 243 // KMP_tree_release -- time in __kmp_tree_barrier_release 244 // KMP_hyper_gather -- time in __kmp_hyper_barrier_gather 245 // KMP_hyper_release -- time in __kmp_hyper_barrier_release 246 // clang-format off 247 #define KMP_FOREACH_DEVELOPER_TIMER(macro, arg) \ 248 macro(KMP_fork_call, 0, arg) \ 249 macro(KMP_join_call, 0, arg) \ 250 macro(KMP_end_split_barrier, 0, arg) \ 251 macro(KMP_hier_gather, 0, arg) \ 252 macro(KMP_hier_release, 0, arg) \ 253 macro(KMP_hyper_gather, 0, arg) \ 254 macro(KMP_hyper_release, 0, arg) \ 255 macro(KMP_linear_gather, 0, arg) \ 256 macro(KMP_linear_release, 0, arg) \ 257 macro(KMP_tree_gather, 0, arg) \ 258 macro(KMP_tree_release, 0, arg) \ 259 macro(USER_resume, 0, arg) \ 260 macro(USER_suspend, 0, arg) \ 261 macro(USER_mwait, 0, arg) \ 262 macro(KMP_allocate_team, 0, arg) \ 263 macro(KMP_setup_icv_copy, 0, arg) \ 264 macro(USER_icv_copy, 0, arg) \ 265 macro (FOR_static_steal_stolen, \ 266 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) \ 267 macro (FOR_static_steal_chunks, \ 268 stats_flags_e::noUnits | stats_flags_e::noTotal, arg) 269 #else 270 #define KMP_FOREACH_DEVELOPER_TIMER(macro, arg) 271 #endif 272 // clang-format on 273 274 /*! 275 * \brief Add new explicit timers under KMP_FOREACH_EXPLICIT_TIMER() macro. 276 * 277 * @param macro a user defined macro that takes three arguments - 278 * macro(TIMER_NAME, flags, arg) 279 * @param arg a user defined argument to send to the user defined macro 280 * 281 * \warning YOU MUST HAVE THE SAME NAMED TIMER UNDER KMP_FOREACH_TIMER() OR ELSE 282 * BAD THINGS WILL HAPPEN! 283 * 284 * \details Explicit timers are ones where we need to allocate a timer itself 285 * (as well as the accumulated timing statistics). We allocate these on a 286 * per-thread basis, and explicitly start and stop them. Block timers just 287 * allocate the timer itself on the stack, and use the destructor to notice 288 * block exit; they don't need to be defined here. The name here should be the 289 * same as that of a timer above. 290 * 291 * @ingroup STATS_GATHERING 292 */ 293 #define KMP_FOREACH_EXPLICIT_TIMER(macro, arg) KMP_FOREACH_TIMER(macro, arg) 294 295 #define ENUMERATE(name, ignore, prefix) prefix##name, 296 enum timer_e { KMP_FOREACH_TIMER(ENUMERATE, TIMER_) TIMER_LAST }; 297 298 enum explicit_timer_e { 299 KMP_FOREACH_EXPLICIT_TIMER(ENUMERATE, EXPLICIT_TIMER_) EXPLICIT_TIMER_LAST 300 }; 301 302 enum counter_e { KMP_FOREACH_COUNTER(ENUMERATE, COUNTER_) COUNTER_LAST }; 303 #undef ENUMERATE 304 305 /* 306 * A logarithmic histogram. It accumulates the number of values in each power of 307 * ten bin. So 1<=x<10, 10<=x<100, ... 308 * Mostly useful where we have some big outliers and want to see information 309 * about them. 310 */ 311 class logHistogram { 312 enum { 313 numBins = 31, /* Number of powers of 10. If this changes you need to change 314 * the initializer for binMax */ 315 316 /* 317 * If you want to use this to analyse values that may be less than 1, (for 318 * instance times in s), then the logOffset gives you negative powers. 319 * In our case here, we're just looking at times in ticks, or counts, so we 320 * can never see values with magnitude < 1 (other than zero), so we can set 321 * it to 0. As above change the initializer if you change this. 322 */ 323 logOffset = 0 324 }; 325 uint32_t KMP_ALIGN_CACHE zeroCount; 326 struct { 327 uint32_t count; 328 double total; 329 } bins[numBins]; 330 331 static double binMax[numBins]; 332 333 #ifdef KMP_DEBUG 334 uint64_t _total; 335 check()336 void check() const { 337 uint64_t t = zeroCount; 338 for (int i = 0; i < numBins; i++) 339 t += bins[i].count; 340 KMP_DEBUG_ASSERT(t == _total); 341 } 342 #else check()343 void check() const {} 344 #endif 345 346 public: logHistogram()347 logHistogram() { reset(); } 348 logHistogram(logHistogram const & o)349 logHistogram(logHistogram const &o) { 350 for (int i = 0; i < numBins; i++) 351 bins[i] = o.bins[i]; 352 #ifdef KMP_DEBUG 353 _total = o._total; 354 #endif 355 } 356 reset()357 void reset() { 358 zeroCount = 0; 359 for (int i = 0; i < numBins; i++) { 360 bins[i].count = 0; 361 bins[i].total = 0; 362 } 363 364 #ifdef KMP_DEBUG 365 _total = 0; 366 #endif 367 } count(int b)368 uint32_t count(int b) const { return bins[b + logOffset].count; } total(int b)369 double total(int b) const { return bins[b + logOffset].total; } 370 static uint32_t findBin(double sample); 371 372 logHistogram &operator+=(logHistogram const &o) { 373 zeroCount += o.zeroCount; 374 for (int i = 0; i < numBins; i++) { 375 bins[i].count += o.bins[i].count; 376 bins[i].total += o.bins[i].total; 377 } 378 #ifdef KMP_DEBUG 379 _total += o._total; 380 check(); 381 #endif 382 383 return *this; 384 } 385 386 void addSample(double sample); 387 int minBin() const; 388 int maxBin() const; 389 390 std::string format(char) const; 391 }; 392 393 class statistic { 394 double KMP_ALIGN_CACHE minVal; 395 double maxVal; 396 double meanVal; 397 double m2; 398 uint64_t sampleCount; 399 double offset; 400 bool collectingHist; 401 logHistogram hist; 402 403 public: 404 statistic(bool doHist = bool(KMP_STATS_HIST)) { 405 reset(); 406 collectingHist = doHist; 407 } statistic(statistic const & o)408 statistic(statistic const &o) 409 : minVal(o.minVal), maxVal(o.maxVal), meanVal(o.meanVal), m2(o.m2), 410 sampleCount(o.sampleCount), offset(o.offset), 411 collectingHist(o.collectingHist), hist(o.hist) {} statistic(double minv,double maxv,double meanv,uint64_t sc,double sd)412 statistic(double minv, double maxv, double meanv, uint64_t sc, double sd) 413 : minVal(minv), maxVal(maxv), meanVal(meanv), m2(sd * sd * sc), 414 sampleCount(sc), offset(0.0), collectingHist(false) {} haveHist()415 bool haveHist() const { return collectingHist; } getMin()416 double getMin() const { return minVal; } getMean()417 double getMean() const { return meanVal; } getMax()418 double getMax() const { return maxVal; } getCount()419 uint64_t getCount() const { return sampleCount; } getSD()420 double getSD() const { return sqrt(m2 / sampleCount); } getTotal()421 double getTotal() const { return sampleCount * meanVal; } getHist()422 logHistogram const *getHist() const { return &hist; } setOffset(double d)423 void setOffset(double d) { offset = d; } 424 reset()425 void reset() { 426 minVal = (std::numeric_limits<double>::max)(); 427 maxVal = -minVal; 428 meanVal = 0.0; 429 m2 = 0.0; 430 sampleCount = 0; 431 offset = 0.0; 432 hist.reset(); 433 } 434 void addSample(double sample); 435 void scale(double factor); scaleDown(double f)436 void scaleDown(double f) { scale(1. / f); } forceCount(uint64_t count)437 void forceCount(uint64_t count) { sampleCount = count; } 438 statistic &operator+=(statistic const &other); 439 440 std::string format(char unit, bool total = false) const; formatHist(char unit)441 std::string formatHist(char unit) const { return hist.format(unit); } 442 }; 443 444 struct statInfo { 445 const char *name; 446 uint32_t flags; 447 }; 448 449 class timeStat : public statistic { 450 static statInfo timerInfo[]; 451 452 public: timeStat()453 timeStat() : statistic() {} name(timer_e e)454 static const char *name(timer_e e) { return timerInfo[e].name; } noTotal(timer_e e)455 static bool noTotal(timer_e e) { 456 return timerInfo[e].flags & stats_flags_e::noTotal; 457 } masterOnly(timer_e e)458 static bool masterOnly(timer_e e) { 459 return timerInfo[e].flags & stats_flags_e::onlyInMaster; 460 } workerOnly(timer_e e)461 static bool workerOnly(timer_e e) { 462 return timerInfo[e].flags & stats_flags_e::notInMaster; 463 } noUnits(timer_e e)464 static bool noUnits(timer_e e) { 465 return timerInfo[e].flags & stats_flags_e::noUnits; 466 } logEvent(timer_e e)467 static bool logEvent(timer_e e) { 468 return timerInfo[e].flags & stats_flags_e::logEvent; 469 } clearEventFlags()470 static void clearEventFlags() { 471 for (int i = 0; i < TIMER_LAST; i++) { 472 timerInfo[i].flags &= (~(stats_flags_e::logEvent)); 473 } 474 } 475 }; 476 477 // Where we need explicitly to start and end the timer, this version can be used 478 // Since these timers normally aren't nicely scoped, so don't have a good place 479 // to live on the stack of the thread, they're more work to use. 480 class explicitTimer { 481 timeStat *stat; 482 timer_e timerEnumValue; 483 tsc_tick_count startTime; 484 tsc_tick_count pauseStartTime; 485 tsc_tick_count::tsc_interval_t totalPauseTime; 486 487 public: explicitTimer(timeStat * s,timer_e te)488 explicitTimer(timeStat *s, timer_e te) 489 : stat(s), timerEnumValue(te), startTime(), pauseStartTime(0), 490 totalPauseTime() {} 491 492 // void setStat(timeStat *s) { stat = s; } 493 void start(tsc_tick_count tick); pause(tsc_tick_count tick)494 void pause(tsc_tick_count tick) { pauseStartTime = tick; } resume(tsc_tick_count tick)495 void resume(tsc_tick_count tick) { 496 totalPauseTime += (tick - pauseStartTime); 497 } 498 void stop(tsc_tick_count tick, kmp_stats_list *stats_ptr = nullptr); reset()499 void reset() { 500 startTime = 0; 501 pauseStartTime = 0; 502 totalPauseTime = 0; 503 } get_type()504 timer_e get_type() const { return timerEnumValue; } 505 }; 506 507 // Where you need to partition a threads clock ticks into separate states 508 // e.g., a partitionedTimers class with two timers of EXECUTING_TASK, and 509 // DOING_NOTHING would render these conditions: 510 // time(EXECUTING_TASK) + time(DOING_NOTHING) = total time thread is alive 511 // No clock tick in the EXECUTING_TASK is a member of DOING_NOTHING and vice 512 // versa 513 class partitionedTimers { 514 private: 515 std::vector<explicitTimer> timer_stack; 516 517 public: 518 partitionedTimers(); 519 void init(explicitTimer timer); 520 void exchange(explicitTimer timer); 521 void push(explicitTimer timer); 522 void pop(); 523 void windup(); 524 }; 525 526 // Special wrapper around the partitioned timers to aid timing code blocks 527 // It avoids the need to have an explicit end, leaving the scope suffices. 528 class blockPartitionedTimer { 529 partitionedTimers *part_timers; 530 531 public: blockPartitionedTimer(partitionedTimers * pt,explicitTimer timer)532 blockPartitionedTimer(partitionedTimers *pt, explicitTimer timer) 533 : part_timers(pt) { 534 part_timers->push(timer); 535 } ~blockPartitionedTimer()536 ~blockPartitionedTimer() { part_timers->pop(); } 537 }; 538 539 // Special wrapper around the thread state to aid in keeping state in code 540 // blocks It avoids the need to have an explicit end, leaving the scope 541 // suffices. 542 class blockThreadState { 543 stats_state_e *state_pointer; 544 stats_state_e old_state; 545 546 public: blockThreadState(stats_state_e * thread_state_pointer,stats_state_e new_state)547 blockThreadState(stats_state_e *thread_state_pointer, stats_state_e new_state) 548 : state_pointer(thread_state_pointer), old_state(*thread_state_pointer) { 549 *state_pointer = new_state; 550 } ~blockThreadState()551 ~blockThreadState() { *state_pointer = old_state; } 552 }; 553 554 // If all you want is a count, then you can use this... 555 // The individual per-thread counts will be aggregated into a statistic at 556 // program exit. 557 class counter { 558 uint64_t value; 559 static const statInfo counterInfo[]; 560 561 public: counter()562 counter() : value(0) {} increment()563 void increment() { value++; } getValue()564 uint64_t getValue() const { return value; } reset()565 void reset() { value = 0; } name(counter_e e)566 static const char *name(counter_e e) { return counterInfo[e].name; } masterOnly(counter_e e)567 static bool masterOnly(counter_e e) { 568 return counterInfo[e].flags & stats_flags_e::onlyInMaster; 569 } 570 }; 571 572 /* **************************************************************** 573 Class to implement an event 574 575 There are four components to an event: start time, stop time 576 nest_level, and timer_name. 577 The start and stop time should be obvious (recorded in clock ticks). 578 The nest_level relates to the bar width in the timeline graph. 579 The timer_name is used to determine which timer event triggered this event. 580 581 the interface to this class is through four read-only operations: 582 1) getStart() -- returns the start time as 64 bit integer 583 2) getStop() -- returns the stop time as 64 bit integer 584 3) getNestLevel() -- returns the nest level of the event 585 4) getTimerName() -- returns the timer name that triggered event 586 587 *MORE ON NEST_LEVEL* 588 The nest level is used in the bar graph that represents the timeline. 589 Its main purpose is for showing how events are nested inside eachother. 590 For example, say events, A, B, and C are recorded. If the timeline 591 looks like this: 592 593 Begin -------------------------------------------------------------> Time 594 | | | | | | 595 A B C C B A 596 start start start end end end 597 598 Then A, B, C will have a nest level of 1, 2, 3 respectively. 599 These values are then used to calculate the barwidth so you can 600 see that inside A, B has occurred, and inside B, C has occurred. 601 Currently, this is shown with A's bar width being larger than B's 602 bar width, and B's bar width being larger than C's bar width. 603 604 **************************************************************** */ 605 class kmp_stats_event { 606 uint64_t start; 607 uint64_t stop; 608 int nest_level; 609 timer_e timer_name; 610 611 public: kmp_stats_event()612 kmp_stats_event() 613 : start(0), stop(0), nest_level(0), timer_name(TIMER_LAST) {} kmp_stats_event(uint64_t strt,uint64_t stp,int nst,timer_e nme)614 kmp_stats_event(uint64_t strt, uint64_t stp, int nst, timer_e nme) 615 : start(strt), stop(stp), nest_level(nst), timer_name(nme) {} getStart()616 inline uint64_t getStart() const { return start; } getStop()617 inline uint64_t getStop() const { return stop; } getNestLevel()618 inline int getNestLevel() const { return nest_level; } getTimerName()619 inline timer_e getTimerName() const { return timer_name; } 620 }; 621 622 /* **************************************************************** 623 Class to implement a dynamically expandable array of events 624 625 --------------------------------------------------------- 626 | event 1 | event 2 | event 3 | event 4 | ... | event N | 627 --------------------------------------------------------- 628 629 An event is pushed onto the back of this array at every 630 explicitTimer->stop() call. The event records the thread #, 631 start time, stop time, and nest level related to the bar width. 632 633 The event vector starts at size INIT_SIZE and grows (doubles in size) 634 if needed. An implication of this behavior is that log(N) 635 reallocations are needed (where N is number of events). If you want 636 to avoid reallocations, then set INIT_SIZE to a large value. 637 638 the interface to this class is through six operations: 639 1) reset() -- sets the internal_size back to 0 but does not deallocate any 640 memory 641 2) size() -- returns the number of valid elements in the vector 642 3) push_back(start, stop, nest, timer_name) -- pushes an event onto 643 the back of the array 644 4) deallocate() -- frees all memory associated with the vector 645 5) sort() -- sorts the vector by start time 646 6) operator[index] or at(index) -- returns event reference at that index 647 **************************************************************** */ 648 class kmp_stats_event_vector { 649 kmp_stats_event *events; 650 int internal_size; 651 int allocated_size; 652 static const int INIT_SIZE = 1024; 653 654 public: kmp_stats_event_vector()655 kmp_stats_event_vector() { 656 events = 657 (kmp_stats_event *)__kmp_allocate(sizeof(kmp_stats_event) * INIT_SIZE); 658 internal_size = 0; 659 allocated_size = INIT_SIZE; 660 } ~kmp_stats_event_vector()661 ~kmp_stats_event_vector() {} reset()662 inline void reset() { internal_size = 0; } size()663 inline int size() const { return internal_size; } push_back(uint64_t start_time,uint64_t stop_time,int nest_level,timer_e name)664 void push_back(uint64_t start_time, uint64_t stop_time, int nest_level, 665 timer_e name) { 666 int i; 667 if (internal_size == allocated_size) { 668 kmp_stats_event *tmp = (kmp_stats_event *)__kmp_allocate( 669 sizeof(kmp_stats_event) * allocated_size * 2); 670 for (i = 0; i < internal_size; i++) 671 tmp[i] = events[i]; 672 __kmp_free(events); 673 events = tmp; 674 allocated_size *= 2; 675 } 676 events[internal_size] = 677 kmp_stats_event(start_time, stop_time, nest_level, name); 678 internal_size++; 679 return; 680 } 681 void deallocate(); 682 void sort(); 683 const kmp_stats_event &operator[](int index) const { return events[index]; } 684 kmp_stats_event &operator[](int index) { return events[index]; } at(int index)685 const kmp_stats_event &at(int index) const { return events[index]; } at(int index)686 kmp_stats_event &at(int index) { return events[index]; } 687 }; 688 689 /* **************************************************************** 690 Class to implement a doubly-linked, circular, statistics list 691 692 |---| ---> |---| ---> |---| ---> |---| ---> ... next 693 | | | | | | | | 694 |---| <--- |---| <--- |---| <--- |---| <--- ... prev 695 Sentinel first second third 696 Node node node node 697 698 The Sentinel Node is the user handle on the list. 699 The first node corresponds to thread 0's statistics. 700 The second node corresponds to thread 1's statistics and so on... 701 702 Each node has a _timers, _counters, and _explicitTimers array to hold that 703 thread's statistics. The _explicitTimers point to the correct _timer and 704 update its statistics at every stop() call. The explicitTimers' pointers are 705 set up in the constructor. Each node also has an event vector to hold that 706 thread's timing events. The event vector expands as necessary and records 707 the start-stop times for each timer. 708 709 The nestLevel variable is for plotting events and is related 710 to the bar width in the timeline graph. 711 712 Every thread will have a thread local pointer to its node in 713 the list. The sentinel node is used by the master thread to 714 store "dummy" statistics before __kmp_create_worker() is called. 715 **************************************************************** */ 716 class kmp_stats_list { 717 int gtid; 718 timeStat _timers[TIMER_LAST + 1]; 719 counter _counters[COUNTER_LAST + 1]; 720 explicitTimer thread_life_timer; 721 partitionedTimers _partitionedTimers; 722 int _nestLevel; // one per thread 723 kmp_stats_event_vector _event_vector; 724 kmp_stats_list *next; 725 kmp_stats_list *prev; 726 stats_state_e state; 727 int thread_is_idle_flag; 728 729 public: kmp_stats_list()730 kmp_stats_list() 731 : thread_life_timer(&_timers[TIMER_OMP_worker_thread_life], 732 TIMER_OMP_worker_thread_life), 733 _nestLevel(0), _event_vector(), next(this), prev(this), state(IDLE), 734 thread_is_idle_flag(0) {} ~kmp_stats_list()735 ~kmp_stats_list() {} getTimer(timer_e idx)736 inline timeStat *getTimer(timer_e idx) { return &_timers[idx]; } getCounter(counter_e idx)737 inline counter *getCounter(counter_e idx) { return &_counters[idx]; } getPartitionedTimers()738 inline partitionedTimers *getPartitionedTimers() { 739 return &_partitionedTimers; 740 } getTimers()741 inline timeStat *getTimers() { return _timers; } getCounters()742 inline counter *getCounters() { return _counters; } getEventVector()743 inline kmp_stats_event_vector &getEventVector() { return _event_vector; } startLife()744 inline void startLife() { thread_life_timer.start(tsc_tick_count::now()); } endLife()745 inline void endLife() { thread_life_timer.stop(tsc_tick_count::now(), this); } resetEventVector()746 inline void resetEventVector() { _event_vector.reset(); } incrementNestValue()747 inline void incrementNestValue() { _nestLevel++; } getNestValue()748 inline int getNestValue() { return _nestLevel; } decrementNestValue()749 inline void decrementNestValue() { _nestLevel--; } getGtid()750 inline int getGtid() const { return gtid; } setGtid(int newgtid)751 inline void setGtid(int newgtid) { gtid = newgtid; } setState(stats_state_e newstate)752 inline void setState(stats_state_e newstate) { state = newstate; } getState()753 inline stats_state_e getState() const { return state; } getStatePointer()754 inline stats_state_e *getStatePointer() { return &state; } isIdle()755 inline bool isIdle() { return thread_is_idle_flag == 1; } setIdleFlag()756 inline void setIdleFlag() { thread_is_idle_flag = 1; } resetIdleFlag()757 inline void resetIdleFlag() { thread_is_idle_flag = 0; } 758 kmp_stats_list *push_back(int gtid); // returns newly created list node push_event(uint64_t start_time,uint64_t stop_time,int nest_level,timer_e name)759 inline void push_event(uint64_t start_time, uint64_t stop_time, 760 int nest_level, timer_e name) { 761 _event_vector.push_back(start_time, stop_time, nest_level, name); 762 } 763 void deallocate(); 764 class iterator; 765 kmp_stats_list::iterator begin(); 766 kmp_stats_list::iterator end(); 767 int size(); 768 class iterator { 769 kmp_stats_list *ptr; 770 friend kmp_stats_list::iterator kmp_stats_list::begin(); 771 friend kmp_stats_list::iterator kmp_stats_list::end(); 772 773 public: 774 iterator(); 775 ~iterator(); 776 iterator operator++(); 777 iterator operator++(int dummy); 778 iterator operator--(); 779 iterator operator--(int dummy); 780 bool operator!=(const iterator &rhs); 781 bool operator==(const iterator &rhs); 782 kmp_stats_list *operator*() const; // dereference operator 783 }; 784 }; 785 786 /* **************************************************************** 787 Class to encapsulate all output functions and the environment variables 788 789 This module holds filenames for various outputs (normal stats, events, plot 790 file), as well as coloring information for the plot file. 791 792 The filenames and flags variables are read from environment variables. 793 These are read once by the constructor of the global variable 794 __kmp_stats_output which calls init(). 795 796 During this init() call, event flags for the timeStat::timerInfo[] global 797 array are cleared if KMP_STATS_EVENTS is not true (on, 1, yes). 798 799 The only interface function that is public is outputStats(heading). This 800 function should print out everything it needs to, either to files or stderr, 801 depending on the environment variables described below 802 803 ENVIRONMENT VARIABLES: 804 KMP_STATS_FILE -- if set, all statistics (not events) will be printed to this 805 file, otherwise, print to stderr 806 KMP_STATS_THREADS -- if set to "on", then will print per thread statistics to 807 either KMP_STATS_FILE or stderr 808 KMP_STATS_PLOT_FILE -- if set, print the ploticus plot file to this filename, 809 otherwise, the plot file is sent to "events.plt" 810 KMP_STATS_EVENTS -- if set to "on", then log events, otherwise, don't log 811 events 812 KMP_STATS_EVENTS_FILE -- if set, all events are outputted to this file, 813 otherwise, output is sent to "events.dat" 814 **************************************************************** */ 815 class kmp_stats_output_module { 816 817 public: 818 struct rgb_color { 819 float r; 820 float g; 821 float b; 822 }; 823 824 private: 825 std::string outputFileName; 826 static const char *eventsFileName; 827 static const char *plotFileName; 828 static int printPerThreadFlag; 829 static int printPerThreadEventsFlag; 830 static const rgb_color globalColorArray[]; 831 static rgb_color timerColorInfo[]; 832 833 void init(); 834 static void setupEventColors(); 835 static void printPloticusFile(); 836 static void printHeaderInfo(FILE *statsOut); 837 static void printTimerStats(FILE *statsOut, statistic const *theStats, 838 statistic const *totalStats); 839 static void printCounterStats(FILE *statsOut, statistic const *theStats); 840 static void printCounters(FILE *statsOut, counter const *theCounters); 841 static void printEvents(FILE *eventsOut, kmp_stats_event_vector *theEvents, 842 int gtid); getEventColor(timer_e e)843 static rgb_color getEventColor(timer_e e) { return timerColorInfo[e]; } 844 static void windupExplicitTimers(); eventPrintingEnabled()845 bool eventPrintingEnabled() const { return printPerThreadEventsFlag; } 846 847 public: kmp_stats_output_module()848 kmp_stats_output_module() { init(); } 849 void outputStats(const char *heading); 850 }; 851 852 #ifdef __cplusplus 853 extern "C" { 854 #endif 855 void __kmp_stats_init(); 856 void __kmp_stats_fini(); 857 void __kmp_reset_stats(); 858 void __kmp_output_stats(const char *); 859 void __kmp_accumulate_stats_at_exit(void); 860 // thread local pointer to stats node within list 861 extern KMP_THREAD_LOCAL kmp_stats_list *__kmp_stats_thread_ptr; 862 // head to stats list. 863 extern kmp_stats_list *__kmp_stats_list; 864 // lock for __kmp_stats_list 865 extern kmp_tas_lock_t __kmp_stats_lock; 866 // reference start time 867 extern tsc_tick_count __kmp_stats_start_time; 868 // interface to output 869 extern kmp_stats_output_module __kmp_stats_output; 870 871 #ifdef __cplusplus 872 } 873 #endif 874 875 // Simple, standard interfaces that drop out completely if stats aren't enabled 876 877 /*! 878 * \brief Adds value to specified timer (name). 879 * 880 * @param name timer name as specified under the KMP_FOREACH_TIMER() macro 881 * @param value double precision sample value to add to statistics for the timer 882 * 883 * \details Use KMP_COUNT_VALUE(name, value) macro to add a particular value to 884 * a timer statistics. 885 * 886 * @ingroup STATS_GATHERING 887 */ 888 #define KMP_COUNT_VALUE(name, value) \ 889 __kmp_stats_thread_ptr->getTimer(TIMER_##name)->addSample(value) 890 891 /*! 892 * \brief Increments specified counter (name). 893 * 894 * @param name counter name as specified under the KMP_FOREACH_COUNTER() macro 895 * 896 * \details Use KMP_COUNT_BLOCK(name, value) macro to increment a statistics 897 * counter for the executing thread. 898 * 899 * @ingroup STATS_GATHERING 900 */ 901 #define KMP_COUNT_BLOCK(name) \ 902 __kmp_stats_thread_ptr->getCounter(COUNTER_##name)->increment() 903 904 /*! 905 * \brief Outputs the current thread statistics and reset them. 906 * 907 * @param heading_string heading put above the final stats output 908 * 909 * \details Explicitly stops all timers and outputs all stats. Environment 910 * variable, `OMPTB_STATSFILE=filename`, can be used to output the stats to a 911 * filename instead of stderr. Environment variable, 912 * `OMPTB_STATSTHREADS=true|undefined`, can be used to output thread specific 913 * stats. For now the `OMPTB_STATSTHREADS` environment variable can either be 914 * defined with any value, which will print out thread specific stats, or it can 915 * be undefined (not specified in the environment) and thread specific stats 916 * won't be printed. It should be noted that all statistics are reset when this 917 * macro is called. 918 * 919 * @ingroup STATS_GATHERING 920 */ 921 #define KMP_OUTPUT_STATS(heading_string) __kmp_output_stats(heading_string) 922 923 /*! 924 * \brief Initializes the partitioned timers to begin with name. 925 * 926 * @param name timer which you want this thread to begin with 927 * 928 * @ingroup STATS_GATHERING 929 */ 930 #define KMP_INIT_PARTITIONED_TIMERS(name) \ 931 __kmp_stats_thread_ptr->getPartitionedTimers()->init(explicitTimer( \ 932 __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name)) 933 934 #define KMP_TIME_PARTITIONED_BLOCK(name) \ 935 blockPartitionedTimer __PBLOCKTIME__( \ 936 __kmp_stats_thread_ptr->getPartitionedTimers(), \ 937 explicitTimer(__kmp_stats_thread_ptr->getTimer(TIMER_##name), \ 938 TIMER_##name)) 939 940 #define KMP_PUSH_PARTITIONED_TIMER(name) \ 941 __kmp_stats_thread_ptr->getPartitionedTimers()->push(explicitTimer( \ 942 __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name)) 943 944 #define KMP_POP_PARTITIONED_TIMER() \ 945 __kmp_stats_thread_ptr->getPartitionedTimers()->pop() 946 947 #define KMP_EXCHANGE_PARTITIONED_TIMER(name) \ 948 __kmp_stats_thread_ptr->getPartitionedTimers()->exchange(explicitTimer( \ 949 __kmp_stats_thread_ptr->getTimer(TIMER_##name), TIMER_##name)) 950 951 #define KMP_SET_THREAD_STATE(state_name) \ 952 __kmp_stats_thread_ptr->setState(state_name) 953 954 #define KMP_GET_THREAD_STATE() __kmp_stats_thread_ptr->getState() 955 956 #define KMP_SET_THREAD_STATE_BLOCK(state_name) \ 957 blockThreadState __BTHREADSTATE__(__kmp_stats_thread_ptr->getStatePointer(), \ 958 state_name) 959 960 /*! 961 * \brief resets all stats (counters to 0, timers to 0 elapsed ticks) 962 * 963 * \details Reset all stats for all threads. 964 * 965 * @ingroup STATS_GATHERING 966 */ 967 #define KMP_RESET_STATS() __kmp_reset_stats() 968 969 #if (KMP_DEVELOPER_STATS) 970 #define KMP_COUNT_DEVELOPER_VALUE(n, v) KMP_COUNT_VALUE(n, v) 971 #define KMP_COUNT_DEVELOPER_BLOCK(n) KMP_COUNT_BLOCK(n) 972 #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) KMP_TIME_PARTITIONED_BLOCK(n) 973 #define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) KMP_PUSH_PARTITIONED_TIMER(n) 974 #define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) KMP_POP_PARTITIONED_TIMER(n) 975 #define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) \ 976 KMP_EXCHANGE_PARTITIONED_TIMER(n) 977 #else 978 // Null definitions 979 #define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0) 980 #define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0) 981 #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0) 982 #define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0) 983 #define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0) 984 #define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0) 985 #endif 986 987 #else // KMP_STATS_ENABLED 988 989 // Null definitions 990 #define KMP_COUNT_VALUE(n, v) ((void)0) 991 #define KMP_COUNT_BLOCK(n) ((void)0) 992 993 #define KMP_OUTPUT_STATS(heading_string) ((void)0) 994 #define KMP_RESET_STATS() ((void)0) 995 996 #define KMP_COUNT_DEVELOPER_VALUE(n, v) ((void)0) 997 #define KMP_COUNT_DEVELOPER_BLOCK(n) ((void)0) 998 #define KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(n) ((void)0) 999 #define KMP_PUSH_DEVELOPER_PARTITIONED_TIMER(n) ((void)0) 1000 #define KMP_POP_DEVELOPER_PARTITIONED_TIMER(n) ((void)0) 1001 #define KMP_EXCHANGE_DEVELOPER_PARTITIONED_TIMER(n) ((void)0) 1002 #define KMP_INIT_PARTITIONED_TIMERS(name) ((void)0) 1003 #define KMP_TIME_PARTITIONED_BLOCK(name) ((void)0) 1004 #define KMP_PUSH_PARTITIONED_TIMER(name) ((void)0) 1005 #define KMP_POP_PARTITIONED_TIMER() ((void)0) 1006 #define KMP_SET_THREAD_STATE(state_name) ((void)0) 1007 #define KMP_GET_THREAD_STATE() ((void)0) 1008 #define KMP_SET_THREAD_STATE_BLOCK(state_name) ((void)0) 1009 #endif // KMP_STATS_ENABLED 1010 1011 #endif // KMP_STATS_H 1012