1 /* Copyright 2021 Google LLC. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #ifndef RUY_RUY_TRACE_H_
17 #define RUY_RUY_TRACE_H_
18
19 #ifdef RUY_TRACE
20
21 #include <algorithm>
22 #include <cstdio>
23 #include <cstdlib>
24 #include <memory>
25 #include <mutex>
26 #include <string>
27 #include <thread>
28 #include <vector>
29
30 #include "ruy/mat.h"
31 #include "ruy/matrix.h"
32 #include "ruy/path.h"
33 #include "ruy/platform.h"
34 #include "ruy/side_pair.h"
35
36 namespace ruy {
37
38 // Helper for `formatted` so we don't have to put .c_str() on strings.
39 template <typename T>
value_for_snprintf(T value)40 T value_for_snprintf(T value) {
41 return value;
42 }
43
value_for_snprintf(const std::string & s)44 inline const char* value_for_snprintf(const std::string& s) {
45 return s.c_str();
46 }
47
48 // A sprintf-like function returning a std::string.
49 // Remove this once we can rely on std::format (c++20).
50 template <typename... Args>
formatted(const char * format,Args...args)51 std::string formatted(const char* format, Args... args) {
52 char buf[1024];
53 #pragma GCC diagnostic push
54 #pragma GCC diagnostic warning "-Wformat-security"
55 int size = snprintf(buf, sizeof buf, format, value_for_snprintf(args)...);
56 #pragma GCC diagnostic pop
57 if (size <= 0) {
58 abort();
59 }
60 return std::string(buf);
61 }
62
63 // An entry in the trace.
64 struct ThreadTraceEntry final {
65 std::string text;
66 int indent = 0;
67 const char* source_file = nullptr;
68 int source_line = 0;
69 };
70
71 // Trace for one thread.
72 class ThreadTrace final {
73 public:
~ThreadTrace()74 ~ThreadTrace() {}
75
set_thread_id(int thread_id)76 void set_thread_id(int thread_id) { thread_id_ = thread_id; }
thread_id()77 int thread_id() const { return thread_id_; }
78
is_in_run_ahead_packing_loop()79 bool is_in_run_ahead_packing_loop() const {
80 return is_in_run_ahead_packing_loop_;
81 }
set_is_in_run_ahead_packing_loop(bool value)82 void set_is_in_run_ahead_packing_loop(bool value) {
83 is_in_run_ahead_packing_loop_ = value;
84 }
85
set_current_source_file(const char * source_file)86 void set_current_source_file(const char* source_file) {
87 current_source_file_ = source_file;
88 }
89
set_current_source_line(int source_line)90 void set_current_source_line(int source_line) {
91 current_source_line_ = source_line;
92 }
93
entries()94 const std::vector<ThreadTraceEntry>& entries() const { return entries_; }
95
96 template <typename... Args>
Write(const char * format,Args...args)97 void Write(const char* format, Args... args) {
98 ThreadTraceEntry entry;
99 entry.text = formatted(format, args...);
100 entry.indent = indent_;
101 entry.source_file = current_source_file_;
102 entry.source_line = current_source_line_;
103 entries_.emplace_back(std::move(entry));
104 }
105
106 template <typename... Args>
EnterScope(const char * scope_name)107 void EnterScope(const char* scope_name) {
108 Write("%s {", scope_name);
109 indent_++;
110 }
LeaveScope(const char * scope_name)111 void LeaveScope(const char* scope_name) {
112 indent_--;
113 Write("} // end of %s", scope_name);
114 }
115
116 private:
117 // The trace contents
118 std::vector<ThreadTraceEntry> entries_;
119
120 // Current indentation level.
121 int indent_ = 0;
122 // Thread's ID as set by Ruy, e.g. [0,N-1]. Not OS TID.
123 int thread_id_ = -1;
124 // The run-ahead loop in `EnsurePacked` may run many iterations when the
125 // thread is waiting for a block to be packed by another thread --- it's
126 // a busy wait. We track whether we are in that mode to avoid generating
127 // many uninteresting trace entries.
128 bool is_in_run_ahead_packing_loop_ = false;
129 // Last recorded value of __FILE__ and __LINE__, as a convenience so we don't
130 // have to pass these in every call to `Write`.
131 const char* current_source_file_ = nullptr;
132 int current_source_line_ = 0;
133 };
134
135 // Main components of ruy. Used for trace colorization.
136 enum class Component { kNone, kFrontEnd, kMiddleEnd, kBackEnd, kThreadPool };
137
138 // Output format for the trace.
139 enum class TraceOutputFormat { kNone, kTerminal, kHtml };
140
IndentString(int indent)141 inline std::string IndentString(int indent) {
142 std::string s;
143 for (int i = 0; i < indent; i++) {
144 s += " ";
145 }
146 return s;
147 }
148
149 // Returns the text to write to the trace to open a colored section.
ColorSectionStart(TraceOutputFormat output_format,Component component)150 inline const char* ColorSectionStart(TraceOutputFormat output_format,
151 Component component) {
152 switch (output_format) {
153 case TraceOutputFormat::kTerminal:
154 switch (component) {
155 case Component::kFrontEnd:
156 return "\x1b[36m";
157 case Component::kMiddleEnd:
158 return "\x1b[32m";
159 case Component::kBackEnd:
160 return "\x1b[31m";
161 case Component::kThreadPool:
162 return "\x1b[33m";
163 default:
164 abort();
165 return nullptr;
166 }
167 case TraceOutputFormat::kHtml:
168 switch (component) {
169 case Component::kFrontEnd:
170 return "<span style=\"background-color:#B2EBF2\">";
171 case Component::kMiddleEnd:
172 return "<span style=\"background-color:#C8E6C9\">";
173 case Component::kBackEnd:
174 return "<span style=\"background-color:#FFCDD2\">";
175 case Component::kThreadPool:
176 return "<span style=\"background-color:#FFF9C4\">";
177 default:
178 abort();
179 return nullptr;
180 }
181 default:
182 abort();
183 return nullptr;
184 }
185 }
186
187 // Returns the text to write to the trace to close a colored section.
ColorSectionEnd(TraceOutputFormat output_format)188 inline const char* ColorSectionEnd(TraceOutputFormat output_format) {
189 switch (output_format) {
190 case TraceOutputFormat::kTerminal:
191 return "\x1b[0m";
192 case TraceOutputFormat::kHtml:
193 return "</span>";
194 default:
195 abort();
196 return nullptr;
197 }
198 }
199
200 // Returns the output format to use for the trace.
GetOutputFormat()201 inline TraceOutputFormat GetOutputFormat() {
202 const char* html_env = getenv("RUY_TRACE_HTML");
203 if (html_env && strtol(html_env, nullptr, 10) != 0) {
204 return TraceOutputFormat::kHtml;
205 } else {
206 return TraceOutputFormat::kTerminal;
207 }
208 }
209
210 // A `basename` function that's good enough for ruy __FILE__'s.
211 // Note: `basename` is POSIX-only and annoying (takes a char*, may mutate).
GetBaseName(const char * path)212 inline const char* GetBaseName(const char* path) {
213 std::size_t len = strlen(path);
214 if (len == 0) {
215 return path;
216 }
217 const char* ptr = path + len - 1;
218 while (ptr != path) {
219 if (*ptr == '/' || *ptr == '\\') {
220 return ptr + 1;
221 }
222 --ptr;
223 }
224 // Path did not contain any path separator.
225 return path;
226 }
227
228 // Determines a Component (used for colorization) by source file.
GetComponentBySourceFile(const char * base_name)229 inline Component GetComponentBySourceFile(const char* base_name) {
230 if (!strcmp(base_name, "pack.h") || !strcmp(base_name, "kernel.h")) {
231 return Component::kBackEnd;
232 } else if (!strcmp(base_name, "trmul.cc") ||
233 !strcmp(base_name, "block_map.cc")) {
234 return Component::kMiddleEnd;
235 } else if (!strcmp(base_name, "thread_pool.cc")) {
236 return Component::kThreadPool;
237 } else {
238 return Component::kFrontEnd;
239 }
240 }
241
EscapeText(TraceOutputFormat output_format,const std::string & text)242 inline std::string EscapeText(TraceOutputFormat output_format,
243 const std::string& text) {
244 if (output_format == TraceOutputFormat::kHtml) {
245 std::string escaped_text;
246 for (char c : text) {
247 if (c == '<') {
248 escaped_text += "<";
249 } else if (c == '>') {
250 escaped_text += ">";
251 } else {
252 escaped_text += c;
253 }
254 }
255 return escaped_text;
256 } else {
257 return text;
258 }
259 }
260
261 // Prints an entry from the trace to the destination trace file.
Print(const ThreadTraceEntry & entry,TraceOutputFormat output_format,FILE * file)262 inline void Print(const ThreadTraceEntry& entry,
263 TraceOutputFormat output_format, FILE* file) {
264 const char* base_name = GetBaseName(entry.source_file);
265 Component component = GetComponentBySourceFile(base_name);
266 const std::string& source_location =
267 formatted("%s:%d", base_name, entry.source_line);
268 const std::string& escaped_text = EscapeText(output_format, entry.text);
269 fprintf(file, "%s%-32s%s%s%s\n", ColorSectionStart(output_format, component),
270 source_location.c_str(), IndentString(entry.indent).c_str(),
271 escaped_text.c_str(), ColorSectionEnd(output_format));
272 }
273
274 // Prints a thread's entire trace to the destination trace file.
Print(const ThreadTrace & trace,TraceOutputFormat output_format,FILE * file)275 inline void Print(const ThreadTrace& trace, TraceOutputFormat output_format,
276 FILE* file) {
277 if (output_format == TraceOutputFormat::kHtml) {
278 fprintf(file, "<html><body><pre>\n<span style=\"font-weight:bold\">\n");
279 }
280 fprintf(file, "Ruy trace for thread %d:\n", trace.thread_id());
281 if (output_format == TraceOutputFormat::kHtml) {
282 fprintf(file, "</span>\n");
283 }
284 for (const ThreadTraceEntry& entry : trace.entries()) {
285 Print(entry, output_format, file);
286 }
287 fprintf(file, "\n");
288 if (output_format == TraceOutputFormat::kHtml) {
289 fprintf(file, "</pre></body></html>\n");
290 }
291 }
292
293 // Holds all the threads' traces. This is a global singleton class.
294 // On exit, when the singleton is destroyed, the destructor prints out the
295 // traces.
296 class AllThreadTraces final {
297 public:
298 // Add a new ThreadTrace for the current thread. Should be called only once
299 // on each thread.
AddCurrentThread()300 ThreadTrace* AddCurrentThread() {
301 std::lock_guard<std::mutex> lock(mutex_);
302 ThreadTrace* thread_trace = new ThreadTrace;
303 thread_traces_.emplace_back(thread_trace);
304 return thread_trace;
305 }
~AllThreadTraces()306 ~AllThreadTraces() {
307 std::lock_guard<std::mutex> lock(mutex_);
308 // Open the destination file.
309 const char* file_env = getenv("RUY_TRACE_FILE");
310 FILE* file = stdout;
311 if (file_env) {
312 file = fopen(file_env, "w");
313 if (!file) {
314 fprintf(stderr, "Failed to open %s for write\n", file_env);
315 exit(1);
316 }
317 }
318 // Sort the threads by Ruy Thread ID (not OS TID).
319 auto output_format = GetOutputFormat();
320 std::sort(std::begin(thread_traces_), std::end(thread_traces_),
321 [](const auto& a, const auto& b) {
322 return a->thread_id() < b->thread_id();
323 });
324 // Print all the threads' traces.
325 for (const auto& trace : thread_traces_) {
326 Print(*trace, output_format, file);
327 }
328 if (file_env) {
329 fclose(file);
330 }
331 }
Singleton()332 static AllThreadTraces* Singleton() {
333 static AllThreadTraces all_thread_traces;
334 return &all_thread_traces;
335 }
336
337 private:
338 std::vector<std::unique_ptr<ThreadTrace>> thread_traces_;
339 std::mutex mutex_;
340 };
341
342 // Returns the thread-local ThreadTrace singleton, constructing it as needed.
ThreadLocalTrace()343 inline ThreadTrace* ThreadLocalTrace() {
344 static thread_local ThreadTrace* thread_local_trace =
345 AllThreadTraces::Singleton()->AddCurrentThread();
346 return thread_local_trace;
347 }
348
349 // RAII helper to trace a scope, e.g. a function scope.
350 class RuyTraceScope {
351 const char* source_file_;
352 int source_line_;
353 const char* scope_name_;
354
355 public:
RuyTraceScope(const char * source_file,int source_line,const char * scope_name)356 RuyTraceScope(const char* source_file, int source_line,
357 const char* scope_name)
358 : source_file_(source_file),
359 source_line_(source_line),
360 scope_name_(scope_name) {
361 ThreadLocalTrace()->set_current_source_file(source_file_);
362 ThreadLocalTrace()->set_current_source_line(source_line_);
363 ThreadLocalTrace()->EnterScope(scope_name_);
364 }
~RuyTraceScope()365 ~RuyTraceScope() {
366 ThreadLocalTrace()->set_current_source_file(source_file_);
367 ThreadLocalTrace()->set_current_source_line(source_line_);
368 ThreadLocalTrace()->LeaveScope(scope_name_);
369 }
370 };
371
372 #define RUY_TRACE_SCOPE_NAME_IMPL(file, line, name) \
373 RuyTraceScope ruy_trace_scope##line(file, line, name)
374 #define RUY_TRACE_SCOPE_NAME(name) \
375 RUY_TRACE_SCOPE_NAME_IMPL(__FILE__, __LINE__, name)
376 #define RUY_TRACE_SCOPE \
377 RUY_TRACE_SCOPE_NAME_IMPL(__FILE__, __LINE__, __FUNCTION__)
378
379 // Helpers to trace Ruy objects.
380
str(Order o)381 inline std::string str(Order o) {
382 return o == Order::kRowMajor ? "row-major" : "column-major";
383 }
384
str(Side s)385 inline std::string str(Side s) { return s == Side::kLhs ? "LHS" : "RHS"; }
386
str(const Layout & layout)387 inline std::string str(const Layout& layout) {
388 std::string s =
389 formatted("%dx%d, %s", layout.rows(), layout.cols(), str(layout.order()));
390 int inner_size =
391 layout.order() == Order::kRowMajor ? layout.cols() : layout.rows();
392 if (inner_size != layout.stride()) {
393 s += formatted(", stride=%d", layout.stride());
394 } else {
395 s += formatted(", unstrided");
396 }
397 return s;
398 }
399
str(const MatLayout & layout)400 inline std::string str(const MatLayout& layout) {
401 std::string s =
402 formatted("%dx%d, %s", layout.rows, layout.cols, str(layout.order));
403 int inner_size = layout.order == Order::kRowMajor ? layout.cols : layout.rows;
404 if (inner_size != layout.stride) {
405 s += formatted(", stride=%d", layout.stride);
406 } else {
407 s += formatted(", unstrided");
408 }
409 return s;
410 }
411
str(const PMatLayout & layout)412 inline std::string str(const PMatLayout& layout) {
413 std::string s =
414 formatted("%dx%d, %s", layout.rows, layout.cols, str(layout.order));
415 int inner_size = layout.order == Order::kRowMajor ? layout.cols : layout.rows;
416 if (inner_size != layout.stride) {
417 s += formatted(", stride=%d", layout.stride);
418 } else {
419 s += formatted(", unstrided");
420 }
421 s += formatted(", kernel blocks: %dx%d %s", layout.kernel.rows,
422 layout.kernel.cols, str(layout.kernel.order));
423 return s;
424 }
425
426 template <typename T>
str()427 std::string str() {
428 return "<unknown type>";
429 }
430 #define RUY_IMPL_STR_TYPE_STD(T) \
431 template <> \
432 inline std::string str<std::T>() { \
433 return #T; \
434 }
435 #define RUY_IMPL_STR_TYPE(T) \
436 template <> \
437 inline std::string str<T>() { \
438 return #T; \
439 }
440
441 RUY_IMPL_STR_TYPE(float)
RUY_IMPL_STR_TYPE(double)442 RUY_IMPL_STR_TYPE(double)
443 RUY_IMPL_STR_TYPE_STD(int8_t)
444 RUY_IMPL_STR_TYPE_STD(uint8_t)
445 RUY_IMPL_STR_TYPE_STD(int16_t)
446 RUY_IMPL_STR_TYPE_STD(uint16_t)
447 RUY_IMPL_STR_TYPE_STD(int32_t)
448 RUY_IMPL_STR_TYPE_STD(uint32_t)
449 RUY_IMPL_STR_TYPE_STD(int64_t)
450 RUY_IMPL_STR_TYPE_STD(uint64_t)
451
452 template <typename T>
453 std::string str(const Matrix<T>& matrix) {
454 std::string s = formatted("Matrix<%s>, %s", str<T>(), str(matrix.layout()));
455 if (matrix.zero_point()) {
456 s += formatted(", zero_point=%d", static_cast<int>(matrix.zero_point()));
457 }
458 if (matrix.cache_policy() != CachePolicy::kNeverCache) {
459 s +=
460 formatted(", cache_policy=%d", static_cast<int>(matrix.cache_policy()));
461 }
462 return s;
463 }
464
str(const Type & type)465 inline std::string str(const Type& type) {
466 char c;
467 if (type.is_floating_point) {
468 c = 'f';
469 } else if (type.is_signed) {
470 c = 'i';
471 } else {
472 c = 'u';
473 }
474 return formatted("%c%d", c, type.size * 8);
475 }
476
str(const EMat & mat)477 inline std::string str(const EMat& mat) {
478 std::string s =
479 formatted("EMat, data_type=%s, %s", str(mat.data_type), str(mat.layout));
480 if (mat.zero_point) {
481 s += formatted(", zero_point=%d", static_cast<int>(mat.zero_point));
482 }
483 if (mat.cache_policy != CachePolicy::kNeverCache) {
484 s += formatted(", cache_policy=%d", static_cast<int>(mat.cache_policy));
485 }
486 return s;
487 }
488
str(const PEMat & mat)489 inline std::string str(const PEMat& mat) {
490 std::string s =
491 formatted("PEMat, data_type=%s, %s", str(mat.data_type), str(mat.layout));
492 if (mat.zero_point) {
493 s += formatted(", zero_point=%d", static_cast<int>(mat.zero_point));
494 }
495 return s;
496 }
497
str(Path paths)498 inline std::string str(Path paths) {
499 bool first = true;
500 std::string s;
501 for (int bit = 0; bit < 16; bit++) {
502 Path cur_path = static_cast<Path>(1 << bit);
503 if ((paths & cur_path) != Path::kNone) {
504 if (!first) {
505 s += " | ";
506 }
507 first = false;
508 switch (cur_path) {
509 case Path::kNone:
510 continue;
511 #define RUY_HANDLE_PATH(p) \
512 case Path::p: \
513 s += #p; \
514 break;
515 RUY_HANDLE_PATH(kStandardCpp)
516 RUY_HANDLE_PATH(kInternalStandardCppVariant1)
517 RUY_HANDLE_PATH(kInternalStandardCppVariant2)
518 RUY_HANDLE_PATH(kInternalStandardCppVariant3)
519 #if RUY_PLATFORM_ARM
520 RUY_HANDLE_PATH(kNeon)
521 RUY_HANDLE_PATH(kNeonDotprod)
522 #endif // RUY_PLATFORM_ARM
523 #if RUY_PLATFORM_X86
524 RUY_HANDLE_PATH(kAvx)
525 RUY_HANDLE_PATH(kAvx2Fma)
526 RUY_HANDLE_PATH(kAvx512)
527 #endif // RUY_PLATFORM_X86
528 #undef RUY_HANDLE_PATH
529 default:
530 fprintf(stderr, "Unhandled Path value 0x%x\n",
531 static_cast<int>(cur_path));
532 abort();
533 }
534 }
535 }
536 return s;
537 }
538
539 // Implementation of RUY_TRACE_INFO(X) macros.
540
541 #define RUY_TRACE_INFO_MUL \
542 ThreadLocalTrace()->Write("CompiledPaths: %s", str(CompiledPaths)); \
543 ThreadLocalTrace()->Write("LHS: %s", str(lhs)); \
544 ThreadLocalTrace()->Write("RHS: %s", str(rhs)); \
545 ThreadLocalTrace()->Write("Destination: %s", str(*dst));
546
547 #define RUY_TRACE_INFO_CREATE_TRMUL_PARAMS_TRANSPOSING \
548 ThreadLocalTrace()->Write("Canonicalizing to column-major destination:"); \
549 ThreadLocalTrace()->Write( \
550 "Swapping LHS<->RHS and flipping all storage orders.");
551
552 #define RUY_TRACE_INFO_CREATE_TRMUL_PARAMS_ASSUMING_COLMAJOR_DST \
553 ThreadLocalTrace()->Write("Runtime-selected path: %s", str(the_path)); \
554 ThreadLocalTrace()->Write("LHS: %s", str(params->src[Side::kLhs])); \
555 ThreadLocalTrace()->Write("RHS: %s", str(params->src[Side::kRhs])); \
556 ThreadLocalTrace()->Write("Destination: %s", str(params->dst));
557
558 #define RUY_TRACE_INFO_POPULATE_TRMUL_PARAMS \
559 ThreadLocalTrace()->Write( \
560 "Here we have this Path as a template parameter: %s", str(ThePath)); \
561 ThreadLocalTrace()->Write("PackedLhsScalar: %s", str<PackedLhsScalar>()); \
562 ThreadLocalTrace()->Write("PackedRhsScalar: %s", str<PackedRhsScalar>()); \
563 ThreadLocalTrace()->Write("Kernel function pointer: %p", \
564 params->run_kernel); \
565 ThreadLocalTrace()->Write("Kernel LHS block layout: %dx%d %s", \
566 LhsKernelLayout::kRows, LhsKernelLayout::kCols, \
567 str(LhsKernelLayout::kOrder)); \
568 ThreadLocalTrace()->Write("Kernel RHS block layout: %dx%d %s", \
569 RhsKernelLayout::kRows, RhsKernelLayout::kCols, \
570 str(RhsKernelLayout::kOrder)); \
571 ThreadLocalTrace()->Write("Created packed matrices:"); \
572 ThreadLocalTrace()->Write("Packed LHS matrix: %s", \
573 str(params->packed_matrix[Side::kLhs])); \
574 ThreadLocalTrace()->Write("Packed RHS matrix: %s", \
575 str(params->packed_matrix[Side::kRhs])); \
576 ThreadLocalTrace()->Write("LHS packing function pointer: %p", \
577 params->run_pack[Side::kLhs]); \
578 ThreadLocalTrace()->Write("RHS packing function pointer: %p", \
579 params->run_pack[Side::kRhs]);
580
581 #define RUY_TRACE_INFO_GET_RUNTIME_ENABLED_PATHS_USING_SET_VALUE \
582 ThreadLocalTrace()->Write("SetRuntimeEnabledPaths forcing paths: %s", \
583 str(*paths));
584
585 #define RUY_TRACE_INFO_GET_RUNTIME_ENABLED_PATHS_USING_ENV_VAR \
586 ThreadLocalTrace()->Write("Environment variable forcing paths: %s", \
587 str(*paths));
588
589 #define RUY_TRACE_INFO_GET_RUNTIME_ENABLED_PATHS_USING_DETECTION \
590 ThreadLocalTrace()->Write( \
591 "Runtime-detected paths: %s", \
592 str(*paths & ~kNonArchPathsIncludingInternalVariants));
593
594 #define RUY_TRACE_INFO_PREPARE_PACKED_MATRICES_SHOULD_CACHE \
595 ThreadLocalTrace()->Write( \
596 "Caching the packed %s matrix. Already in cache: %s", str(side), \
597 action == PrepackedCache::Action::kInsertedNewEntry ? "no" : "yes");
598
599 #define RUY_TRACE_INFO_PREPARE_PACKED_MATRICES_NO_CACHE \
600 ThreadLocalTrace()->Write("Not caching the packed %s matrix.", str(side));
601
602 #define RUY_TRACE_INFO_GET_TENTATIVE_THREAD_COUNT \
603 ThreadLocalTrace()->Write( \
604 "tentative_thread_count=%d (determined based on shape %dx%dx%d and " \
605 "max_num_threads=%d)", \
606 tentative_thread_count, rows, depth, cols, ctx->max_num_threads());
607
608 #define RUY_TRACE_INFO_GET_USE_SIMPLE_LOOP_RETURNS_TRUE \
609 ThreadLocalTrace()->Write( \
610 "Choosing to use the simple loop code path in TrMul because of the " \
611 "linear traversal and single thread.");
612
613 #define RUY_TRACE_INFO_GET_USE_SIMPLE_LOOP_RETURNS_FALSE \
614 ThreadLocalTrace()->Write( \
615 "Choosing to use the general case code path in TrMul because of: %s", \
616 tentative_thread_count > 1 ? "multi-threading" \
617 : "non-linear traversal order");
618
619 #define RUY_TRACE_INFO_TRMUL_SIMPLE_LOOP \
620 ThreadLocalTrace()->Write("Entering the simple loop code path of TrMul");
621
622 #define RUY_TRACE_INFO_TRMUL_GENERAL_CASE \
623 ThreadLocalTrace()->Write("Entering the general case code path of TrMul");
624
625 #define RUY_TRACE_INFO_MAKE_BLOCK_MAP_START \
626 ThreadLocalTrace()->Write("Kernel block: %dx%d", kernel_rows, kernel_cols); \
627 ThreadLocalTrace()->Write( \
628 "BlockMap shape: %dx%d (destination matrix shape rounded to next " \
629 "kernel blocks)", \
630 rows, cols); \
631 ThreadLocalTrace()->Write( \
632 "Rectangularness log2: %dx%d (powers of two factors bringing the shape " \
633 "closest to square)", \
634 rows_rectangularness_log2, cols_rectangularness_log2); \
635 ThreadLocalTrace()->Write("Accumulation depth: %d", depth); \
636 ThreadLocalTrace()->Write("LHS scalar type size: %d", lhs_scalar_size); \
637 ThreadLocalTrace()->Write("RHS scalar type size: %d", rhs_scalar_size); \
638 ThreadLocalTrace()->Write("Tentative thread count: %d", \
639 tentative_thread_count); \
640 ThreadLocalTrace()->Write( \
641 "CPU cache params: local_cache_size=%d, last_level_cache_size=%d", \
642 cpu_cache_params.local_cache_size, \
643 cpu_cache_params.last_level_cache_size); \
644 ThreadLocalTrace()->Write( \
645 "For the sizes below, when rows!=cols, we always retain the min of the " \
646 "two."); \
647 ThreadLocalTrace()->Write("Kernel block size_log2: %d", kernel_size_log2); \
648 ThreadLocalTrace()->Write( \
649 "BlockMap size_log2: %d (destination matrix shape rounded to next " \
650 "kernel blocks)", \
651 size_log2); \
652 ThreadLocalTrace()->Write( \
653 "Now we will pick the optimal log2 of BlockMap block size");
654
655 #define RUY_TRACE_INFO_MAKE_BLOCK_MAP_EACH_TENTATIVE_BLOCK_SIZE \
656 ThreadLocalTrace()->Write( \
657 "For BlockMap block size_log2 %d, score=%d (" \
658 "multithreading_score=%d + cache_locality_score=%d + " \
659 "kernel_amortization_score=%d)", \
660 block_size_log2, score, multithreading_score, cache_locality_score, \
661 kernel_amortization_score);
662
663 #define RUY_TRACE_INFO_MAKE_BLOCK_MAP_END \
664 ThreadLocalTrace()->Write("Selecting BlockMap block size_log2: %d", \
665 best_score_block_size_log2); \
666 ThreadLocalTrace()->Write( \
667 "BlockMap has %dx%d blocks, each of size between %dx%d and %dx%d.", \
668 1 << num_blocks_of_rows_log2, 1 << num_blocks_of_cols_log2, \
669 block_map->small_block_dims[Side::kLhs], \
670 block_map->small_block_dims[Side::kRhs], \
671 block_map->small_block_dims[Side::kLhs] + \
672 block_map->kernel_dims[Side::kLhs], \
673 block_map->small_block_dims[Side::kRhs] + \
674 block_map->kernel_dims[Side::kRhs]); \
675 ThreadLocalTrace()->Write( \
676 "The first %d rows of blocks have %d rows, the remaining ones have %d " \
677 "rows ", \
678 block_map->large_blocks[Side::kLhs], \
679 block_map->small_block_dims[Side::kLhs] + \
680 block_map->kernel_dims[Side::kLhs], \
681 block_map->small_block_dims[Side::kLhs]); \
682 ThreadLocalTrace()->Write( \
683 "The first %d columns of blocks have %d columns, the remaining ones " \
684 "have %d columns ", \
685 block_map->large_blocks[Side::kRhs], \
686 block_map->small_block_dims[Side::kRhs] + \
687 block_map->kernel_dims[Side::kLhs], \
688 block_map->small_block_dims[Side::kRhs]); \
689 ThreadLocalTrace()->Write( \
690 "Traversal order: %s", \
691 block_map->traversal_order == BlockMapTraversalOrder::kLinear ? "linear" \
692 : block_map->traversal_order == BlockMapTraversalOrder::kFractalZ \
693 ? "fractal Z-curve" \
694 : block_map->traversal_order == BlockMapTraversalOrder::kFractalU \
695 ? "fractal U-curve" \
696 : block_map->traversal_order == BlockMapTraversalOrder::kFractalHilbert \
697 ? "fractal Hilbert curve" \
698 : nullptr); \
699 ThreadLocalTrace()->Write("Finalized thread count: %d", \
700 block_map->thread_count);
701
702 #define RUY_TRACE_SET_THEAD_ID(thread_id) \
703 ThreadLocalTrace()->set_thread_id(thread_id);
704
705 #define RUY_TRACE_INFO_TRMUL_TASK_MAIN_LOOP_GOT_BLOCK_COORDS \
706 ThreadLocalTrace()->Write( \
707 "Block #%d is at position (%d, %d) in the BlockMap.", block_id, \
708 block[Side::kLhs], block[Side::kRhs]); \
709 ThreadLocalTrace()->Write( \
710 "Block #%d has shape %dx%d and starts at position (%d, %d) in the " \
711 "destination matrix.", \
712 block_id, end[Side::kLhs] - start[Side::kLhs], \
713 end[Side::kRhs] - start[Side::kRhs], start[Side::kLhs], \
714 start[Side::kRhs]); \
715 ThreadLocalTrace()->Write( \
716 "Block #%d depends on LHS panel #%d and RHS panel #%d.", block_id, \
717 block[Side::kLhs], block[Side::kRhs]);
718
719 #define RUY_TRACE_INFO_TRYPACK_PACKING \
720 ThreadLocalTrace()->Write( \
721 "%s panel #%d is not already packed. Packing it now.", str(side), \
722 block);
723
724 #define RUY_TRACE_INFO_TRYPACK_ANOTHER_THREAD_PACKING \
725 if (!ThreadLocalTrace()->is_in_run_ahead_packing_loop()) { \
726 ThreadLocalTrace()->Write( \
727 "%s panel #%d is currently being packed by another thread.", \
728 str(side), block); \
729 }
730
731 #define RUY_TRACE_INFO_TRYPACK_PREVIOUSLY_PACKED \
732 if (!ThreadLocalTrace()->is_in_run_ahead_packing_loop()) { \
733 ThreadLocalTrace()->Write("%s panel #%d had previously been packed.", \
734 str(side), block); \
735 }
736
737 #define RUY_TRACE_INFO_TRYPACK_PACKED_BY_ANOTHER_THREAD \
738 ThreadLocalTrace()->Write( \
739 "%s panel #%d has just been packed by another thread.", str(side), \
740 block);
741
742 #define RUY_TRACE_INFO_ENSURE_PACKED_ENTER_RUN_AHEAD \
743 if (!ThreadLocalTrace()->is_in_run_ahead_packing_loop()) { \
744 ThreadLocalTrace()->set_is_in_run_ahead_packing_loop(true); \
745 ThreadLocalTrace()->Write( \
746 "We're blocked on other threads packing the panels that we need. " \
747 "Packing some other panels while we wait..."); \
748 }
749
750 #define RUY_TRACE_INFO_ENSURE_PACKED_END \
751 if (ThreadLocalTrace()->is_in_run_ahead_packing_loop()) { \
752 ThreadLocalTrace()->set_is_in_run_ahead_packing_loop(false); \
753 ThreadLocalTrace()->Write( \
754 "Other threads have finished packing what we were waiting for."); \
755 }
756
757 #define RUY_TRACE_INFO_RUN_PACK \
758 ThreadLocalTrace()->Write("Path: %s", str(ThePath)); \
759 ThreadLocalTrace()->Write("Packing panel consisting of columns [%d, %d)", \
760 start_col, end_col); \
761 ThreadLocalTrace()->Write("Source: columns [%d, %d) of %s", start_col, \
762 end_col, str(src_matrix)); \
763 ThreadLocalTrace()->Write("Destination: columns [%d, %d) of %s", start_col, \
764 end_col, str(*packed_matrix)); \
765 if (end_col > src_matrix.layout.cols) { \
766 ThreadLocalTrace()->Write( \
767 "This runs past the last column of the source matrix. Padding as " \
768 "needed."); \
769 } \
770 if (packed_matrix->layout.rows > src_matrix.layout.rows) { \
771 ThreadLocalTrace()->Write( \
772 "The packed matrix has more rows than the source matrix due to " \
773 "rounding up to the kernel block size. Padding as needed."); \
774 }
775
776 #define RUY_TRACE_INFO_RUN_KERNEL \
777 { \
778 ThreadLocalTrace()->Write("Path: %s", str(KernelArgs<KernelType>::kPath)); \
779 int lhs_cols = end[Side::kLhs] - start[Side::kLhs]; \
780 int rhs_cols = end[Side::kRhs] - start[Side::kRhs]; \
781 int kernel_lhs_cols = src[Side::kLhs].layout.kernel.cols; \
782 int kernel_rhs_cols = src[Side::kRhs].layout.kernel.cols; \
783 ThreadLocalTrace()->Write("LHS: columns [%d, %d) of %s", \
784 start[Side::kLhs], end[Side::kLhs], \
785 str(src[Side::kLhs])); \
786 ThreadLocalTrace()->Write("RHS: columns [%d, %d) of %s", \
787 start[Side::kRhs], end[Side::kRhs], \
788 str(src[Side::kRhs])); \
789 ThreadLocalTrace()->Write("Destination: block [%d, %d)x[%d, %d) of %s", \
790 start[Side::kLhs], end[Side::kLhs], \
791 start[Side::kRhs], end[Side::kRhs], str(*dst)); \
792 if (end[Side::kLhs] > dst->layout.rows || \
793 end[Side::kRhs] > dst->layout.cols) { \
794 ThreadLocalTrace()->Write( \
795 "This runs over the destination matrix boundaries. The kernel will " \
796 "internally clamp stores to avoid overruns."); \
797 } \
798 ThreadLocalTrace()->Write( \
799 "The kernel's inner loop only produces a %dx%d block, so the " \
800 "kernel's outer loops will run %dx%d times.", \
801 kernel_lhs_cols, kernel_rhs_cols, lhs_cols / kernel_lhs_cols, \
802 rhs_cols / kernel_rhs_cols); \
803 }
804
805 #define RUY_TRACE_INFO_THREAD_FUNC_IMPL_WAITING \
806 ThreadLocalTrace()->Write("Waiting for a task...");
807
808 #define RUY_TRACE_INFO_THREADPOOL_EXECUTE_STARTING_TASK \
809 ThreadLocalTrace()->Write("Sending task #%d to a worker thread...", i);
810
811 #define RUY_TRACE_INFO_THREADPOOL_EXECUTE_STARTING_TASK_ZERO_ON_CUR_THREAD \
812 ThreadLocalTrace()->Write("Running task #0 on the current thread...");
813
814 #define RUY_TRACE_INFO_THREADPOOL_EXECUTE_WAITING_FOR_THREADS \
815 ThreadLocalTrace()->Write("Waiting for worker threads to finish..");
816
817 #define RUY_TRACE_INFO(id) \
818 [=]() { \
819 ThreadLocalTrace()->set_current_source_file(__FILE__); \
820 ThreadLocalTrace()->set_current_source_line(__LINE__); \
821 RUY_TRACE_INFO_##id \
822 }()
823
824 } // namespace ruy
825
826 #else
827
828 // Vacuous implementation when RUY_TRACE is not defined.
829 #define RUY_TRACE_SCOPE_NAME(name)
830 #define RUY_TRACE_SCOPE
831 #define RUY_TRACE_SET_THEAD_ID(thread_id)
832 #define RUY_TRACE_INFO(id)
833
834 #endif
835
836 #endif // RUY_RUY_TRACE_H_
837