/* Copyright 2020 Google LLC. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef RUY_RUY_PROFILER_INSTRUMENTATION_H_
#define RUY_RUY_PROFILER_INSTRUMENTATION_H_

#ifdef RUY_PROFILER
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <mutex>
#include <string>
#include <vector>
#endif

namespace ruy {
namespace profiler {

#ifdef RUY_PROFILER

// A label is how a code scope is annotated to appear in profiles.
// The stacks that are sampled by the profiler are stacks of such labels.
// A label consists of a literal string, plus optional integer arguments
// (at most kMaxArgs of them).
class Label {
 public:
  Label() {}

  // Constructs a label by forwarding all arguments to Set().
  template <typename... Args>
  explicit Label(Args... args) {
    Set(args...);
  }

  // Sets a label that has a format string and no integer arguments.
  // Only the pointer is stored, not a copy of the string.
  void Set(const char* format) {
    format_ = format;
    args_count_ = 0;
  }

  // Sets a label that has a format string plus integer arguments. Each
  // argument is converted to int and stored in order. Only the `format`
  // pointer is stored, not a copy of the string.
  template <typename... Args>
  void Set(const char* format, Args... args) {
    // args_ only has room for kMaxArgs values; reject longer argument lists at
    // compile time rather than writing past the end of the array.
    static_assert(sizeof...(Args) <= kMaxArgs,
                  "Label supports at most kMaxArgs integer arguments");
    format_ = format;
    args_count_ = sizeof...(args);
    SetArgs(0, args...);
  }

  void operator=(const Label& other);

  bool operator==(const Label& other) const;

  std::string Formatted() const;

  // Returns the format string pointer that was last passed to Set(), or
  // nullptr if this label was never set.
  const char* format() const { return format_; }

 private:
  // Stores a single argument at the given position.
  void SetArgs(int position, int arg0) { args_[position] = arg0; }

  // Stores arg0 at `position`, then recurses on the remaining arguments.
  template <typename... Args>
  void SetArgs(int position, int arg0, Args... args) {
    SetArgs(position, arg0);
    SetArgs(position + 1, args...);
  }

  static constexpr int kMaxArgs = 4;
  const char* format_ = nullptr;
  int args_count_ = 0;
  int args_[kMaxArgs];
};

namespace detail {

// Forward-declaration, see class ThreadStack below.
class ThreadStack;

bool& GlobalIsProfilerRunning();

// Returns the global vector of pointers to all stacks, there being one stack
// per thread executing instrumented code.
std::vector<ThreadStack*>* GlobalAllThreadStacks();

// Returns the mutex to be locked around any access to GlobalAllThreadStacks().
std::mutex* GlobalsMutex();

// Returns the thread-local stack, specific to the current thread.
ThreadStack* ThreadLocalThreadStack();

// This 'stack' is what may be more appropriately called a 'pseudostack':
// It contains Label entries that are 'manually' entered by instrumentation
// code. It's unrelated to real call stacks.
struct Stack {
  std::uint32_t id = 0;
  static constexpr int kMaxSize = 64;
  int size = 0;
  Label labels[kMaxSize];
};

// Returns the buffer byte size required by CopyToBuffer.
int GetBufferSize(const Stack& stack);

// Copies this Stack into a byte buffer, called a 'sample'.
void CopyToBuffer(const Stack& stack, char* dst);

// Populates this Stack from an existing sample buffer, typically
// produced by CopyToBuffer.
void ReadFromBuffer(const char* src, Stack* stack);

// ThreadStack is meant to be used as a thread-local singleton, assigning to
// each thread a Stack object holding its pseudo-stack of profile labels,
// plus a mutex allowing to synchronize accesses to this pseudo-stack between
// this thread and a possible profiler thread sampling it.
class ThreadStack {
 public:
  ThreadStack();
  ~ThreadStack();

  const Stack& stack() const { return stack_; }

  // Returns the mutex to lock around any access to this stack. Each stack is
  // accessed by potentially two threads: the thread that it belongs to
  // (which calls Push and Pop) and the profiler thread during profiling
  // (which calls CopyToBuffer).
  std::mutex& Mutex() const { return mutex_; }

  // Pushes a new label on the top of this Stack. Aborts the process if the
  // stack is already full.
  template <typename... Args>
  void Push(Args... args) {
    // This mutex locking is needed to guard against race conditions as both
    // the current thread and the profiler thread may be concurrently accessing
    // this stack. In addition to that, this mutex locking also serves the other
    // purpose of acting as a barrier (of compiler code reordering, of runtime
    // CPU instruction reordering, and of memory access reordering), which
    // gives a measure of correctness to this profiler. The downside is some
    // latency. As this lock will be uncontended most of the times, the cost
    // should be roughly that of a sequentially-consistent atomic access,
    // comparable to an access to the level of CPU data cache that is shared
    // among all cores, typically 60 cycles on current ARM CPUs, plus side
    // effects from barrier instructions.
    std::lock_guard<std::mutex> lock(mutex_);
    // Avoid overrunning the stack, even in 'release' builds. This profiling
    // instrumentation code should not ship in release builds anyway, the
    // overhead of this check is negligible, and overrunning a stack array would
    // be bad.
    if (stack_.size >= Stack::kMaxSize) {
      std::abort();
    }
    stack_.labels[stack_.size++].Set(args...);
  }

  // Pops the top-most label from this Stack. Aborts the process if the stack
  // is already empty.
  void Pop() {
    // See the comment in Push about this lock. While it would be tempting to
    // try to remove this lock and just atomically decrement stack_.size with a
    // store-release, that would not necessarily be a substitute for all of the
    // purposes that this lock serves, or if it was done carefully to serve all
    // of the same purposes, then that wouldn't be faster than this (mostly
    // uncontended) lock.
    std::lock_guard<std::mutex> lock(mutex_);
    // Mirror the overflow check in Push: an unbalanced Pop would make the size
    // negative, and the next Push would then write before the start of the
    // labels array.
    if (stack_.size <= 0) {
      std::abort();
    }
    stack_.size--;
  }

 private:
  mutable std::mutex mutex_;
  Stack stack_;
};

}  // namespace detail

// RAII user-facing way to construct Labels associated with their life scope
// and get them pushed to / popped from the current thread stack.
class ScopeLabel {
 public:
  // Pushes a label built from `args` onto the current thread's stack.
  // Declared explicit so that this class accepts exactly the same
  // initialization forms as the no-RUY_PROFILER ScopeLabel below.
  template <typename... Args>
  explicit ScopeLabel(Args... args)
      : thread_stack_(detail::ThreadLocalThreadStack()) {
    thread_stack_->Push(args...);
  }

  // Pops the label that the constructor pushed.
  ~ScopeLabel() { thread_stack_->Pop(); }

 private:
  detail::ThreadStack* thread_stack_;
};

#else  // no RUY_PROFILER

// No-op stand-in used when profiling is compiled out: accepts and ignores any
// constructor arguments.
class ScopeLabel {
 public:
  template <typename... Args>
  explicit ScopeLabel(Args...) {}

  // This destructor is needed to consistently silence clang's -Wunused-variable
  // which seems to trigger semi-randomly.
  ~ScopeLabel() {}
};

#endif

}  // namespace profiler
}  // namespace ruy

#endif  // RUY_RUY_PROFILER_INSTRUMENTATION_H_