1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 // This class is designed to get accurate profile for programs. 16 17 #ifndef TENSORFLOW_CORE_PLATFORM_PROFILE_UTILS_CPU_UTILS_H_ 18 #define TENSORFLOW_CORE_PLATFORM_PROFILE_UTILS_CPU_UTILS_H_ 19 20 #include <chrono> 21 #include <memory> 22 23 #include "tensorflow/core/platform/macros.h" 24 #include "tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h" 25 #include "tensorflow/core/platform/types.h" 26 27 #if defined(ARMV6) || defined(__ARM_ARCH_7A__) 28 #include <sys/time.h> 29 #endif 30 31 #if defined(_WIN32) 32 #include <intrin.h> 33 #endif 34 35 namespace tensorflow { 36 37 namespace profile_utils { 38 39 // CpuUtils is a profiling tool with static functions 40 // designed to be called from multiple classes. 41 // A dedicated class which inherits ICpuUtilsHelper is 42 // stored as a function-local static variable which inherits 43 // GetCpuUtilsHelperSingletonInstance that caches CPU information, 44 // because loading CPU information may take a long time. 45 // Users must call EnableClockCycleProfiling before using CpuUtils. 46 class CpuUtils { 47 public: 48 // Constant for invalid frequency. 49 // This value is returned when the frequency is not obtained somehow. 50 static constexpr int64 INVALID_FREQUENCY = -1; 51 static constexpr uint64 DUMMY_CYCLE_CLOCK = 1; 52 53 // Return current clock cycle. This function is designed to 54 // minimize the overhead to get clock and maximize the accuracy of 55 // time for profile. 56 // This returns unsigned int because there is no guarantee that rdtsc 57 // is less than 2 ^ 61. GetCurrentClockCycle()58 static inline uint64 GetCurrentClockCycle() { 59 #if defined(__ANDROID__) 60 return GetCpuUtilsHelperSingletonInstance().GetCurrentClockCycle(); 61 // ---------------------------------------------------------------- 62 #elif defined(_WIN32) 63 return __rdtsc(); 64 // ---------------------------------------------------------------- 65 #elif defined(__x86_64__) || defined(__amd64__) 66 uint64_t high, low; 67 __asm__ volatile("rdtsc" : "=a"(low), "=d"(high)); 68 return (high << 32) | low; 69 // ---------------------------------------------------------------- 70 #elif defined(__aarch64__) 71 // System timer of ARMv8 runs at a different frequency than the CPU's. 72 // The frequency is fixed, typically in the range 1-50MHz. It can because 73 // read at CNTFRQ special register. We assume the OS has set up 74 // the virtual timer properly. 75 uint64_t virtual_timer_value; 76 asm volatile("mrs %0, cntvct_el0" : "=r"(virtual_timer_value)); 77 return virtual_timer_value; 78 // ---------------------------------------------------------------- 79 // V6 is the earliest arm that has a standard cyclecount 80 #elif defined(ARMV6) || defined(__ARM_ARCH_7A__) 81 uint32_t pmccntr; 82 uint32_t pmuseren; 83 uint32_t pmcntenset; 84 // Read the user mode perf monitor counter access permissions. 85 asm volatile("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren)); 86 if (pmuseren & 1) { // Allows reading perfmon counters for user mode code. 87 asm volatile("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset)); 88 if (pmcntenset & 0x80000000ul) { // Is it counting? 89 asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr)); 90 // The counter is set up to count every 64th cyclecount 91 return static_cast<uint64>(pmccntr) * 64; // Should optimize to << 64 92 } 93 } 94 // Returning dummy clock when can't access to the counter 95 return DUMMY_CYCLE_CLOCK; 96 #else 97 // TODO(satok): Support generic way to emulate clock count. 98 // TODO(satok): Support other architectures if wanted. 99 // Returning dummy clock when can't access to the counter 100 return DUMMY_CYCLE_CLOCK; 101 #endif 102 } 103 104 // Return cycle counter frequency. 105 // As this method caches the cpu frequency internally, 106 // the first call will incur overhead, but not subsequent calls. 107 #if (defined(__powerpc__) || \ 108 defined(__ppc__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \ 109 (defined(__s390x__)) 110 static uint64 GetCycleCounterFrequency(); 111 #else 112 static int64 GetCycleCounterFrequency(); 113 #endif 114 115 // Return micro second per each clock 116 // As this method caches the cpu frequency internally, 117 // the first call will incur overhead, but not subsequent calls. 118 static double GetMicroSecPerClock(); 119 120 // Reset clock cycle 121 // Resetting clock cycle is recommended to prevent 122 // clock cycle counters from overflowing on some platforms. 123 static void ResetClockCycle(); 124 125 // Enable clock cycle profile 126 // You can enable / disable profile if it's supported by the platform 127 static void EnableClockCycleProfiling(bool enable); 128 129 // Return chrono::duration per each clock 130 static std::chrono::duration<double> ConvertClockCycleToTime( 131 const int64 clock_cycle); 132 133 private: 134 class DefaultCpuUtilsHelper : public ICpuUtilsHelper { 135 public: 136 DefaultCpuUtilsHelper() = default; ResetClockCycle()137 void ResetClockCycle() final {} GetCurrentClockCycle()138 uint64 GetCurrentClockCycle() final { return DUMMY_CYCLE_CLOCK; } EnableClockCycleProfiling(bool)139 void EnableClockCycleProfiling(bool /* enable */) final {} CalculateCpuFrequency()140 int64 CalculateCpuFrequency() final { return INVALID_FREQUENCY; } 141 142 private: 143 TF_DISALLOW_COPY_AND_ASSIGN(DefaultCpuUtilsHelper); 144 }; 145 146 // Return cpu frequency. 147 // CAVEAT: as this method calls system call and parse the mssage, 148 // this call may be slow. This is why this class caches the value by 149 // StaticVariableInitializer. 150 static int64 GetCycleCounterFrequencyImpl(); 151 152 // Return a singleton of ICpuUtilsHelper 153 // ICpuUtilsHelper is declared as a function-local static variable 154 // for the following two reasons: 155 // 1. Avoid passing instances to all classes which want 156 // to use profiling tools in CpuUtils 157 // 2. Minimize the overhead of acquiring ICpuUtilsHelper 158 static ICpuUtilsHelper& GetCpuUtilsHelperSingletonInstance(); 159 160 TF_DISALLOW_COPY_AND_ASSIGN(CpuUtils); 161 }; 162 163 } // namespace profile_utils 164 165 } // namespace tensorflow 166 167 #endif // TENSORFLOW_CORE_PLATFORM_PROFILE_UTILS_CPU_UTILS_H_ 168