• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 // This class is designed to get accurate profile for programs.
16 
17 #ifndef TENSORFLOW_CORE_PLATFORM_PROFILE_UTILS_CPU_UTILS_H_
18 #define TENSORFLOW_CORE_PLATFORM_PROFILE_UTILS_CPU_UTILS_H_
19 
20 #include <chrono>
21 #include <memory>
22 
23 #include "tensorflow/core/platform/macros.h"
24 #include "tensorflow/core/platform/profile_utils/i_cpu_utils_helper.h"
25 #include "tensorflow/core/platform/types.h"
26 
27 #if defined(ARMV6) || defined(__ARM_ARCH_7A__)
28 #include <sys/time.h>
29 #endif
30 
31 #if defined(_WIN32)
32 #include <intrin.h>
33 #endif
34 
35 namespace tensorflow {
36 
37 namespace profile_utils {
38 
39 // CpuUtils is a profiling tool with static functions
40 // designed to be called from multiple classes.
41 // A dedicated class which inherits ICpuUtilsHelper is
42 // stored as a function-local static variable inside
43 // GetCpuUtilsHelperSingletonInstance; it caches CPU information,
44 // because loading CPU information may take a long time.
45 // Users must call EnableClockCycleProfiling before using CpuUtils.
46 class CpuUtils {
47  public:
48   // Constant for invalid frequency.
49   // This value is returned when the frequency could not be obtained.
50   static constexpr int64 INVALID_FREQUENCY = -1;
51   static constexpr uint64 DUMMY_CYCLE_CLOCK = 1;
52 
53   // Return the current clock cycle count. This function is designed to
54   // minimize the overhead of reading the clock and maximize the accuracy
55   // of time for profiling.
56   // This returns an unsigned value because there is no guarantee that rdtsc
57   // is less than 2 ^ 61.
GetCurrentClockCycle()58   static inline uint64 GetCurrentClockCycle() {
59 #if defined(__ANDROID__)
60     return GetCpuUtilsHelperSingletonInstance().GetCurrentClockCycle();
61 // ----------------------------------------------------------------
62 #elif defined(_WIN32)
63     return __rdtsc();
64 // ----------------------------------------------------------------
65 #elif defined(__x86_64__) || defined(__amd64__)
66     uint64_t high, low;
67     __asm__ volatile("rdtsc" : "=a"(low), "=d"(high));
68     return (high << 32) | low;
69 // ----------------------------------------------------------------
70 #elif defined(__aarch64__)
71     // System timer of ARMv8 runs at a different frequency than the CPU's.
72     // The frequency is fixed, typically in the range 1-50MHz.  It can be
73     // read at CNTFRQ special register.  We assume the OS has set up
74     // the virtual timer properly.
75     uint64_t virtual_timer_value;
76     asm volatile("mrs %0, cntvct_el0" : "=r"(virtual_timer_value));
77     return virtual_timer_value;
78 // ----------------------------------------------------------------
79 // V6 is the earliest arm that has a standard cyclecount
80 #elif defined(ARMV6) || defined(__ARM_ARCH_7A__)
81     uint32_t pmccntr;
82     uint32_t pmuseren;
83     uint32_t pmcntenset;
84     // Read the user mode perf monitor counter access permissions.
85     asm volatile("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
86     if (pmuseren & 1) {  // Allows reading perfmon counters for user mode code.
87       asm volatile("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
88       if (pmcntenset & 0x80000000ul) {  // Is it counting?
89         asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
90         // The counter is set up to count every 64th cycle
91         return static_cast<uint64>(pmccntr) * 64;  // Should optimize to << 6
92       }
93     }
94     // Return a dummy clock value when the counter cannot be accessed
95     return DUMMY_CYCLE_CLOCK;
96 #elif defined(__powerpc64__) || defined(__ppc64__)
97     uint64 __t;
98     __asm__ __volatile__("mfspr %0,268" : "=r"(__t));
99     return __t;
100 
101 #elif defined(__powerpc__) || defined(__ppc__)
102     uint64 upper, lower, tmp;
103     __asm__ volatile(
104         "0:                     \n"
105         "\tmftbu   %0           \n"
106         "\tmftb    %1           \n"
107         "\tmftbu   %2           \n"
108         "\tcmpw    %2,%0        \n"
109         "\tbne     0b           \n"
110         : "=r"(upper), "=r"(lower), "=r"(tmp));
111     return ((static_cast<uint64>(upper) << 32) | lower);
112 #elif defined(__s390x__)
113     // TOD Clock of s390x runs at a different frequency than the CPU's.
114     // The stepping is 244 picoseconds (~4Ghz).
115     uint64 t;
116     __asm__ __volatile__("stckf %0" : "=Q"(t));
117     return t;
118 #else
119     // TODO(satok): Support generic way to emulate clock count.
120     // TODO(satok): Support other architectures if wanted.
121     // Return a dummy clock value when the counter cannot be accessed
122     return DUMMY_CYCLE_CLOCK;
123 #endif
124   }
125 
126 // Return cycle counter frequency.
127 // As this method caches the cpu frequency internally,
128 // the first call will incur overhead, but not subsequent calls.
129 #if (defined(__powerpc__) ||                                             \
130      defined(__ppc__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) || \
131     (defined(__s390x__))
132   static uint64 GetCycleCounterFrequency();
133 #else
134   static int64 GetCycleCounterFrequency();
135 #endif
136 
137   // Return microseconds per clock cycle.
138   // As this method caches the cpu frequency internally,
139   // the first call will incur overhead, but not subsequent calls.
140   static double GetMicroSecPerClock();
141 
142   // Reset clock cycle
143   // Resetting clock cycle is recommended to prevent
144   // clock cycle counters from overflowing on some platforms.
145   static void ResetClockCycle();
146 
147   // Enable/Disable clock cycle profiling.
148   // Profiling can be enabled / disabled only if the platform supports it.
149   static void EnableClockCycleProfiling();
150   static void DisableClockCycleProfiling();
151 
152   // Return chrono::duration per each clock
153   static std::chrono::duration<double> ConvertClockCycleToTime(
154       const int64 clock_cycle);
155 
156  private:
157   class DefaultCpuUtilsHelper : public ICpuUtilsHelper {
158    public:
159     DefaultCpuUtilsHelper() = default;
ResetClockCycle()160     void ResetClockCycle() final {}
GetCurrentClockCycle()161     uint64 GetCurrentClockCycle() final { return DUMMY_CYCLE_CLOCK; }
EnableClockCycleProfiling()162     void EnableClockCycleProfiling() final {}
DisableClockCycleProfiling()163     void DisableClockCycleProfiling() final {}
CalculateCpuFrequency()164     int64 CalculateCpuFrequency() final { return INVALID_FREQUENCY; }
165 
166    private:
167     TF_DISALLOW_COPY_AND_ASSIGN(DefaultCpuUtilsHelper);
168   };
169 
170   // Return cpu frequency.
171   // CAVEAT: as this method makes a system call and parses the message,
172   // this call may be slow. This is why this class caches the value by
173   // StaticVariableInitializer.
174   static int64 GetCycleCounterFrequencyImpl();
175 
176   // Return a singleton of ICpuUtilsHelper
177   // ICpuUtilsHelper is declared as a function-local static variable
178   // for the following two reasons:
179   // 1. Avoid passing instances to all classes which want
180   // to use profiling tools in CpuUtils
181   // 2. Minimize the overhead of acquiring ICpuUtilsHelper
182   static ICpuUtilsHelper& GetCpuUtilsHelperSingletonInstance();
183 
184   TF_DISALLOW_COPY_AND_ASSIGN(CpuUtils);
185 };
186 
187 }  // namespace profile_utils
188 
189 }  // namespace tensorflow
190 
191 #endif  // TENSORFLOW_CORE_PLATFORM_PROFILE_UTILS_CPU_UTILS_H_
192