• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_CUPTI_COLLECTOR_H_
17 #define TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_CUPTI_COLLECTOR_H_
18 
#include <cstddef>
#include <limits>
#include <memory>
#include <string>

#include "absl/container/fixed_array.h"
#include "absl/container/flat_hash_map.h"
#include "absl/container/node_hash_set.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
#include "absl/synchronization/mutex.h"
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/status.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/protobuf/xplane.pb.h"
30 
31 namespace tensorflow {
32 namespace profiler {
33 
34 struct MemcpyDetails {
35   // The amount of data copied for memcpy events.
36   size_t num_bytes;
37   // The destination device for peer-2-peer communication (memcpy). The source
38   // device is implicit: it's the current device.
39   uint32 destination;
40   // Whether or not the memcpy is asynchronous.
41   bool async;
42   // This contains CUpti_ActivityMemcpyKind for activity event (on device).
43   // For events from other CuptiTracerEventSource, it is always 0.
44   int8 kind;
45   // CUpti_ActivityMemoryKind of source.
46   int8 src_mem_kind;
47   // CUpti_ActivityMemoryKind of destination.
48   int8 dst_mem_kind;
49 };
50 
51 struct MemAllocDetails {
52   // Size of memory to be written over in bytes.
53   size_t num_bytes;
54   // The CUpti_ActivityMemoryKind value for this activity event.
55   int8 kind;
56   // The virtual address of allocation. 0 if it is a free operation.
57   uint64 address;
58 };
59 
60 using MemFreeDetails = MemAllocDetails;
61 
62 // Memory residency contains details read from CUpti_ActivityMemory type. This
63 // is populated in the CUPTI tracer encounters a CUPTI_ACTIVITY_KIND_MEMORY
64 // event. The start of this even corresponse to a cudaMalloc, and the end
65 // corresponds to a cudaFree.
66 using MemoryResidencyDetails = MemAllocDetails;
67 
68 struct MemsetDetails {
69   // Size of memory to be written over in bytes.
70   size_t num_bytes;
71   // The CUpti_ActivityMemoryKind value for this activity event.
72   int8 kind;
73   // Whether or not the memset is asynchronous.
74   bool async;
75 };
76 
77 struct KernelDetails {
78   // The number of registers used in this kernel.
79   uint32 registers_per_thread;
80   // The amount of shared memory space used by a thread block.
81   uint32 static_shared_memory_usage;
82   // The amount of dynamic memory space used by a thread block.
83   uint32 dynamic_shared_memory_usage;
84   // X-dimension of a thread block.
85   uint32 block_x;
86   // Y-dimension of a thread block.
87   uint32 block_y;
88   // Z-dimension of a thread block.
89   uint32 block_z;
90   // X-dimension of a grid.
91   uint32 grid_x;
92   // Y-dimension of a grid.
93   uint32 grid_y;
94   // Z-dimension of a grid.
95   uint32 grid_z;
96 };
97 
ToXStat(const KernelDetails & kernel_info,double occupancy_pct)98 inline std::string ToXStat(const KernelDetails& kernel_info,
99                            double occupancy_pct) {
100   return absl::StrCat(
101       "regs:", kernel_info.registers_per_thread,
102       " static_shared:", kernel_info.static_shared_memory_usage,
103       " dynamic_shared:", kernel_info.dynamic_shared_memory_usage,
104       " grid:", kernel_info.grid_x, ",", kernel_info.grid_y, ",",
105       kernel_info.grid_z, " block:", kernel_info.block_x, ",",
106       kernel_info.block_y, ",", kernel_info.block_z,
107       " occ_pct:", occupancy_pct);
108 }
109 
110 // Gets the name of the CUpti_ActivityMemoryKind value.
111 absl::string_view GetMemoryKindName(int8 kind);
112 
// Kind of event recorded by the CUPTI tracer. In CuptiTracerEvent the type
// selects which member of the event's details union is meaningful.
//
// The numeric values are assigned explicitly.
// NOTE(review): presumably other code relies on these values; confirm before
// renumbering or reusing one.
enum class CuptiTracerEventType {
  Unsupported = 0,
  Kernel = 1,
  MemcpyH2D = 2,
  MemcpyD2H = 3,
  MemcpyD2D = 4,
  MemcpyP2P = 5,
  MemcpyOther = 6,
  MemoryAlloc = 7,
  Overhead = 8,
  UnifiedMemory = 9,
  MemoryFree = 10,
  Memset = 11,
  MemoryResidency = 12,
  Generic = 100,
};
129 
130 const char* GetTraceEventTypeName(const CuptiTracerEventType& type);
131 
// Which CUPTI mechanism produced an event.
enum class CuptiTracerEventSource {
  Invalid = 0,
  DriverCallback = 1,
  Activity = 2,
  // Maybe consider adding runtime callback and metric api in the future.
};
138 
139 struct CuptiTracerEvent {
140   static constexpr uint32 kInvalidThreadId =
141       std::numeric_limits<uint32_t>::max();
142   static constexpr uint32 kInvalidCorrelationId =
143       std::numeric_limits<uint32_t>::max();
144   static constexpr uint64 kInvalidContextId =
145       std::numeric_limits<uint64_t>::max();
146   static constexpr uint64 kInvalidStreamId =
147       std::numeric_limits<uint64_t>::max();
148   CuptiTracerEventType type = CuptiTracerEventType::Unsupported;
149   CuptiTracerEventSource source = CuptiTracerEventSource::Invalid;
150   // Although CUpti_CallbackData::functionName is persistent, however
151   // CUpti_ActivityKernel4::name is not persistent, therefore we need a copy of
152   // it.
153   std::string name;
154   // This points to strings in AnnotationMap, which should outlive the point
155   // where serialization happens.
156   absl::string_view annotation;
157   absl::string_view nvtx_range;
158   uint64 start_time_ns = 0;
159   uint64 end_time_ns = 0;
160   uint32 device_id = 0;
161   uint32 correlation_id = kInvalidCorrelationId;
162   uint32 thread_id = kInvalidThreadId;
163   int64 context_id = kInvalidContextId;
164   int64 stream_id = kInvalidStreamId;
165   union {
166     // For Memcpy API and activities. `type` must be Memcpy*.
167     MemcpyDetails memcpy_info;
168     // Used for MemAlloc API. `type` must be MemoryAlloc.
169     MemAllocDetails memalloc_info;
170     // Used for kernel activities. `type` must be Kernel.
171     KernelDetails kernel_info;
172     // Used for MemFree activities. `type` must be MemoryFree.
173     MemFreeDetails memfree_info;
174     // Used for Memset API and activities. `type` must be Memset.
175     MemsetDetails memset_info;
176     // Used for Memory residency activities. `type` must be MemoryResidency.
177     MemoryResidencyDetails memory_residency_info;
178   };
179 };
180 
181 struct CuptiTracerCollectorOptions {
182   // Maximum number of events to collect from callback API; if -1, no limit.
183   // if 0, the callback API is enabled to build a correlation map, but no
184   // events are collected.
185   uint64 max_callback_api_events = 2 * 1024 * 1024;
186   // Maximum number of events to collect from activity API; if -1, no limit.
187   uint64 max_activity_api_events = 2 * 1024 * 1024;
188   // Maximum number of annotation strings that we can accommodate.
189   uint64 max_annotation_strings = 1024 * 1024;
190   // Number of GPUs involved.
191   uint32 num_gpus;
192 };
193 
194 class AnnotationMap {
195  public:
196   struct AnnotationInfo {
197     absl::string_view annotation;
198     absl::string_view nvtx_range;
199   };
200 
AnnotationMap(uint64 max_size,uint32 num_gpus)201   explicit AnnotationMap(uint64 max_size, uint32 num_gpus)
202       : max_size_(max_size), per_device_map_(num_gpus) {}
203   void Add(uint32 device_id, uint32 correlation_id,
204            const absl::string_view annotation,
205            const absl::string_view nvtx_range);
206   AnnotationInfo LookUp(uint32 device_id, uint32 correlation_id);
207 
208  private:
209   struct PerDeviceAnnotationMap {
210     // The population/consumption of annotations might happen from multiple
211     // callback/activity api related threads.
212     absl::Mutex mutex;
213     // Annotation tends to be repetitive, use a hash_set to store the strings,
214     // an use the reference to the string in the map.
215     absl::node_hash_set<std::string> annotations;
216     absl::node_hash_set<std::string> nvtx_ranges;
217     absl::flat_hash_map<uint32, AnnotationInfo> correlation_map;
218   };
219   const uint64 max_size_;
220   absl::FixedArray<PerDeviceAnnotationMap> per_device_map_;
221 
222   TF_DISALLOW_COPY_AND_ASSIGN(AnnotationMap);
223 };
224 
225 class CuptiTraceCollector {
226  public:
CuptiTraceCollector(const CuptiTracerCollectorOptions & options)227   explicit CuptiTraceCollector(const CuptiTracerCollectorOptions& options)
228       : options_(options),
229         annotation_map_(options.max_annotation_strings, options.num_gpus) {}
~CuptiTraceCollector()230   virtual ~CuptiTraceCollector() {}
231 
232   // Producer side functions (i.e. called by CuptiTracer).
233   virtual void AddEvent(CuptiTracerEvent&& event) = 0;
234   virtual void OnEventsDropped(const std::string& reason,
235                                uint32 num_events) = 0;
236   virtual void Flush() = 0;
237 
238   // Consumer side functions (i.e. called by GPU tracer);
Export(StepStats * step_stats)239   virtual void Export(StepStats* step_stats) {}
Export(XSpace * space,uint64 end_gpu_ns)240   virtual bool Export(XSpace* space, uint64 end_gpu_ns) { return true; }
ReportNumEventsIfDropped()241   virtual std::string ReportNumEventsIfDropped() { return ""; }
242 
annotation_map()243   AnnotationMap* annotation_map() { return &annotation_map_; }
244 
245  protected:
246   CuptiTracerCollectorOptions options_;
247 
248  private:
249   AnnotationMap annotation_map_;
250 
251   TF_DISALLOW_COPY_AND_ASSIGN(CuptiTraceCollector);
252 };
253 
254 std::unique_ptr<CuptiTraceCollector> CreateCuptiCollector(
255     const CuptiTracerCollectorOptions& options, const uint64 start_walltime_ns,
256     const uint64 start_gputime_ns);
257 
258 }  // namespace profiler
259 }  // namespace tensorflow
260 
261 #endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_CUPTI_COLLECTOR_H_
262