• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_CUPTI_COLLECTOR_H_
17 #define TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_CUPTI_COLLECTOR_H_
18 
#include <limits>
#include <memory>
#include <string>

#include "absl/container/fixed_array.h"
#include "absl/container/flat_hash_map.h"
#include "absl/container/node_hash_set.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
#include "absl/synchronization/mutex.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/status.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/protobuf/xplane.pb.h"
29 
30 namespace tensorflow {
31 namespace profiler {
32 
33 struct MemcpyDetails {
34   // The amount of data copied for memcpy events.
35   size_t num_bytes;
36   // The destination device for peer-2-peer communication (memcpy). The source
37   // device is implicit: it's the current device.
38   uint32 destination;
39   // Whether or not the memcpy is asynchronous.
40   bool async;
41   // This contains CUpti_ActivityMemcpyKind for activity event (on device).
42   // For events from other CuptiTracerEventSource, it is always 0.
43   int8 copy_kind;
44   // CUpti_ActivityMemoryKind of source.
45   int8 src_mem_kind;
46   // CUpti_ActivityMemoryKind of destination.
47   int8 dst_mem_kind;
48 };
49 
50 struct MemAllocDetails {
51   // Size of memory to be written over in bytes.
52   size_t num_bytes;
53   // The CUpti_ActivityMemoryKind value for this activity event.
54   int8 mem_kind;
55   // The virtual address of allocation. 0 if it is a free operation.
56   uint64 address;
57 };
58 
59 using MemFreeDetails = MemAllocDetails;
60 
61 // Memory residency contains details read from CUpti_ActivityMemory type. This
62 // is populated in the CUPTI tracer encounters a CUPTI_ACTIVITY_KIND_MEMORY
63 // event. The start of this even corresponse to a cudaMalloc, and the end
64 // corresponds to a cudaFree.
65 using MemoryResidencyDetails = MemAllocDetails;
66 
67 struct MemsetDetails {
68   // Size of memory to be written over in bytes.
69   size_t num_bytes;
70   // The CUpti_ActivityMemoryKind value for this activity event.
71   int8 mem_kind;
72   // Whether or not the memset is asynchronous.
73   bool async;
74 };
75 
76 struct KernelDetails {
77   // The number of registers used in this kernel.
78   uint32 registers_per_thread;
79   // The amount of shared memory space used by a thread block.
80   uint32 static_shared_memory_usage;
81   // The amount of dynamic memory space used by a thread block.
82   uint32 dynamic_shared_memory_usage;
83   // X-dimension of a thread block.
84   uint32 block_x;
85   // Y-dimension of a thread block.
86   uint32 block_y;
87   // Z-dimension of a thread block.
88   uint32 block_z;
89   // X-dimension of a grid.
90   uint32 grid_x;
91   // Y-dimension of a grid.
92   uint32 grid_y;
93   // Z-dimension of a grid.
94   uint32 grid_z;
95 };
96 
ToXStat(const KernelDetails & kernel_info,double occupancy_pct)97 inline std::string ToXStat(const KernelDetails& kernel_info,
98                            double occupancy_pct) {
99   return absl::StrCat(
100       "regs:", kernel_info.registers_per_thread,
101       " static_shared:", kernel_info.static_shared_memory_usage,
102       " dynamic_shared:", kernel_info.dynamic_shared_memory_usage,
103       " grid:", kernel_info.grid_x, ",", kernel_info.grid_y, ",",
104       kernel_info.grid_z, " block:", kernel_info.block_x, ",",
105       kernel_info.block_y, ",", kernel_info.block_z,
106       " occ_pct:", occupancy_pct);
107 }
108 
109 // Gets the name of the CUpti_ActivityMemoryKind value.
110 absl::string_view GetMemoryKindName(int8_t memory_kind);
111 
// Kind of a CuptiTracerEvent. The numeric values are spelled out explicitly
// and must not be changed.
enum class CuptiTracerEventType {
  Unsupported = 0,
  Kernel = 1,
  // Memory copies, split by direction.
  MemcpyH2D = 2,
  MemcpyD2H = 3,
  MemcpyD2D = 4,
  MemcpyP2P = 5,
  MemcpyOther = 6,
  // Everything else.
  MemoryAlloc = 7,
  Overhead = 8,
  UnifiedMemory = 9,
  MemoryFree = 10,
  Memset = 11,
  MemoryResidency = 12,
  Generic = 100,
};
128 
129 const char* GetTraceEventTypeName(const CuptiTracerEventType& type);
130 
// Origin of a CuptiTracerEvent.
enum class CuptiTracerEventSource {
  Invalid = 0,
  DriverCallback = 1,
  Activity = 2,
  // Runtime callback and metric API sources may be added in the future.
};
137 
138 struct CuptiTracerEvent {
139   static constexpr uint32 kInvalidThreadId =
140       std::numeric_limits<uint32_t>::max();
141   static constexpr uint32 kInvalidCorrelationId =
142       std::numeric_limits<uint32_t>::max();
143   static constexpr uint64 kInvalidContextId =
144       std::numeric_limits<uint64_t>::max();
145   static constexpr uint64 kInvalidStreamId =
146       std::numeric_limits<uint64_t>::max();
147   CuptiTracerEventType type = CuptiTracerEventType::Unsupported;
148   CuptiTracerEventSource source = CuptiTracerEventSource::Invalid;
149   // Although CUpti_CallbackData::functionName is persistent, however
150   // CUpti_ActivityKernel4::name is not persistent, therefore we need a copy of
151   // it.
152   std::string name;
153   // This points to strings in AnnotationMap, which should outlive the point
154   // where serialization happens.
155   absl::string_view annotation;
156   absl::string_view nvtx_range;
157   uint64 start_time_ns = 0;
158   uint64 end_time_ns = 0;
159   uint32 device_id = 0;
160   uint32 correlation_id = kInvalidCorrelationId;
161   uint32 thread_id = kInvalidThreadId;
162   int64 context_id = kInvalidContextId;
163   int64 stream_id = kInvalidStreamId;
164   union {
165     // For Memcpy API and activities. `type` must be Memcpy*.
166     MemcpyDetails memcpy_info;
167     // Used for MemAlloc API. `type` must be MemoryAlloc.
168     MemAllocDetails memalloc_info;
169     // Used for kernel activities. `type` must be Kernel.
170     KernelDetails kernel_info;
171     // Used for MemFree activities. `type` must be MemoryFree.
172     MemFreeDetails memfree_info;
173     // Used for Memset API and activities. `type` must be Memset.
174     MemsetDetails memset_info;
175     // Used for Memory residency activities. `type` must be MemoryResidency.
176     MemoryResidencyDetails memory_residency_info;
177   };
178 };
179 
180 struct CuptiTracerCollectorOptions {
181   // Maximum number of events to collect from callback API; if -1, no limit.
182   // if 0, the callback API is enabled to build a correlation map, but no
183   // events are collected.
184   uint64 max_callback_api_events = 2 * 1024 * 1024;
185   // Maximum number of events to collect from activity API; if -1, no limit.
186   uint64 max_activity_api_events = 2 * 1024 * 1024;
187   // Maximum number of annotation strings that we can accommodate.
188   uint64 max_annotation_strings = 1024 * 1024;
189   // Number of GPUs involved.
190   uint32 num_gpus;
191 };
192 
193 class AnnotationMap {
194  public:
195   struct AnnotationInfo {
196     absl::string_view annotation;
197     absl::string_view nvtx_range;
198   };
199 
AnnotationMap(uint64 max_size,uint32 num_gpus)200   explicit AnnotationMap(uint64 max_size, uint32 num_gpus)
201       : max_size_(max_size), per_device_map_(num_gpus) {}
202   void Add(uint32 device_id, uint32 correlation_id,
203            const absl::string_view annotation,
204            const absl::string_view nvtx_range);
205   AnnotationInfo LookUp(uint32 device_id, uint32 correlation_id);
206 
207  private:
208   struct PerDeviceAnnotationMap {
209     // The population/consumption of annotations might happen from multiple
210     // callback/activity api related threads.
211     absl::Mutex mutex;
212     // Annotation tends to be repetitive, use a hash_set to store the strings,
213     // an use the reference to the string in the map.
214     absl::node_hash_set<std::string> annotations;
215     absl::node_hash_set<std::string> nvtx_ranges;
216     absl::flat_hash_map<uint32, AnnotationInfo> correlation_map;
217   };
218   const uint64 max_size_;
219   absl::FixedArray<PerDeviceAnnotationMap> per_device_map_;
220 
221   TF_DISALLOW_COPY_AND_ASSIGN(AnnotationMap);
222 };
223 
224 class CuptiTraceCollector {
225  public:
CuptiTraceCollector(const CuptiTracerCollectorOptions & options)226   explicit CuptiTraceCollector(const CuptiTracerCollectorOptions& options)
227       : options_(options),
228         annotation_map_(options.max_annotation_strings, options.num_gpus) {}
~CuptiTraceCollector()229   virtual ~CuptiTraceCollector() {}
230 
231   // Producer side functions (i.e. called by CuptiTracer).
232   virtual void AddEvent(CuptiTracerEvent&& event) = 0;
233   virtual void OnEventsDropped(const std::string& reason,
234                                uint32 num_events) = 0;
235   virtual void Flush() = 0;
236 
237   // Consumer side functions (i.e. called by GPU tracer);
Export(XSpace * space,uint64 end_gpu_ns)238   virtual bool Export(XSpace* space, uint64 end_gpu_ns) { return true; }
ReportNumEventsIfDropped()239   virtual std::string ReportNumEventsIfDropped() { return ""; }
240 
annotation_map()241   AnnotationMap* annotation_map() { return &annotation_map_; }
242 
243  protected:
244   CuptiTracerCollectorOptions options_;
245 
246  private:
247   AnnotationMap annotation_map_;
248 
249   TF_DISALLOW_COPY_AND_ASSIGN(CuptiTraceCollector);
250 };
251 
252 std::unique_ptr<CuptiTraceCollector> CreateCuptiCollector(
253     const CuptiTracerCollectorOptions& options, const uint64 start_walltime_ns,
254     const uint64 start_gputime_ns);
255 
256 }  // namespace profiler
257 }  // namespace tensorflow
258 
259 #endif  // TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_CUPTI_COLLECTOR_H_
260