1 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_CUPTI_COLLECTOR_H_
17 #define TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_CUPTI_COLLECTOR_H_
18
#include <cstddef>
#include <cstdint>
#include <limits>
#include <memory>
#include <string>

#include "absl/container/fixed_array.h"
#include "absl/container/flat_hash_map.h"
#include "absl/container/node_hash_set.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/status.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/protobuf/xplane.pb.h"
29
30 namespace tensorflow {
31 namespace profiler {
32
33 struct MemcpyDetails {
34 // The amount of data copied for memcpy events.
35 size_t num_bytes;
36 // The destination device for peer-2-peer communication (memcpy). The source
37 // device is implicit: it's the current device.
38 uint32 destination;
39 // Whether or not the memcpy is asynchronous.
40 bool async;
41 // This contains CUpti_ActivityMemcpyKind for activity event (on device).
42 // For events from other CuptiTracerEventSource, it is always 0.
43 int8 copy_kind;
44 // CUpti_ActivityMemoryKind of source.
45 int8 src_mem_kind;
46 // CUpti_ActivityMemoryKind of destination.
47 int8 dst_mem_kind;
48 };
49
50 struct MemAllocDetails {
51 // Size of memory to be written over in bytes.
52 size_t num_bytes;
53 // The CUpti_ActivityMemoryKind value for this activity event.
54 int8 mem_kind;
55 // The virtual address of allocation. 0 if it is a free operation.
56 uint64 address;
57 };
58
59 using MemFreeDetails = MemAllocDetails;
60
61 // Memory residency contains details read from CUpti_ActivityMemory type. This
62 // is populated in the CUPTI tracer encounters a CUPTI_ACTIVITY_KIND_MEMORY
63 // event. The start of this even corresponse to a cudaMalloc, and the end
64 // corresponds to a cudaFree.
65 using MemoryResidencyDetails = MemAllocDetails;
66
67 struct MemsetDetails {
68 // Size of memory to be written over in bytes.
69 size_t num_bytes;
70 // The CUpti_ActivityMemoryKind value for this activity event.
71 int8 mem_kind;
72 // Whether or not the memset is asynchronous.
73 bool async;
74 };
75
76 struct KernelDetails {
77 // The number of registers used in this kernel.
78 uint32 registers_per_thread;
79 // The amount of shared memory space used by a thread block.
80 uint32 static_shared_memory_usage;
81 // The amount of dynamic memory space used by a thread block.
82 uint32 dynamic_shared_memory_usage;
83 // X-dimension of a thread block.
84 uint32 block_x;
85 // Y-dimension of a thread block.
86 uint32 block_y;
87 // Z-dimension of a thread block.
88 uint32 block_z;
89 // X-dimension of a grid.
90 uint32 grid_x;
91 // Y-dimension of a grid.
92 uint32 grid_y;
93 // Z-dimension of a grid.
94 uint32 grid_z;
95 };
96
ToXStat(const KernelDetails & kernel_info,double occupancy_pct)97 inline std::string ToXStat(const KernelDetails& kernel_info,
98 double occupancy_pct) {
99 return absl::StrCat(
100 "regs:", kernel_info.registers_per_thread,
101 " static_shared:", kernel_info.static_shared_memory_usage,
102 " dynamic_shared:", kernel_info.dynamic_shared_memory_usage,
103 " grid:", kernel_info.grid_x, ",", kernel_info.grid_y, ",",
104 kernel_info.grid_z, " block:", kernel_info.block_x, ",",
105 kernel_info.block_y, ",", kernel_info.block_z,
106 " occ_pct:", occupancy_pct);
107 }
108
// Gets the name of the CUpti_ActivityMemoryKind value `memory_kind`.
// Only declared here; the mapping itself lives in the implementation file.
absl::string_view GetMemoryKindName(int8_t memory_kind);
111
// Category of a CuptiTracerEvent. The category also selects which member of
// the anonymous union in CuptiTracerEvent is meaningful (see that struct).
// Numeric values are spelled out explicitly and must stay as they are.
enum class CuptiTracerEventType {
  Unsupported = 0,

  // Kernel activities; paired with `kernel_info`.
  Kernel = 1,

  // Memcpy* variants; paired with `memcpy_info`.
  MemcpyH2D = 2,
  MemcpyD2H = 3,
  MemcpyD2D = 4,
  MemcpyP2P = 5,
  MemcpyOther = 6,

  // Paired with `memalloc_info`.
  MemoryAlloc = 7,

  Overhead = 8,
  UnifiedMemory = 9,

  // Paired with `memfree_info`.
  MemoryFree = 10,
  // Paired with `memset_info`.
  Memset = 11,
  // Paired with `memory_residency_info`.
  MemoryResidency = 12,

  Generic = 100,
};

// Returns a C string naming `type`. Only declared here; defined in the
// implementation file.
const char* GetTraceEventTypeName(const CuptiTracerEventType& type);
130
// Identifies which CUPTI mechanism produced a CuptiTracerEvent.
enum class CuptiTracerEventSource {
  // Default for an event whose source has not been set.
  Invalid = 0,
  DriverCallback = 1,
  Activity = 2,
  // Runtime callback and metric API sources may be added in the future.
};
137
138 struct CuptiTracerEvent {
139 static constexpr uint32 kInvalidThreadId =
140 std::numeric_limits<uint32_t>::max();
141 static constexpr uint32 kInvalidCorrelationId =
142 std::numeric_limits<uint32_t>::max();
143 static constexpr uint64 kInvalidContextId =
144 std::numeric_limits<uint64_t>::max();
145 static constexpr uint64 kInvalidStreamId =
146 std::numeric_limits<uint64_t>::max();
147 CuptiTracerEventType type = CuptiTracerEventType::Unsupported;
148 CuptiTracerEventSource source = CuptiTracerEventSource::Invalid;
149 // Although CUpti_CallbackData::functionName is persistent, however
150 // CUpti_ActivityKernel4::name is not persistent, therefore we need a copy of
151 // it.
152 std::string name;
153 // This points to strings in AnnotationMap, which should outlive the point
154 // where serialization happens.
155 absl::string_view annotation;
156 absl::string_view nvtx_range;
157 uint64 start_time_ns = 0;
158 uint64 end_time_ns = 0;
159 uint32 device_id = 0;
160 uint32 correlation_id = kInvalidCorrelationId;
161 uint32 thread_id = kInvalidThreadId;
162 int64 context_id = kInvalidContextId;
163 int64 stream_id = kInvalidStreamId;
164 union {
165 // For Memcpy API and activities. `type` must be Memcpy*.
166 MemcpyDetails memcpy_info;
167 // Used for MemAlloc API. `type` must be MemoryAlloc.
168 MemAllocDetails memalloc_info;
169 // Used for kernel activities. `type` must be Kernel.
170 KernelDetails kernel_info;
171 // Used for MemFree activities. `type` must be MemoryFree.
172 MemFreeDetails memfree_info;
173 // Used for Memset API and activities. `type` must be Memset.
174 MemsetDetails memset_info;
175 // Used for Memory residency activities. `type` must be MemoryResidency.
176 MemoryResidencyDetails memory_residency_info;
177 };
178 };
179
180 struct CuptiTracerCollectorOptions {
181 // Maximum number of events to collect from callback API; if -1, no limit.
182 // if 0, the callback API is enabled to build a correlation map, but no
183 // events are collected.
184 uint64 max_callback_api_events = 2 * 1024 * 1024;
185 // Maximum number of events to collect from activity API; if -1, no limit.
186 uint64 max_activity_api_events = 2 * 1024 * 1024;
187 // Maximum number of annotation strings that we can accommodate.
188 uint64 max_annotation_strings = 1024 * 1024;
189 // Number of GPUs involved.
190 uint32 num_gpus;
191 };
192
193 class AnnotationMap {
194 public:
195 struct AnnotationInfo {
196 absl::string_view annotation;
197 absl::string_view nvtx_range;
198 };
199
AnnotationMap(uint64 max_size,uint32 num_gpus)200 explicit AnnotationMap(uint64 max_size, uint32 num_gpus)
201 : max_size_(max_size), per_device_map_(num_gpus) {}
202 void Add(uint32 device_id, uint32 correlation_id,
203 const absl::string_view annotation,
204 const absl::string_view nvtx_range);
205 AnnotationInfo LookUp(uint32 device_id, uint32 correlation_id);
206
207 private:
208 struct PerDeviceAnnotationMap {
209 // The population/consumption of annotations might happen from multiple
210 // callback/activity api related threads.
211 absl::Mutex mutex;
212 // Annotation tends to be repetitive, use a hash_set to store the strings,
213 // an use the reference to the string in the map.
214 absl::node_hash_set<std::string> annotations;
215 absl::node_hash_set<std::string> nvtx_ranges;
216 absl::flat_hash_map<uint32, AnnotationInfo> correlation_map;
217 };
218 const uint64 max_size_;
219 absl::FixedArray<PerDeviceAnnotationMap> per_device_map_;
220
221 TF_DISALLOW_COPY_AND_ASSIGN(AnnotationMap);
222 };
223
224 class CuptiTraceCollector {
225 public:
CuptiTraceCollector(const CuptiTracerCollectorOptions & options)226 explicit CuptiTraceCollector(const CuptiTracerCollectorOptions& options)
227 : options_(options),
228 annotation_map_(options.max_annotation_strings, options.num_gpus) {}
~CuptiTraceCollector()229 virtual ~CuptiTraceCollector() {}
230
231 // Producer side functions (i.e. called by CuptiTracer).
232 virtual void AddEvent(CuptiTracerEvent&& event) = 0;
233 virtual void OnEventsDropped(const std::string& reason,
234 uint32 num_events) = 0;
235 virtual void Flush() = 0;
236
237 // Consumer side functions (i.e. called by GPU tracer);
Export(XSpace * space,uint64 end_gpu_ns)238 virtual bool Export(XSpace* space, uint64 end_gpu_ns) { return true; }
ReportNumEventsIfDropped()239 virtual std::string ReportNumEventsIfDropped() { return ""; }
240
annotation_map()241 AnnotationMap* annotation_map() { return &annotation_map_; }
242
243 protected:
244 CuptiTracerCollectorOptions options_;
245
246 private:
247 AnnotationMap annotation_map_;
248
249 TF_DISALLOW_COPY_AND_ASSIGN(CuptiTraceCollector);
250 };
251
252 std::unique_ptr<CuptiTraceCollector> CreateCuptiCollector(
253 const CuptiTracerCollectorOptions& options, const uint64 start_walltime_ns,
254 const uint64 start_gputime_ns);
255
256 } // namespace profiler
257 } // namespace tensorflow
258
259 #endif // TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_CUPTI_COLLECTOR_H_
260