1 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_CUPTI_COLLECTOR_H_
17 #define TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_CUPTI_COLLECTOR_H_
18
#include <limits>
#include <memory>
#include <string>

#include "absl/container/fixed_array.h"
#include "absl/container/flat_hash_map.h"
#include "absl/container/node_hash_set.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
#include "absl/synchronization/mutex.h"
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/status.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/protobuf/xplane.pb.h"
30
31 namespace tensorflow {
32 namespace profiler {
33
34 struct MemcpyDetails {
35 // The amount of data copied for memcpy events.
36 size_t num_bytes;
37 // The destination device for peer-2-peer communication (memcpy). The source
38 // device is implicit: it's the current device.
39 uint32 destination;
40 // Whether or not the memcpy is asynchronous.
41 bool async;
42 // This contains CUpti_ActivityMemcpyKind for activity event (on device).
43 // For events from other CuptiTracerEventSource, it is always 0.
44 int8 kind;
45 // CUpti_ActivityMemoryKind of source.
46 int8 src_mem_kind;
47 // CUpti_ActivityMemoryKind of destination.
48 int8 dst_mem_kind;
49 };
50
51 struct MemAllocDetails {
52 // Size of memory to be written over in bytes.
53 size_t num_bytes;
54 // The CUpti_ActivityMemoryKind value for this activity event.
55 int8 kind;
56 // The virtual address of allocation. 0 if it is a free operation.
57 uint64 address;
58 };
59
60 using MemFreeDetails = MemAllocDetails;
61
62 // Memory residency contains details read from CUpti_ActivityMemory type. This
63 // is populated in the CUPTI tracer encounters a CUPTI_ACTIVITY_KIND_MEMORY
64 // event. The start of this even corresponse to a cudaMalloc, and the end
65 // corresponds to a cudaFree.
66 using MemoryResidencyDetails = MemAllocDetails;
67
68 struct MemsetDetails {
69 // Size of memory to be written over in bytes.
70 size_t num_bytes;
71 // The CUpti_ActivityMemoryKind value for this activity event.
72 int8 kind;
73 // Whether or not the memset is asynchronous.
74 bool async;
75 };
76
77 struct KernelDetails {
78 // The number of registers used in this kernel.
79 uint32 registers_per_thread;
80 // The amount of shared memory space used by a thread block.
81 uint32 static_shared_memory_usage;
82 // The amount of dynamic memory space used by a thread block.
83 uint32 dynamic_shared_memory_usage;
84 // X-dimension of a thread block.
85 uint32 block_x;
86 // Y-dimension of a thread block.
87 uint32 block_y;
88 // Z-dimension of a thread block.
89 uint32 block_z;
90 // X-dimension of a grid.
91 uint32 grid_x;
92 // Y-dimension of a grid.
93 uint32 grid_y;
94 // Z-dimension of a grid.
95 uint32 grid_z;
96 };
97
ToXStat(const KernelDetails & kernel_info,double occupancy_pct)98 inline std::string ToXStat(const KernelDetails& kernel_info,
99 double occupancy_pct) {
100 return absl::StrCat(
101 "regs:", kernel_info.registers_per_thread,
102 " static_shared:", kernel_info.static_shared_memory_usage,
103 " dynamic_shared:", kernel_info.dynamic_shared_memory_usage,
104 " grid:", kernel_info.grid_x, ",", kernel_info.grid_y, ",",
105 kernel_info.grid_z, " block:", kernel_info.block_x, ",",
106 kernel_info.block_y, ",", kernel_info.block_z,
107 " occ_pct:", occupancy_pct);
108 }
109
110 // Gets the name of the CUpti_ActivityMemoryKind value.
111 absl::string_view GetMemoryKindName(int8 kind);
112
// Kind of a CuptiTracerEvent. It also selects which member of the event's
// details union is meaningful. Numeric values are assigned explicitly.
enum class CuptiTracerEventType {
  Unsupported = 0,
  Kernel = 1,
  // Memcpy variants.
  MemcpyH2D = 2,
  MemcpyD2H = 3,
  MemcpyD2D = 4,
  MemcpyP2P = 5,
  MemcpyOther = 6,
  MemoryAlloc = 7,
  Overhead = 8,
  UnifiedMemory = 9,
  MemoryFree = 10,
  Memset = 11,
  MemoryResidency = 12,
  // Note the gap: Generic is deliberately numbered 100.
  Generic = 100,
};
129
130 const char* GetTraceEventTypeName(const CuptiTracerEventType& type);
131
// Identifies which CUPTI mechanism produced a CuptiTracerEvent.
enum class CuptiTracerEventSource {
  Invalid = 0,
  DriverCallback = 1,
  Activity = 2,
  // Runtime callback and metric API sources may be added in the future.
};
138
139 struct CuptiTracerEvent {
140 static constexpr uint32 kInvalidThreadId =
141 std::numeric_limits<uint32_t>::max();
142 static constexpr uint32 kInvalidCorrelationId =
143 std::numeric_limits<uint32_t>::max();
144 static constexpr uint64 kInvalidContextId =
145 std::numeric_limits<uint64_t>::max();
146 static constexpr uint64 kInvalidStreamId =
147 std::numeric_limits<uint64_t>::max();
148 CuptiTracerEventType type = CuptiTracerEventType::Unsupported;
149 CuptiTracerEventSource source = CuptiTracerEventSource::Invalid;
150 // Although CUpti_CallbackData::functionName is persistent, however
151 // CUpti_ActivityKernel4::name is not persistent, therefore we need a copy of
152 // it.
153 std::string name;
154 // This points to strings in AnnotationMap, which should outlive the point
155 // where serialization happens.
156 absl::string_view annotation;
157 absl::string_view nvtx_range;
158 uint64 start_time_ns = 0;
159 uint64 end_time_ns = 0;
160 uint32 device_id = 0;
161 uint32 correlation_id = kInvalidCorrelationId;
162 uint32 thread_id = kInvalidThreadId;
163 int64 context_id = kInvalidContextId;
164 int64 stream_id = kInvalidStreamId;
165 union {
166 // For Memcpy API and activities. `type` must be Memcpy*.
167 MemcpyDetails memcpy_info;
168 // Used for MemAlloc API. `type` must be MemoryAlloc.
169 MemAllocDetails memalloc_info;
170 // Used for kernel activities. `type` must be Kernel.
171 KernelDetails kernel_info;
172 // Used for MemFree activities. `type` must be MemoryFree.
173 MemFreeDetails memfree_info;
174 // Used for Memset API and activities. `type` must be Memset.
175 MemsetDetails memset_info;
176 // Used for Memory residency activities. `type` must be MemoryResidency.
177 MemoryResidencyDetails memory_residency_info;
178 };
179 };
180
181 struct CuptiTracerCollectorOptions {
182 // Maximum number of events to collect from callback API; if -1, no limit.
183 // if 0, the callback API is enabled to build a correlation map, but no
184 // events are collected.
185 uint64 max_callback_api_events = 2 * 1024 * 1024;
186 // Maximum number of events to collect from activity API; if -1, no limit.
187 uint64 max_activity_api_events = 2 * 1024 * 1024;
188 // Maximum number of annotation strings that we can accommodate.
189 uint64 max_annotation_strings = 1024 * 1024;
190 // Number of GPUs involved.
191 uint32 num_gpus;
192 };
193
194 class AnnotationMap {
195 public:
196 struct AnnotationInfo {
197 absl::string_view annotation;
198 absl::string_view nvtx_range;
199 };
200
AnnotationMap(uint64 max_size,uint32 num_gpus)201 explicit AnnotationMap(uint64 max_size, uint32 num_gpus)
202 : max_size_(max_size), per_device_map_(num_gpus) {}
203 void Add(uint32 device_id, uint32 correlation_id,
204 const absl::string_view annotation,
205 const absl::string_view nvtx_range);
206 AnnotationInfo LookUp(uint32 device_id, uint32 correlation_id);
207
208 private:
209 struct PerDeviceAnnotationMap {
210 // The population/consumption of annotations might happen from multiple
211 // callback/activity api related threads.
212 absl::Mutex mutex;
213 // Annotation tends to be repetitive, use a hash_set to store the strings,
214 // an use the reference to the string in the map.
215 absl::node_hash_set<std::string> annotations;
216 absl::node_hash_set<std::string> nvtx_ranges;
217 absl::flat_hash_map<uint32, AnnotationInfo> correlation_map;
218 };
219 const uint64 max_size_;
220 absl::FixedArray<PerDeviceAnnotationMap> per_device_map_;
221
222 TF_DISALLOW_COPY_AND_ASSIGN(AnnotationMap);
223 };
224
225 class CuptiTraceCollector {
226 public:
CuptiTraceCollector(const CuptiTracerCollectorOptions & options)227 explicit CuptiTraceCollector(const CuptiTracerCollectorOptions& options)
228 : options_(options),
229 annotation_map_(options.max_annotation_strings, options.num_gpus) {}
~CuptiTraceCollector()230 virtual ~CuptiTraceCollector() {}
231
232 // Producer side functions (i.e. called by CuptiTracer).
233 virtual void AddEvent(CuptiTracerEvent&& event) = 0;
234 virtual void OnEventsDropped(const std::string& reason,
235 uint32 num_events) = 0;
236 virtual void Flush() = 0;
237
238 // Consumer side functions (i.e. called by GPU tracer);
Export(StepStats * step_stats)239 virtual void Export(StepStats* step_stats) {}
Export(XSpace * space,uint64 end_gpu_ns)240 virtual bool Export(XSpace* space, uint64 end_gpu_ns) { return true; }
ReportNumEventsIfDropped()241 virtual std::string ReportNumEventsIfDropped() { return ""; }
242
annotation_map()243 AnnotationMap* annotation_map() { return &annotation_map_; }
244
245 protected:
246 CuptiTracerCollectorOptions options_;
247
248 private:
249 AnnotationMap annotation_map_;
250
251 TF_DISALLOW_COPY_AND_ASSIGN(CuptiTraceCollector);
252 };
253
254 std::unique_ptr<CuptiTraceCollector> CreateCuptiCollector(
255 const CuptiTracerCollectorOptions& options, const uint64 start_walltime_ns,
256 const uint64 start_gputime_ns);
257
258 } // namespace profiler
259 } // namespace tensorflow
260
261 #endif // TENSORFLOW_CORE_PROFILER_INTERNAL_GPU_CUPTI_COLLECTOR_H_
262