/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/profiler/internal/gpu/cupti_collector.h"

#include <atomic>
#include <climits>
#include <cstring>
#include <string>
#include <utility>
#include <vector>

#include "absl/container/fixed_array.h"
#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
#include "absl/hash/hash.h"
#include "absl/memory/memory.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_join.h"
#include "absl/types/optional.h"
#include "third_party/gpus/cuda/extras/CUPTI/include/cupti_activity.h"
#include "third_party/gpus/cuda/include/cuda.h"
#include "third_party/gpus/cuda/include/cuda_occupancy.h"
#include "tensorflow/core/platform/abi.h"
#include "tensorflow/core/platform/host_info.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/profiler/utils/parse_annotation.h"
#include "tensorflow/core/profiler/utils/xplane_builder.h"
#include "tensorflow/core/profiler/utils/xplane_schema.h"
#include "tensorflow/core/profiler/utils/xplane_utils.h"

namespace tensorflow {
namespace profiler {

namespace {

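// Decides which trace line an event belongs on: host events map to a line
// keyed by thread id, device events to a line keyed by stream id. The chosen
// id is written to *line_id.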
bool IsHostEvent(const CuptiTracerEvent& event, int64* line_id) {
  // DriverCallback (i.e. kernel launching) events are host events.
  if (event.source == CuptiTracerEventSource::DriverCallback) {
    *line_id = event.thread_id;
    return true;
  }
  // Non-overhead activity events are device events.
  if (event.type != CuptiTracerEventType::Overhead) {
    *line_id = event.stream_id;
    return false;
  }
  // Overhead events can be associated with either a thread or a stream.
  // A valid stream id makes it a device event; otherwise a valid, non-zero
  // thread id makes it a host event.
  if (event.stream_id != CuptiTracerEvent::kInvalidStreamId) {
    *line_id = event.stream_id;
    return false;
  } else if (event.thread_id != CuptiTracerEvent::kInvalidThreadId &&
             event.thread_id != 0) {
    *line_id = event.thread_id;
    return true;
  } else {
    *line_id = kThreadIdOverhead;
    return false;
  }
}

struct DeviceOccupancyParams {
  cudaOccFuncAttributes attributes = {};
  int block_size = 0;
  size_t dynamic_smem_size = 0;

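  // Byte-wise equality: assumes instances are zero-initialized (as in the
  // usage below) so that any padding bytes compare equal.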
  friend bool operator==(const DeviceOccupancyParams& lhs,
                         const DeviceOccupancyParams& rhs) {
    return 0 == memcmp(&lhs, &rhs, sizeof(lhs));
  }

  template <typename H>
  friend H AbslHashValue(H hash_state, const DeviceOccupancyParams& params) {
    return H::combine(
        std::move(hash_state), params.attributes.maxThreadsPerBlock,
        params.attributes.numRegs, params.attributes.sharedSizeBytes,
        static_cast<uint32_t>(params.attributes.partitionedGCConfig),
        static_cast<uint32_t>(params.attributes.shmemLimitConfig),
        params.attributes.maxDynamicSharedSizeBytes, params.block_size,
        params.dynamic_smem_size);
  }
};

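// Theoretical occupancy results from the CUDA occupancy calculator, cached
// per launch configuration.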
struct OccupancyStats {
  double occupancy_pct = 0.0;
  int min_grid_size = 0;
  int suggested_block_size = 0;
};

class PerDeviceCollector {
 private:
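  // Computes theoretical occupancy for one kernel launch configuration via
  // the CUDA occupancy calculator. Returns zeroed stats if the device
  // properties are not yet known or if either calculator call fails.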
  OccupancyStats GetOccupancy(const DeviceOccupancyParams& params) const {
    OccupancyStats stats;
    if (device_properties_.computeMajor == 0) {
      return {};
    }

    const cudaOccDeviceState state = {};
    cudaOccResult occ_result;
    cudaOccError status = cudaOccMaxActiveBlocksPerMultiprocessor(
        &occ_result, &device_properties_, &params.attributes, &state,
        params.block_size, params.dynamic_smem_size);
    if (status != CUDA_OCC_SUCCESS) {
      return {};
    }

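    // Theoretical occupancy (%) = resident threads per SM / max threads per
    // SM, where resident threads per SM = active blocks per SM * block size.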
    stats.occupancy_pct =
        occ_result.activeBlocksPerMultiprocessor * params.block_size * 100;
    stats.occupancy_pct /= device_properties_.maxThreadsPerMultiprocessor;

    status = cudaOccMaxPotentialOccupancyBlockSize(
        &stats.min_grid_size, &stats.suggested_block_size, &device_properties_,
        &params.attributes, &state, NULL, params.dynamic_smem_size);
    if (status != CUDA_OCC_SUCCESS) {
      return {};
    }

    return stats;
  }

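  // Converts one CuptiTracerEvent into an XEvent on the given line, skipping
  // events whose timestamps fall outside [start_gpu_ns, end_gpu_ns], and
  // attaches type-specific stats (kernel occupancy, memcpy/memalloc/memset
  // details) plus any parsed annotations.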
  void CreateXEvent(const CuptiTracerEvent& event, XPlaneBuilder* plane,
                    uint64 start_gpu_ns, uint64 end_gpu_ns,
                    XLineBuilder* line) {
    if (event.start_time_ns < start_gpu_ns || event.end_time_ns > end_gpu_ns ||
        event.start_time_ns > event.end_time_ns) {
      VLOG(2) << "Event has abnormal timestamps: " << event.name
              << " start time(ns): " << event.start_time_ns
              << " end time(ns): " << event.end_time_ns;
      return;
    }
    std::string kernel_name = port::MaybeAbiDemangle(event.name.c_str());
    if (kernel_name.empty()) {
      kernel_name = GetTraceEventTypeName(event.type);
    }
    XEventMetadata* event_metadata =
        plane->GetOrCreateEventMetadata(std::move(kernel_name));
    XEventBuilder xevent = line->AddEvent(*event_metadata);
    VLOG(7) << "Adding event to line=" << line->Id();
    xevent.SetTimestampNs(event.start_time_ns);
    xevent.SetEndTimestampNs(event.end_time_ns);
    if (event.source == CuptiTracerEventSource::DriverCallback) {
      xevent.AddStatValue(
          *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kDeviceId)),
          event.device_id);
    }
    if (event.correlation_id != CuptiTracerEvent::kInvalidCorrelationId) {
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
                              GetStatTypeStr(StatType::kCorrelationId)),
                          event.correlation_id);
    }
    if (!event.annotation.empty()) {
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
                              GetStatTypeStr(StatType::kKernelAnnotation)),
                          *plane->GetOrCreateStatMetadata(event.annotation));
    }
    if (!event.nvtx_range.empty()) {
      xevent.AddStatValue(
          *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kNVTXRange)),
          *plane->GetOrCreateStatMetadata(event.nvtx_range));
    }
    if (event.context_id != CuptiTracerEvent::kInvalidContextId) {
      xevent.AddStatValue(
          *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kContextId)),
          absl::StrCat("$$", static_cast<uint64>(event.context_id)));
    }

    if (event.type == CuptiTracerEventType::Kernel &&
        event.source == CuptiTracerEventSource::Activity) {
      DeviceOccupancyParams params{};
      params.attributes.maxThreadsPerBlock = INT_MAX;
      params.attributes.numRegs =
          static_cast<int>(event.kernel_info.registers_per_thread);
      params.attributes.sharedSizeBytes =
          event.kernel_info.static_shared_memory_usage;
      params.attributes.partitionedGCConfig = PARTITIONED_GC_OFF;
      params.attributes.shmemLimitConfig = FUNC_SHMEM_LIMIT_DEFAULT;
      params.attributes.maxDynamicSharedSizeBytes = 0;
      params.block_size = static_cast<int>(event.kernel_info.block_x *
                                           event.kernel_info.block_y *
                                           event.kernel_info.block_z);

      params.dynamic_smem_size = event.kernel_info.dynamic_shared_memory_usage;

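      // Occupancy is cached per launch configuration, so repeated launches of
      // the same kernel compute it only once. Note that 0.0 doubles as the
      // "not yet computed" sentinel, so a kernel whose true occupancy is 0%
      // is recomputed on every lookup.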
      OccupancyStats& occ_stats = occupancy_cache_[params];
      if (occ_stats.occupancy_pct == 0.0) {
        occ_stats = GetOccupancy(params);
      }
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(GetStatTypeStr(
                              StatType::kTheoreticalOccupancyPct)),
                          occ_stats.occupancy_pct);
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(GetStatTypeStr(
                              StatType::kOccupancyMinGridSize)),
                          static_cast<int32>(occ_stats.min_grid_size));
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(GetStatTypeStr(
                              StatType::kOccupancySuggestedBlockSize)),
                          static_cast<int32>(occ_stats.suggested_block_size));
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
                              GetStatTypeStr(StatType::kKernelDetails)),
                          *plane->GetOrCreateStatMetadata(ToXStat(
                              event.kernel_info, occ_stats.occupancy_pct)));
    } else if (event.type == CuptiTracerEventType::MemcpyH2D ||
               event.type == CuptiTracerEventType::MemcpyD2H ||
               event.type == CuptiTracerEventType::MemcpyD2D ||
               event.type == CuptiTracerEventType::MemcpyP2P ||
               event.type == CuptiTracerEventType::MemcpyOther) {
      const auto& memcpy_info = event.memcpy_info;
      std::string value = absl::StrCat(
          "kind_src:", GetMemoryKindName(memcpy_info.src_mem_kind),
          " kind_dst:", GetMemoryKindName(memcpy_info.dst_mem_kind),
          " size:", memcpy_info.num_bytes, " dest:", memcpy_info.destination,
          " async:", memcpy_info.async);
      VLOG(7) << "Add Memcpy stat. " << value;
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
                              GetStatTypeStr(StatType::kMemcpyDetails)),
                          *plane->GetOrCreateStatMetadata(std::move(value)));
    } else if (event.type == CuptiTracerEventType::MemoryAlloc) {
      std::string value =
          absl::StrCat("kind:", GetMemoryKindName(event.memalloc_info.mem_kind),
                       " num_bytes:", event.memalloc_info.num_bytes);
      VLOG(7) << "Add MemAlloc stat. " << value;
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
                              GetStatTypeStr(StatType::kMemallocDetails)),
                          *plane->GetOrCreateStatMetadata(std::move(value)));
    } else if (event.type == CuptiTracerEventType::MemoryFree) {
      std::string value =
          absl::StrCat("kind:", GetMemoryKindName(event.memfree_info.mem_kind),
                       " num_bytes:", event.memfree_info.num_bytes);
      VLOG(7) << "Add MemFree stat. " << value;
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
                              GetStatTypeStr(StatType::kMemFreeDetails)),
                          *plane->GetOrCreateStatMetadata(std::move(value)));
    } else if (event.type == CuptiTracerEventType::Memset) {
      std::string value =
          absl::StrCat("kind:", GetMemoryKindName(event.memset_info.mem_kind),
                       " num_bytes:", event.memset_info.num_bytes,
                       " async:", event.memset_info.async);
      VLOG(7) << "Add Memset stat. " << value;
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
                              GetStatTypeStr(StatType::kMemsetDetails)),
                          *plane->GetOrCreateStatMetadata(std::move(value)));
    } else if (event.type == CuptiTracerEventType::MemoryResidency) {
      std::string value = absl::StrCat(
          "kind:", GetMemoryKindName(event.memory_residency_info.mem_kind),
          " num_bytes:", event.memory_residency_info.num_bytes, " addr:0x",
          absl::Hex(event.memory_residency_info.address, absl::kZeroPad16));
      VLOG(7) << "Add MemoryResidency stat. " << value;
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(GetStatTypeStr(
                              StatType::kMemoryResidencyDetails)),
                          *plane->GetOrCreateStatMetadata(std::move(value)));
    }

    std::vector<Annotation> annotation_stack =
        ParseAnnotationStack(event.annotation);
    if (!annotation_stack.empty()) {
      xevent.AddStatValue(
          *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kTfOp)),
          *plane->GetOrCreateStatMetadata(annotation_stack.begin()->name));
    }
    // If multiple metadata entries share the same key, keep the value from
    // the top of the stack (the innermost annotation), since it is inserted
    // into key_set first below.
    absl::flat_hash_set<absl::string_view> key_set;

    for (auto annotation = annotation_stack.rbegin();
         annotation != annotation_stack.rend(); ++annotation) {
      for (const Annotation::Metadata& metadata : annotation->metadata) {
        if (key_set.insert(metadata.key).second) {
          xevent.ParseAndAddStatValue(
              *plane->GetOrCreateStatMetadata(metadata.key), metadata.value);
        }
      }
    }
  }

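  // Returns the value of a CUDA device attribute, or nullopt if the driver
  // query fails.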
  absl::optional<int> GetDeviceAttribute(CUdevice device,
                                         CUdevice_attribute attrib) {
    int ret_val;
    CUresult err = cuDeviceGetAttribute(&ret_val, attrib, device);
    if (err != CUDA_SUCCESS) return absl::nullopt;
    return ret_val;
  }

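  // Names a device line after its stream plus the event types seen on it,
  // e.g. "Stream #2(Kernel,MemcpyH2D)". Lines carrying CUPTI overhead events
  // get the dedicated name "CUPTI overhead".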
  std::string GetDeviceXLineName(
      int64_t stream_id,
      absl::flat_hash_set<CuptiTracerEventType>& event_types) {
    std::string line_name = absl::StrCat("Stream #", stream_id);
    event_types.erase(CuptiTracerEventType::Unsupported);
    if (event_types.empty()) return line_name;
    if (event_types.count(CuptiTracerEventType::Overhead))
      return "CUPTI overhead";
    std::vector<const char*> type_names;
    for (const auto event_type : event_types) {
      type_names.emplace_back(GetTraceEventTypeName(event_type));
    }
    return absl::StrCat(line_name, "(", absl::StrJoin(type_names, ","), ")");
  }

 public:
  PerDeviceCollector() = default;

  void AddEvent(CuptiTracerEvent&& event) {
    mutex_lock l(m_);
    events_.emplace_back(std::move(event));
  }

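  // Moves all buffered events onto the host/device planes, assigning each
  // event to a line via IsHostEvent() and naming device lines by the event
  // types they carry. Returns the number of events flushed.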
  size_t Flush(uint64 start_gpu_ns, uint64 end_gpu_ns,
               XPlaneBuilder* device_plane, XPlaneBuilder* host_plane) {
    mutex_lock l(m_);
    // Tracks the event types seen on each line.
    absl::flat_hash_map<int64, absl::flat_hash_set<CuptiTracerEventType>>
        events_types_per_line;
    for (auto& event : events_) {
      int64_t line_id = CuptiTracerEvent::kInvalidThreadId;
      bool is_host_event = IsHostEvent(event, &line_id);
      if (line_id == CuptiTracerEvent::kInvalidThreadId ||
          line_id == CuptiTracerEvent::kInvalidStreamId) {
        VLOG(9) << "Ignoring event, type=" << static_cast<int>(event.type);
        continue;
      }
      auto* plane = is_host_event ? host_plane : device_plane;
      VLOG(9) << "Event"
              << " type=" << static_cast<int>(event.type)
              << " line_id=" << line_id
              << (is_host_event ? " host plane=" : " device plane=")
              << plane->Name();
      XLineBuilder line = plane->GetOrCreateLine(line_id);
      line.SetTimestampNs(start_gpu_ns);
      CreateXEvent(event, plane, start_gpu_ns, end_gpu_ns, &line);
      events_types_per_line[line_id].emplace(event.type);
    }
    device_plane->ForEachLine([&](XLineBuilder line) {
      line.SetName(
          GetDeviceXLineName(line.Id(), events_types_per_line[line.Id()]));
    });
    host_plane->ForEachLine([&](XLineBuilder line) {
      line.SetName(absl::StrCat("Host Threads/", line.Id()));
    });
    size_t num_events = events_.size();
    events_.clear();
    return num_events;
  }

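  // Queries CUDA device attributes, records them as stats on the device
  // plane, and fills in device_properties_ for the occupancy calculator.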
  void GetDeviceCapabilities(int32_t device_ordinal,
                             XPlaneBuilder* device_plane) {
    CUdevice device;
    if (cuDeviceGet(&device, device_ordinal) != CUDA_SUCCESS) return;

    auto clock_rate_in_khz =
        GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_CLOCK_RATE);
    if (clock_rate_in_khz) {
      device_plane->AddStatValue(
          *device_plane->GetOrCreateStatMetadata(
              GetStatTypeStr(StatType::kDevCapClockRateKHz)),
          *clock_rate_in_khz);
    }

    auto core_count =
        GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
    if (core_count) {
      device_plane->AddStatValue(
          *device_plane->GetOrCreateStatMetadata(
              GetStatTypeStr(StatType::kDevCapCoreCount)),
          *core_count);
    }

    auto mem_clock_khz =
        GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE);
    auto mem_bus_width_bits =
        GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH);
    if (mem_clock_khz && mem_bus_width_bits) {
      // Times 2 because HBM is DDR memory; it transfers two data bits per
      // data lane per clock cycle.
      auto memory_bandwidth =
          uint64{2} * (*mem_clock_khz) * 1000 * (*mem_bus_width_bits) / 8;
      device_plane->AddStatValue(
          *device_plane->GetOrCreateStatMetadata(
              GetStatTypeStr(StatType::kDevCapMemoryBandwidth)),
          memory_bandwidth);
    }

    size_t total_memory = 0;
    if (cuDeviceTotalMem(&total_memory, device) == CUDA_SUCCESS) {
      device_plane->AddStatValue(
          *device_plane->GetOrCreateStatMetadata(
              GetStatTypeStr(StatType::kDevCapMemorySize)),
          static_cast<uint64>(total_memory));
    }

    auto compute_capability_major = GetDeviceAttribute(
        device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
    if (compute_capability_major) {
      device_plane->AddStatValue(
          *device_plane->GetOrCreateStatMetadata(
              GetStatTypeStr(StatType::kDevCapComputeCapMajor)),
          *compute_capability_major);
    }
    auto compute_capability_minor = GetDeviceAttribute(
        device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
    if (compute_capability_minor) {
      device_plane->AddStatValue(
          *device_plane->GetOrCreateStatMetadata(
              GetStatTypeStr(StatType::kDevCapComputeCapMinor)),
          *compute_capability_minor);
    }

    auto max_threads_per_block =
        GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
    auto max_threads_per_sm = GetDeviceAttribute(
        device, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR);
    auto regs_per_block =
        GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK);
    auto regs_per_sm = GetDeviceAttribute(
        device, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR);
    auto warp_size = GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_WARP_SIZE);
    auto shared_mem_per_block = GetDeviceAttribute(
        device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK);
    auto shared_mem_per_sm = GetDeviceAttribute(
        device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR);
    auto shared_mem_per_block_optin = GetDeviceAttribute(
        device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN);

    // Calculating GPU occupancy requires all of these inputs; if any is
    // missing, device_properties_ stays zeroed and occupancy is reported
    // as 0%.
    if (core_count && compute_capability_major && compute_capability_minor &&
        max_threads_per_block && max_threads_per_sm && regs_per_block &&
        regs_per_sm && warp_size && shared_mem_per_block &&
        shared_mem_per_sm && shared_mem_per_block_optin) {
      device_properties_.computeMajor = *compute_capability_major;
      device_properties_.computeMinor = *compute_capability_minor;
      device_properties_.numSms = *core_count;
      device_properties_.maxThreadsPerBlock = *max_threads_per_block;
      device_properties_.maxThreadsPerMultiprocessor = *max_threads_per_sm;
      device_properties_.regsPerBlock = *regs_per_block;
      device_properties_.regsPerMultiprocessor = *regs_per_sm;
      device_properties_.warpSize = *warp_size;
      device_properties_.sharedMemPerBlock = *shared_mem_per_block;
      device_properties_.sharedMemPerMultiprocessor = *shared_mem_per_sm;
      device_properties_.sharedMemPerBlockOptin = *shared_mem_per_block_optin;
    }
  }

 private:
  mutex m_;
  std::vector<CuptiTracerEvent> events_ TF_GUARDED_BY(m_);
  cudaOccDeviceProp device_properties_;
  absl::flat_hash_map<DeviceOccupancyParams, OccupancyStats> occupancy_cache_;
};

}  // namespace

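// Records the annotation (and optional NVTX range) active at launch time for
// a (device_id, correlation_id) pair, so that later activity records can look
// the annotation up by correlation id. Each per-device map is capped at
// max_size_ entries; once full, new annotations are silently dropped.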
void AnnotationMap::Add(uint32 device_id, uint32 correlation_id,
                        const absl::string_view annotation,
                        const absl::string_view nvtx_range) {
  if (annotation.empty() && nvtx_range.empty()) return;
  VLOG(3) << "Add annotation: device_id: " << device_id
          << " correlation_id: " << correlation_id
          << " annotation: " << annotation;
  if (device_id >= per_device_map_.size()) return;
  auto& per_device_map = per_device_map_[device_id];
  absl::MutexLock lock(&per_device_map.mutex);
  if (per_device_map.annotations.size() < max_size_) {
    AnnotationInfo info;
    info.annotation = *per_device_map.annotations.emplace(annotation).first;
    if (!nvtx_range.empty())
      info.nvtx_range = *per_device_map.nvtx_ranges.emplace(nvtx_range).first;
    per_device_map.correlation_map.emplace(correlation_id, info);
  }
}

AnnotationMap::AnnotationInfo AnnotationMap::LookUp(uint32 device_id,
                                                    uint32 correlation_id) {
  if (device_id >= per_device_map_.size()) return AnnotationInfo();
  auto& per_device_map = per_device_map_[device_id];
  absl::MutexLock lock(&per_device_map.mutex);
  auto it = per_device_map.correlation_map.find(correlation_id);
  return it != per_device_map.correlation_map.end() ? it->second
                                                    : AnnotationInfo();
}

// CuptiTraceCollectorImpl stores the CuptiTracerEvents from CuptiTracer and
// eventually converts and filters them into an XSpace.
class CuptiTraceCollectorImpl : public CuptiTraceCollector {
 public:
  CuptiTraceCollectorImpl(const CuptiTracerCollectorOptions& option,
                          uint64 start_walltime_ns, uint64 start_gpu_ns)
      : CuptiTraceCollector(option),
        num_callback_events_(0),
        num_activity_events_(0),
        start_walltime_ns_(start_walltime_ns),
        start_gpu_ns_(start_gpu_ns),
        num_gpus_(option.num_gpus),
        per_device_collector_(option.num_gpus) {}

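  // Buffers one event in its device's collector, enforcing the per-source
  // event caps from the collector options.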
  void AddEvent(CuptiTracerEvent&& event) override {
    if (event.device_id >= num_gpus_) return;
    if (event.source == CuptiTracerEventSource::DriverCallback) {
      if (num_callback_events_ > options_.max_callback_api_events) {
        OnEventsDropped("total driver (callback) events reached max", 1);
        return;
      }
      num_callback_events_++;
    } else {
      if (num_activity_events_ > options_.max_activity_api_events) {
        OnEventsDropped("total device (activity) events reached max", 1);
        return;
      }
      num_activity_events_++;
    }
    per_device_collector_[event.device_id].AddEvent(std::move(event));
  }
  void OnEventsDropped(const std::string& reason, uint32 num_events) override {
    absl::MutexLock lock(&mutex_);
    dropped_events_[reason] += num_events;
  }
  void Flush() override {}
  // Returns true if some GPU events were captured.
  bool Export(XSpace* space, uint64 end_gpu_ns) override {
    LOG(INFO) << "GpuTracer has collected " << num_callback_events_
              << " callback api events and " << num_activity_events_
              << " activity events. " << ReportDroppedEvents();
    size_t num_events = 0;
    XPlaneBuilder host_plane(
        FindOrAddMutablePlaneWithName(space, kCuptiDriverApiPlaneName));
    for (int device_ordinal = 0; device_ordinal < num_gpus_;
         ++device_ordinal) {
      std::string name = GpuPlaneName(device_ordinal);
      XPlaneBuilder device_plane(FindOrAddMutablePlaneWithName(space, name));
      device_plane.SetId(device_ordinal);
      VLOG(4) << "Creating plane for"
              << " name=" << name << " ordinal=" << device_ordinal;

      // Calculate device capabilities before flushing, so that device
      // properties are available to the occupancy calculator in Flush().
      per_device_collector_[device_ordinal].GetDeviceCapabilities(
          device_ordinal, &device_plane);
      num_events += per_device_collector_[device_ordinal].Flush(
          start_gpu_ns_, end_gpu_ns, &device_plane, &host_plane);
      NormalizeTimeStamps(&device_plane, start_walltime_ns_);
    }
    NormalizeTimeStamps(&host_plane, start_walltime_ns_);
    return num_events > 0;
  }

  std::string ReportDroppedEvents() {
    absl::MutexLock lock(&mutex_);
    std::string result;
    for (const auto& dropped : dropped_events_) {
      absl::StrAppend(&result, " ", dropped.second, " events dropped because ",
                      dropped.first, ";");
    }
    if (!result.empty()) result.back() = '.';
    return result;
  }
  std::string ReportNumEventsIfDropped() override {
    std::string events_dropped = ReportDroppedEvents();
    if (events_dropped.empty()) return "";
    return absl::StrCat("Detected GPU events dropped on ", port::Hostname(),
                        ": Profiler has collected ",
                        num_callback_events_.load(), " driver events and ",
                        num_activity_events_.load(), " device events.",
                        events_dropped);
  }

 private:
  std::atomic<int> num_callback_events_;
  std::atomic<int> num_activity_events_;
  absl::Mutex mutex_;
  absl::flat_hash_map<std::string, uint64> dropped_events_
      ABSL_GUARDED_BY(mutex_);
  uint64 start_walltime_ns_;
  uint64 start_gpu_ns_;
  int num_gpus_;

  // Sets the start timestamp of every XLine in the given XPlane to
  // start_walltime_ns. Event times on both host and device planes are CUPTI
  // timestamps, so each line initially uses the CUPTI start time; replacing
  // it with the corresponding start walltime normalizes the planes against
  // CPU wall time.
  static void NormalizeTimeStamps(XPlaneBuilder* plane,
                                  uint64 start_walltime_ns) {
    plane->ForEachLine(
        [&](XLineBuilder line) { line.SetTimestampNs(start_walltime_ns); });
  }

  absl::FixedArray<PerDeviceCollector> per_device_collector_;

  TF_DISALLOW_COPY_AND_ASSIGN(CuptiTraceCollectorImpl);
};

std::unique_ptr<CuptiTraceCollector> CreateCuptiCollector(
    const CuptiTracerCollectorOptions& options, const uint64 start_walltime_ns,
    const uint64 start_gputime_ns) {
  return absl::make_unique<CuptiTraceCollectorImpl>(options, start_walltime_ns,
                                                    start_gputime_ns);
}
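
// Minimal usage sketch (hypothetical wiring; the real call sites live in the
// GPU tracer):
//   CuptiTracerCollectorOptions options;
//   options.num_gpus = num_gpus;
//   auto collector =
//       CreateCuptiCollector(options, start_walltime_ns, start_gputime_ns);
//   // CUPTI callback/activity handlers feed collector->AddEvent(...).
//   XSpace space;
//   collector->Export(&space, end_gputime_ns);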

// The strings are parser-friendly and contain no whitespace.
absl::string_view GetMemoryKindName(int8_t memory_kind) {
  switch (memory_kind) {
    case CUPTI_ACTIVITY_MEMORY_KIND_ARRAY:
      return "array";
    case CUPTI_ACTIVITY_MEMORY_KIND_DEVICE:
      return "device";
    case CUPTI_ACTIVITY_MEMORY_KIND_DEVICE_STATIC:
      return "device_static";
    case CUPTI_ACTIVITY_MEMORY_KIND_MANAGED:
      return "managed";
    case CUPTI_ACTIVITY_MEMORY_KIND_MANAGED_STATIC:
      return "managed_static";
    case CUPTI_ACTIVITY_MEMORY_KIND_PAGEABLE:
      return "pageable";
    case CUPTI_ACTIVITY_MEMORY_KIND_PINNED:
      return "pinned";
    case CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN:
    default:
      return "unknown";
  }
}

}  // namespace profiler
}  // namespace tensorflow