/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/profiler/internal/gpu/cupti_collector.h"

#include <atomic>
#include <climits>
#include <cstring>

#include "absl/container/fixed_array.h"
#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
#include "absl/hash/hash.h"
#include "absl/memory/memory.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_join.h"
#include "absl/synchronization/mutex.h"
#include "absl/types/optional.h"
#include "third_party/gpus/cuda/extras/CUPTI/include/cupti_activity.h"
#include "third_party/gpus/cuda/include/cuda.h"
#include "third_party/gpus/cuda/include/cuda_occupancy.h"
#include "tensorflow/core/platform/abi.h"
#include "tensorflow/core/platform/host_info.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/profiler/utils/parse_annotation.h"
#include "tensorflow/core/profiler/utils/xplane_builder.h"
#include "tensorflow/core/profiler/utils/xplane_schema.h"
#include "tensorflow/core/profiler/utils/xplane_utils.h"

namespace tensorflow {
namespace profiler {

namespace {

bool IsHostEvent(const CuptiTracerEvent& event, int64* line_id) {
  // DriverCallback (i.e. kernel launching) events are host events.
  if (event.source == CuptiTracerEventSource::DriverCallback) {
    *line_id = event.thread_id;
    return true;
  }
  // Non-overhead activity events are device events.
  if (event.type != CuptiTracerEventType::Overhead) {
    *line_id = event.stream_id;
    return false;
  }
  // Overhead events can be associated with a thread or a stream, etc.
  // If a valid thread id is specified, we consider it as a host event.
  if (event.stream_id != CuptiTracerEvent::kInvalidStreamId) {
    *line_id = event.stream_id;
    return false;
  } else if (event.thread_id != CuptiTracerEvent::kInvalidThreadId &&
             event.thread_id != 0) {
    *line_id = event.thread_id;
    return true;
  } else {
    *line_id = kThreadIdOverhead;
    return false;
  }
}

struct DeviceOccupancyParams {
  cudaOccFuncAttributes attributes = {};
  int block_size = 0;
  size_t dynamic_smem_size = 0;

  friend bool operator==(const DeviceOccupancyParams& lhs,
                         const DeviceOccupancyParams& rhs) {
    return 0 == memcmp(&lhs, &rhs, sizeof(lhs));
  }

  template <typename H>
  friend H AbslHashValue(H hash_state, const DeviceOccupancyParams& params) {
    return H::combine(
        std::move(hash_state), params.attributes.maxThreadsPerBlock,
        params.attributes.numRegs, params.attributes.sharedSizeBytes,
        static_cast<uint32_t>(params.attributes.partitionedGCConfig),
        static_cast<uint32_t>(params.attributes.shmemLimitConfig),
        params.attributes.maxDynamicSharedSizeBytes, params.block_size,
        params.dynamic_smem_size);
  }
};

struct OccupancyStats {
  double occupancy_pct = 0.0;
  int min_grid_size = 0;
  int suggested_block_size = 0;
};

struct CorrelationInfo {
  CorrelationInfo(uint32 t, uint64 e) : thread_id(t), enqueue_time_ns(e) {}
  uint32 thread_id;
  uint64 enqueue_time_ns;
};

class PerDeviceCollector {
 private:
  OccupancyStats GetOccupancy(const DeviceOccupancyParams& params) const {
    OccupancyStats stats;
    if (device_properties_.computeMajor == 0) {
      return {};
    }

    const cudaOccDeviceState state = {};
    cudaOccResult occ_result;
    cudaOccError status = cudaOccMaxActiveBlocksPerMultiprocessor(
        &occ_result, &device_properties_, &params.attributes, &state,
        params.block_size, params.dynamic_smem_size);
    if (status != CUDA_OCC_SUCCESS) {
      return {};
    }

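    // Theoretical occupancy: resident threads per SM (active blocks times
    // threads per block) as a percentage of the SM's maximum resident
    // threads.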
    stats.occupancy_pct =
        occ_result.activeBlocksPerMultiprocessor * params.block_size * 100;
    stats.occupancy_pct /= device_properties_.maxThreadsPerMultiprocessor;

    status = cudaOccMaxPotentialOccupancyBlockSize(
        &stats.min_grid_size, &stats.suggested_block_size, &device_properties_,
        &params.attributes, &state, NULL, params.dynamic_smem_size);
    if (status != CUDA_OCC_SUCCESS) {
      return {};
    }

    return stats;
  }

  void CreateXEvent(const CuptiTracerEvent& event, XPlaneBuilder* plane,
                    uint64 start_gpu_ns, uint64 end_gpu_ns,
                    XLineBuilder* line) {
    if (event.start_time_ns < start_gpu_ns || event.end_time_ns > end_gpu_ns ||
        event.start_time_ns > event.end_time_ns) {
      VLOG(2) << "Event has abnormal timestamps: " << event.name
              << " start time(ns): " << event.start_time_ns
              << " end time(ns): " << event.end_time_ns;
      return;
    }
    std::string kernel_name = port::MaybeAbiDemangle(event.name.c_str());
    if (kernel_name.empty()) {
      kernel_name = GetTraceEventTypeName(event.type);
    }
    XEventMetadata* event_metadata =
        plane->GetOrCreateEventMetadata(std::move(kernel_name));
    XEventBuilder xevent = line->AddEvent(*event_metadata);
    VLOG(7) << "Adding event to line=" << line->Id();
    xevent.SetTimestampNs(event.start_time_ns);
    xevent.SetEndTimestampNs(event.end_time_ns);
    if (event.source == CuptiTracerEventSource::DriverCallback) {
      xevent.AddStatValue(
          *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kDeviceId)),
          event.device_id);
    }
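    // The correlation id ties a host-side DriverCallback (launch) event to
    // the device-side Activity event it produced.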
    if (event.correlation_id != CuptiTracerEvent::kInvalidCorrelationId) {
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
                              GetStatTypeStr(StatType::kCorrelationId)),
                          event.correlation_id);
    }
    if (!event.annotation.empty()) {
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
                              GetStatTypeStr(StatType::kKernelAnnotation)),
                          *plane->GetOrCreateStatMetadata(event.annotation));
    }
    if (!event.nvtx_range.empty()) {
      xevent.AddStatValue(
          *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kNVTXRange)),
          *plane->GetOrCreateStatMetadata(event.nvtx_range));
    }
    if (event.context_id != CuptiTracerEvent::kInvalidContextId) {
      xevent.AddStatValue(
          *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kContextId)),
          absl::StrCat("$$", static_cast<uint64>(event.context_id)));
    }

    if (event.type == CuptiTracerEventType::Kernel &&
        event.source == CuptiTracerEventSource::Activity) {
      DeviceOccupancyParams params{};
      params.attributes.maxThreadsPerBlock = INT_MAX;
      params.attributes.numRegs =
          static_cast<int>(event.kernel_info.registers_per_thread);
      params.attributes.sharedSizeBytes =
          event.kernel_info.static_shared_memory_usage;
      params.attributes.partitionedGCConfig = PARTITIONED_GC_OFF;
      params.attributes.shmemLimitConfig = FUNC_SHMEM_LIMIT_DEFAULT;
      params.attributes.maxDynamicSharedSizeBytes = 0;
      params.block_size = static_cast<int>(event.kernel_info.block_x *
                                           event.kernel_info.block_y *
                                           event.kernel_info.block_z);

      params.dynamic_smem_size = event.kernel_info.dynamic_shared_memory_usage;

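      // Occupancy computation is relatively expensive, so cache the result
      // keyed by the occupancy-relevant kernel parameters and recompute only
      // on a cache miss (occupancy_pct still 0.0).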
      OccupancyStats& occ_stats = occupancy_cache_[params];
      if (occ_stats.occupancy_pct == 0.0) {
        occ_stats = GetOccupancy(params);
      }
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(GetStatTypeStr(
                              StatType::kTheoreticalOccupancyPct)),
                          occ_stats.occupancy_pct);
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
                              GetStatTypeStr(StatType::kOccupancyMinGridSize)),
                          static_cast<int32>(occ_stats.min_grid_size));
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(GetStatTypeStr(
                              StatType::kOccupancySuggestedBlockSize)),
                          static_cast<int32>(occ_stats.suggested_block_size));
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
                              GetStatTypeStr(StatType::kKernelDetails)),
                          *plane->GetOrCreateStatMetadata(ToXStat(
                              event.kernel_info, occ_stats.occupancy_pct)));
    } else if (event.type == CuptiTracerEventType::MemcpyH2D ||
               event.type == CuptiTracerEventType::MemcpyD2H ||
               event.type == CuptiTracerEventType::MemcpyD2D ||
               event.type == CuptiTracerEventType::MemcpyP2P ||
               event.type == CuptiTracerEventType::MemcpyOther) {
      VLOG(7) << "Add Memcpy stat";
      const auto& memcpy_info = event.memcpy_info;
      std::string memcpy_details = absl::StrCat(
          "kind:", GetMemoryKindName(event.memcpy_info.kind),
          " size:", memcpy_info.num_bytes, " dest:", memcpy_info.destination,
          " async:", memcpy_info.async);
      xevent.AddStatValue(
          *plane->GetOrCreateStatMetadata(
              GetStatTypeStr(StatType::kMemcpyDetails)),
          *plane->GetOrCreateStatMetadata(std::move(memcpy_details)));
    } else if (event.type == CuptiTracerEventType::MemoryAlloc) {
      VLOG(7) << "Add MemAlloc stat";
      std::string value =
          absl::StrCat("kind:", GetMemoryKindName(event.memalloc_info.kind),
                       " num_bytes:", event.memalloc_info.num_bytes);
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
                              GetStatTypeStr(StatType::kMemallocDetails)),
                          *plane->GetOrCreateStatMetadata(std::move(value)));
    } else if (event.type == CuptiTracerEventType::MemoryFree) {
      VLOG(7) << "Add MemFree stat";
      std::string value =
          absl::StrCat("kind:", GetMemoryKindName(event.memfree_info.kind),
                       " num_bytes:", event.memfree_info.num_bytes);
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
                              GetStatTypeStr(StatType::kMemFreeDetails)),
                          *plane->GetOrCreateStatMetadata(std::move(value)));
    } else if (event.type == CuptiTracerEventType::Memset) {
      VLOG(7) << "Add Memset stat";
      auto value =
          absl::StrCat("kind:", GetMemoryKindName(event.memset_info.kind),
                       " num_bytes:", event.memset_info.num_bytes,
                       " async:", event.memset_info.async);
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
                              GetStatTypeStr(StatType::kMemsetDetails)),
                          *plane->GetOrCreateStatMetadata(std::move(value)));
    } else if (event.type == CuptiTracerEventType::MemoryResidency) {
      VLOG(7) << "Add MemoryResidency stat";
      std::string value = absl::StrCat(
          "kind:", GetMemoryKindName(event.memory_residency_info.kind),
          " num_bytes:", event.memory_residency_info.num_bytes,
          " addr:", event.memory_residency_info.address);
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(GetStatTypeStr(
                              StatType::kMemoryResidencyDetails)),
                          *plane->GetOrCreateStatMetadata(std::move(value)));
    }

    std::vector<Annotation> annotation_stack =
        ParseAnnotationStack(event.annotation);
    if (!annotation_stack.empty()) {
      xevent.AddStatValue(
          *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kTfOp)),
          *plane->GetOrCreateStatMetadata(annotation_stack.begin()->name));
    }
    // If multiple metadata have the same key name, show the values from the
    // top of the stack (innermost annotation). Concatenate the values from
    // "hlo_op".
    absl::flat_hash_set<absl::string_view> key_set;

    for (auto annotation = annotation_stack.rbegin();
         annotation != annotation_stack.rend(); ++annotation) {
      for (const Annotation::Metadata& metadata : annotation->metadata) {
        if (key_set.insert(metadata.key).second) {
          xevent.ParseAndAddStatValue(
              *plane->GetOrCreateStatMetadata(metadata.key), metadata.value);
        }
      }
    }
  }

  absl::optional<int> GetDeviceAttribute(CUdevice device,
                                         CUdevice_attribute attrib) {
    int ret_val;
    CUresult err = cuDeviceGetAttribute(&ret_val, attrib, device);
    if (err != CUDA_SUCCESS) return absl::nullopt;
    return ret_val;
  }

  std::string GetDeviceXLineName(
      int64 stream_id,
      absl::flat_hash_set<CuptiTracerEventType>& event_types) {
    std::string line_name = absl::StrCat("Stream #", stream_id);
    event_types.erase(CuptiTracerEventType::Unsupported);
    if (event_types.empty()) return line_name;
    if (event_types.count(CuptiTracerEventType::Overhead))
      return "CUPTI overhead";
    std::vector<const char*> type_names;
    for (const auto event_type : event_types) {
      type_names.emplace_back(GetTraceEventTypeName(event_type));
    }
    return absl::StrCat(line_name, "(", absl::StrJoin(type_names, ","), ")");
  }

 public:
  PerDeviceCollector() = default;

  void AddEvent(CuptiTracerEvent&& event) {
    mutex_lock l(m_);
    if (event.source == CuptiTracerEventSource::DriverCallback) {
      // CUPTI API callback events are used to populate launch times etc.
      if (event.correlation_id != CuptiTracerEvent::kInvalidCorrelationId) {
        correlation_info_.insert(
            {event.correlation_id,
             CorrelationInfo(event.thread_id, event.start_time_ns)});
      }
      events_.emplace_back(std::move(event));
    } else {
      // CUPTI activity events measure device times etc.
      events_.emplace_back(std::move(event));
    }
  }

  void Flush(int32 device_ordinal, uint64 start_walltime_ns,
             uint64 start_gpu_ns, StepStats* step_stats) {
    mutex_lock l(m_);
    absl::flat_hash_map<std::pair<int64 /*stream_id*/, CuptiTracerEventType>,
                        DeviceStepStats*>
        stream_dev_stats_map;
    DeviceStepStats* unknown_stream_dev_stats = nullptr;
    DeviceStepStats* all_streams_dev_stats = nullptr;
    DeviceStepStats* memcpy_dev_stats = nullptr;
    DeviceStepStats* sync_dev_stats = nullptr;
    for (const CuptiTracerEvent& event : events_) {
      NodeExecStats* ns = new NodeExecStats;
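      // Convert to wall-clock microseconds: take the event's offset from the
      // GPU trace start and rebase it onto the host walltime at trace start.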
      ns->set_all_start_micros(
          (start_walltime_ns + (event.start_time_ns - start_gpu_ns)) / 1000);
      ns->set_op_start_rel_micros(0);
      auto elapsed_ns = event.end_time_ns - event.start_time_ns;
      ns->set_op_end_rel_micros(elapsed_ns / 1000);
      ns->set_all_end_rel_micros(elapsed_ns / 1000);

      if (event.source == CuptiTracerEventSource::DriverCallback) {
        // Legacy code ignores all launch events except cuStreamSynchronize.
        if (event.name == "cuStreamSynchronize") {
          ns->set_node_name(event.name);
          ns->set_timeline_label(absl::StrCat("ThreadId ", event.thread_id));
          ns->set_thread_id(event.thread_id);
          if (sync_dev_stats == nullptr) {
            sync_dev_stats = step_stats->add_dev_stats();
            sync_dev_stats->set_device(
                absl::StrCat("/device:GPU:", device_ordinal, "/sync"));
          }
          sync_dev_stats->add_node_stats()->Swap(ns);
        }
      }
      } else {  // CuptiTracerEventSource::Activity
        // Get launch information if available.
        if (event.correlation_id != CuptiTracerEvent::kInvalidCorrelationId) {
          auto it = correlation_info_.find(event.correlation_id);
          if (it != correlation_info_.end()) {
            ns->set_scheduled_micros(it->second.enqueue_time_ns / 1000);
            ns->set_thread_id(it->second.thread_id);
          }
        }

        auto annotation_stack = ParseAnnotationStack(event.annotation);
        std::string kernel_name = port::MaybeAbiDemangle(event.name.c_str());
        std::string activity_name =
            !annotation_stack.empty()
                ? std::string(annotation_stack.back().name)
                : kernel_name;
        ns->set_node_name(activity_name);
        switch (event.type) {
          case CuptiTracerEventType::Kernel: {
            ns->set_timeline_label(absl::StrCat(
                kernel_name, " regs:", event.kernel_info.registers_per_thread,
                " shm:", event.kernel_info.static_shared_memory_usage,
                " grid: ", event.kernel_info.grid_x, ",",
                event.kernel_info.grid_y, ",", event.kernel_info.grid_z,
                " block:", event.kernel_info.block_x, ",",
                event.kernel_info.block_y, ",", event.kernel_info.block_z, "@@",
                event.annotation));
            DeviceStepStats*& stream_dev_stats =
                stream_dev_stats_map[std::make_pair(event.stream_id,
                                                    event.type)];
            if (stream_dev_stats == nullptr) {
              stream_dev_stats = step_stats->add_dev_stats();
              stream_dev_stats->set_device(absl::StrCat(
                  "/device:GPU:", device_ordinal, "/stream:", event.stream_id));
            }
            *stream_dev_stats->add_node_stats() = *ns;
            if (all_streams_dev_stats == nullptr) {
              all_streams_dev_stats = step_stats->add_dev_stats();
              all_streams_dev_stats->set_device(
                  absl::StrCat("/device:GPU:", device_ordinal, "/stream:all"));
            }
            all_streams_dev_stats->add_node_stats()->Swap(ns);
            break;
          }
          case CuptiTracerEventType::MemcpyH2D:
          case CuptiTracerEventType::MemcpyD2H:
          case CuptiTracerEventType::MemcpyD2D:
          case CuptiTracerEventType::MemcpyP2P: {
            std::string details = absl::StrCat(
                activity_name, " bytes:", event.memcpy_info.num_bytes);
            if (event.memcpy_info.async) {
              absl::StrAppend(&details, " async");
            }
            if (event.memcpy_info.destination != event.device_id) {
              absl::StrAppend(&details,
                              " to device:", event.memcpy_info.destination);
            }
            ns->set_timeline_label(std::move(details));
            DeviceStepStats*& stream_dev_stats =
                stream_dev_stats_map[std::make_pair(event.stream_id,
                                                    event.type)];
            if (stream_dev_stats == nullptr) {
              stream_dev_stats = step_stats->add_dev_stats();
              stream_dev_stats->set_device(absl::StrCat(
                  "/device:GPU:", device_ordinal, "/stream:", event.stream_id,
                  "<", GetTraceEventTypeName(event.type), ">"));
            }
            *stream_dev_stats->add_node_stats() = *ns;
            if (memcpy_dev_stats == nullptr) {
              memcpy_dev_stats = step_stats->add_dev_stats();
              memcpy_dev_stats->set_device(
                  absl::StrCat("/device:GPU:", device_ordinal, "/memcpy"));
            }
            memcpy_dev_stats->add_node_stats()->Swap(ns);
            break;
          }
          default:
            ns->set_timeline_label(activity_name);
            if (unknown_stream_dev_stats == nullptr) {
              unknown_stream_dev_stats = step_stats->add_dev_stats();
              unknown_stream_dev_stats->set_device(
                  absl::StrCat("/device:GPU:", device_ordinal, "/stream:"));
            }
            unknown_stream_dev_stats->add_node_stats()->Swap(ns);
            break;
        }
      }
    }
    events_.clear();
  }

  size_t Flush(uint64 start_gpu_ns, uint64 end_gpu_ns,
               XPlaneBuilder* device_plane, XPlaneBuilder* host_plane) {
    mutex_lock l(m_);
    // Track event types per line.
    absl::flat_hash_map<int64, absl::flat_hash_set<CuptiTracerEventType>>
        events_types_per_line;
    for (auto& event : events_) {
      int64 line_id = CuptiTracerEvent::kInvalidThreadId;
      bool is_host_event = IsHostEvent(event, &line_id);
      if (line_id == CuptiTracerEvent::kInvalidThreadId ||
          line_id == CuptiTracerEvent::kInvalidStreamId) {
        VLOG(9) << "Ignoring event, type=" << static_cast<int>(event.type);
        continue;
      }
      auto* plane = is_host_event ? host_plane : device_plane;
      VLOG(9) << "Event"
              << " type=" << static_cast<int>(event.type)
              << " line_id=" << line_id
              << (is_host_event ? " host plane=" : " device plane=")
              << plane->Name();
      XLineBuilder line = plane->GetOrCreateLine(line_id);
      line.SetTimestampNs(start_gpu_ns);
      CreateXEvent(event, plane, start_gpu_ns, end_gpu_ns, &line);
      events_types_per_line[line_id].emplace(event.type);
    }
    device_plane->ForEachLine([&](XLineBuilder line) {
      line.SetName(
          GetDeviceXLineName(line.Id(), events_types_per_line[line.Id()]));
    });
    host_plane->ForEachLine([&](XLineBuilder line) {
      line.SetName(absl::StrCat("Host Threads/", line.Id()));
    });
    size_t num_events = events_.size();
    events_.clear();
    return num_events;
  }

  void GetDeviceCapabilities(int32 device_ordinal,
                             XPlaneBuilder* device_plane) {
    CUdevice device;
    if (cuDeviceGet(&device, device_ordinal) != CUDA_SUCCESS) return;

    auto clock_rate_in_khz =
        GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_CLOCK_RATE);
    if (clock_rate_in_khz) {
      device_plane->AddStatValue(
          *device_plane->GetOrCreateStatMetadata(
              GetStatTypeStr(StatType::kDevCapClockRateKHz)),
          *clock_rate_in_khz);
    }

    auto core_count =
        GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
    if (core_count) {
      device_plane->AddStatValue(
          *device_plane->GetOrCreateStatMetadata(
              GetStatTypeStr(StatType::kDevCapCoreCount)),
          *core_count);
    }

    auto mem_clock_khz =
        GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE);
    auto mem_bus_width_bits =
        GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH);
    if (mem_clock_khz && mem_bus_width_bits) {
      // Times 2 because HBM is DDR memory; it transfers two data bits per
      // data lane per clock cycle.
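      // Bandwidth in bytes/s = 2 * mem_clock_khz * 1000 * bus_width_bits / 8.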
      auto memory_bandwidth =
          uint64{2} * (*mem_clock_khz) * 1000 * (*mem_bus_width_bits) / 8;
      device_plane->AddStatValue(
          *device_plane->GetOrCreateStatMetadata(
              GetStatTypeStr(StatType::kDevCapMemoryBandwidth)),
          memory_bandwidth);
    }

    size_t total_memory = 0;
    if (cuDeviceTotalMem(&total_memory, device) == CUDA_SUCCESS) {
      device_plane->AddStatValue(
          *device_plane->GetOrCreateStatMetadata(
              GetStatTypeStr(StatType::kDevCapMemorySize)),
          static_cast<uint64>(total_memory));
    }

    auto compute_capability_major = GetDeviceAttribute(
        device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
    if (compute_capability_major) {
      device_plane->AddStatValue(
          *device_plane->GetOrCreateStatMetadata(
              GetStatTypeStr(StatType::kDevCapComputeCapMajor)),
          *compute_capability_major);
    }
    auto compute_capability_minor = GetDeviceAttribute(
        device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
    if (compute_capability_minor) {
      device_plane->AddStatValue(
          *device_plane->GetOrCreateStatMetadata(
              GetStatTypeStr(StatType::kDevCapComputeCapMinor)),
          *compute_capability_minor);
    }

    auto max_threads_per_block =
        GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
    auto max_threads_per_sm = GetDeviceAttribute(
        device, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR);
    auto regs_per_block =
        GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK);
    auto regs_per_sm = GetDeviceAttribute(
        device, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR);
    auto warp_size = GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_WARP_SIZE);
    auto shared_mem_per_block = GetDeviceAttribute(
        device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK);
    auto shared_mem_per_sm = GetDeviceAttribute(
        device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR);
    auto shared_mem_per_block_optin = GetDeviceAttribute(
        device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN);

    // Precondition for calculating GPU occupancy is to have all of these
    // inputs. Otherwise, GPU occupancy is left at 0%.
    if (core_count && compute_capability_major && compute_capability_minor &&
        max_threads_per_block && max_threads_per_sm && regs_per_block &&
        regs_per_sm && warp_size && shared_mem_per_block &&
        shared_mem_per_sm && shared_mem_per_block_optin) {
      device_properties_.computeMajor = *compute_capability_major;
      device_properties_.computeMinor = *compute_capability_minor;
      device_properties_.numSms = *core_count;
      device_properties_.maxThreadsPerBlock = *max_threads_per_block;
      device_properties_.maxThreadsPerMultiprocessor = *max_threads_per_sm;
      device_properties_.regsPerBlock = *regs_per_block;
      device_properties_.regsPerMultiprocessor = *regs_per_sm;
      device_properties_.warpSize = *warp_size;
      device_properties_.sharedMemPerBlock = *shared_mem_per_block;
      device_properties_.sharedMemPerMultiprocessor = *shared_mem_per_sm;
      device_properties_.sharedMemPerBlockOptin = *shared_mem_per_block_optin;
    }
  }

 private:
  mutex m_;
  std::vector<CuptiTracerEvent> events_ TF_GUARDED_BY(m_);
  absl::flat_hash_map<uint32, CorrelationInfo> correlation_info_
      TF_GUARDED_BY(m_);
  cudaOccDeviceProp device_properties_;
  absl::flat_hash_map<DeviceOccupancyParams, OccupancyStats> occupancy_cache_;
};

}  // namespace

void AnnotationMap::Add(uint32 device_id, uint32 correlation_id,
                        const absl::string_view annotation,
                        const absl::string_view nvtx_range) {
  if (annotation.empty() && nvtx_range.empty()) return;
  VLOG(3) << "Add annotation: device_id: " << device_id
          << " correlation_id: " << correlation_id
          << " annotation: " << annotation;
  if (device_id >= per_device_map_.size()) return;
  auto& per_device_map = per_device_map_[device_id];
  absl::MutexLock lock(&per_device_map.mutex);
  if (per_device_map.annotations.size() < max_size_) {
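    // Intern the annotation strings in the per-device sets and store
    // string_views into them, so each distinct string is kept only once.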
    AnnotationInfo info;
    info.annotation = *per_device_map.annotations.emplace(annotation).first;
    if (!nvtx_range.empty())
      info.nvtx_range = *per_device_map.nvtx_ranges.emplace(nvtx_range).first;
    per_device_map.correlation_map.emplace(correlation_id, info);
  }
}

AnnotationMap::AnnotationInfo AnnotationMap::LookUp(uint32 device_id,
                                                    uint32 correlation_id) {
  if (device_id >= per_device_map_.size()) return AnnotationInfo();
  auto& per_device_map = per_device_map_[device_id];
  absl::MutexLock lock(&per_device_map.mutex);
  auto it = per_device_map.correlation_map.find(correlation_id);
  return it != per_device_map.correlation_map.end() ? it->second
                                                    : AnnotationInfo();
}

// CuptiTraceCollectorImpl stores the CuptiTracerEvents from CuptiTracer and
// eventually converts and filters them into StepStats or XSpace.
class CuptiTraceCollectorImpl : public CuptiTraceCollector {
 public:
  CuptiTraceCollectorImpl(const CuptiTracerCollectorOptions& option,
                          uint64 start_walltime_ns, uint64 start_gpu_ns)
      : CuptiTraceCollector(option),
        num_callback_events_(0),
        num_activity_events_(0),
        start_walltime_ns_(start_walltime_ns),
        start_gpu_ns_(start_gpu_ns),
        num_gpus_(option.num_gpus),
        per_device_collector_(option.num_gpus) {}

  void AddEvent(CuptiTracerEvent&& event) override {
    if (event.device_id >= num_gpus_) return;
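    // Bound the number of buffered events per source; anything beyond the
    // configured maximum is recorded as dropped instead of stored.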
    if (event.source == CuptiTracerEventSource::DriverCallback) {
      if (num_callback_events_ > options_.max_callback_api_events) {
        OnEventsDropped("total driver (callback) events reached max", 1);
        return;
      }
      num_callback_events_++;
    } else {
      if (num_activity_events_ > options_.max_activity_api_events) {
        OnEventsDropped("total device (activity) events reached max", 1);
        return;
      }
      num_activity_events_++;
    }
    per_device_collector_[event.device_id].AddEvent(std::move(event));
  }
  void OnEventsDropped(const std::string& reason, uint32 num_events) override {
    absl::MutexLock lock(&mutex_);
    dropped_events_[reason] += num_events;
  }
  void Flush() override {}
  void Export(StepStats* step_stats) override {
    LOG(INFO) << " GpuTracer has collected " << num_callback_events_
              << " callback API events and " << num_activity_events_
              << " activity events. " << ReportDroppedEvents();
    for (int i = 0; i < num_gpus_; ++i) {
      per_device_collector_[i].Flush(i, start_walltime_ns_, start_gpu_ns_,
                                     step_stats);
    }
  }
  // Returns true if some GPU events are captured.
  bool Export(XSpace* space, uint64 end_gpu_ns) override {
    LOG(INFO) << " GpuTracer has collected " << num_callback_events_
              << " callback API events and " << num_activity_events_
              << " activity events. " << ReportDroppedEvents();
    size_t num_events = 0;
    XPlaneBuilder host_plane(
        FindOrAddMutablePlaneWithName(space, kCuptiDriverApiPlaneName));
    for (int device_ordinal = 0; device_ordinal < num_gpus_;
         ++device_ordinal) {
      std::string name = GpuPlaneName(device_ordinal);
      XPlaneBuilder device_plane(FindOrAddMutablePlaneWithName(space, name));
      device_plane.SetId(device_ordinal);
      VLOG(4) << "Creating plane for"
              << " name=" << name << " ordinal=" << device_ordinal;

      // Calculate device capabilities before flushing, so that device
      // properties are available to the occupancy calculator in Flush().
      per_device_collector_[device_ordinal].GetDeviceCapabilities(
          device_ordinal, &device_plane);
      num_events += per_device_collector_[device_ordinal].Flush(
          start_gpu_ns_, end_gpu_ns, &device_plane, &host_plane);
      NormalizeTimeStamps(&device_plane, start_walltime_ns_);
    }
    NormalizeTimeStamps(&host_plane, start_walltime_ns_);
    return num_events > 0;
  }

  std::string ReportDroppedEvents() {
    absl::MutexLock lock(&mutex_);
    std::string result;
    for (const auto& dropped : dropped_events_) {
      absl::StrAppend(&result, " ", dropped.second, " events dropped because ",
                      dropped.first, ";");
    }
    if (!result.empty()) result.back() = '.';
    return result;
  }
  std::string ReportNumEventsIfDropped() override {
    std::string events_dropped = ReportDroppedEvents();
    if (events_dropped.empty()) return "";
    return absl::StrCat("Detected GPU events dropped on ", port::Hostname(),
                        ": Profiler has collected ",
                        num_callback_events_.load(), " driver events and ",
                        num_activity_events_.load(), " device events.",
                        events_dropped);
  }

 private:
  std::atomic<int> num_callback_events_;
  std::atomic<int> num_activity_events_;
  absl::Mutex mutex_;
  absl::flat_hash_map<std::string, uint64> dropped_events_
      ABSL_GUARDED_BY(mutex_);
  uint64 start_walltime_ns_;
  uint64 start_gpu_ns_;
  int num_gpus_;

  // Sets all XLines of the specified XPlane to the starting walltime.
  // Event times in both the host and device planes are CUPTI timestamps,
  // so each line initially starts at the CUPTI trace start time. Here we
  // change each line's start time to the corresponding start_walltime_ns
  // to normalize against CPU wall time.
  static void NormalizeTimeStamps(XPlaneBuilder* plane,
                                  uint64 start_walltime_ns) {
    plane->ForEachLine(
        [&](XLineBuilder line) { line.SetTimestampNs(start_walltime_ns); });
  }

  absl::FixedArray<PerDeviceCollector> per_device_collector_;

  TF_DISALLOW_COPY_AND_ASSIGN(CuptiTraceCollectorImpl);
};

std::unique_ptr<CuptiTraceCollector> CreateCuptiCollector(
    const CuptiTracerCollectorOptions& options, const uint64 start_walltime_ns,
    const uint64 start_gputime_ns) {
  return absl::make_unique<CuptiTraceCollectorImpl>(options, start_walltime_ns,
                                                    start_gputime_ns);
}

// The strings are parser friendly and have no whitespaces in them.
absl::string_view GetMemoryKindName(int8 kind) {
  auto memory_kind = static_cast<CUpti_ActivityMemoryKind>(kind);
  switch (memory_kind) {
    case CUPTI_ACTIVITY_MEMORY_KIND_ARRAY:
      return "array";
    case CUPTI_ACTIVITY_MEMORY_KIND_DEVICE:
      return "device";
    case CUPTI_ACTIVITY_MEMORY_KIND_DEVICE_STATIC:
      return "device_static";
    case CUPTI_ACTIVITY_MEMORY_KIND_MANAGED:
      return "managed";
    case CUPTI_ACTIVITY_MEMORY_KIND_MANAGED_STATIC:
      return "managed_static";
    case CUPTI_ACTIVITY_MEMORY_KIND_PAGEABLE:
      return "pageable";
    case CUPTI_ACTIVITY_MEMORY_KIND_PINNED:
      return "pinned";
    case CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN:
    default:
      return "unknown";
  }
}

}  // namespace profiler
}  // namespace tensorflow