• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/profiler/internal/gpu/cupti_collector.h"
17 
18 #include "absl/container/flat_hash_map.h"
19 #include "absl/container/flat_hash_set.h"
20 #include "absl/hash/hash.h"
21 #include "absl/strings/str_cat.h"
22 #include "absl/strings/str_join.h"
23 #include "third_party/gpus/cuda/extras/CUPTI/include/cupti_activity.h"
24 #include "third_party/gpus/cuda/include/cuda.h"
25 #include "third_party/gpus/cuda/include/cuda_occupancy.h"
26 #include "tensorflow/core/platform/abi.h"
27 #include "tensorflow/core/platform/host_info.h"
28 #include "tensorflow/core/platform/mutex.h"
29 #include "tensorflow/core/profiler/utils/parse_annotation.h"
30 #include "tensorflow/core/profiler/utils/xplane_builder.h"
31 #include "tensorflow/core/profiler/utils/xplane_schema.h"
32 #include "tensorflow/core/profiler/utils/xplane_utils.h"
33 
34 namespace tensorflow {
35 namespace profiler {
36 
37 namespace {
38 
IsHostEvent(const CuptiTracerEvent & event,int64 * line_id)39 bool IsHostEvent(const CuptiTracerEvent& event, int64* line_id) {
40   // DriverCallback(i.e. kernel launching) events are host events.
41   if (event.source == CuptiTracerEventSource::DriverCallback) {
42     *line_id = event.thread_id;
43     return true;
44   }
45   // Non-overhead activity events are device events.
46   if (event.type != CuptiTracerEventType::Overhead) {
47     *line_id = event.stream_id;
48     return false;
49   }
50   // Overhead events can be associated with a thread or a stream, etc.
51   // If a valid thread id is specified, we consider it as a host event.
52   //
53   if (event.stream_id != CuptiTracerEvent::kInvalidStreamId) {
54     *line_id = event.stream_id;
55     return false;
56   } else if (event.thread_id != CuptiTracerEvent::kInvalidThreadId &&
57              event.thread_id != 0) {
58     *line_id = event.thread_id;
59     return true;
60   } else {
61     *line_id = kThreadIdOverhead;
62     return false;
63   }
64 }
65 
66 struct DeviceOccupancyParams {
67   cudaOccFuncAttributes attributes = {};
68   int block_size = 0;
69   size_t dynamic_smem_size = 0;
70 
operator ==(const DeviceOccupancyParams & lhs,const DeviceOccupancyParams & rhs)71   friend bool operator==(const DeviceOccupancyParams& lhs,
72                          const DeviceOccupancyParams& rhs) {
73     return 0 == memcmp(&lhs, &rhs, sizeof(lhs));
74   }
75 
76   template <typename H>
AbslHashValue(H hash_state,const DeviceOccupancyParams & params)77   friend H AbslHashValue(H hash_state, const DeviceOccupancyParams& params) {
78     return H::combine(
79         std::move(hash_state), params.attributes.maxThreadsPerBlock,
80         params.attributes.numRegs, params.attributes.sharedSizeBytes,
81         static_cast<uint32_t>(params.attributes.partitionedGCConfig),
82         static_cast<uint32_t>(params.attributes.shmemLimitConfig),
83         params.attributes.maxDynamicSharedSizeBytes, params.block_size,
84         params.dynamic_smem_size);
85   }
86 };
87 
// Result of a theoretical-occupancy computation for one kernel configuration.
struct OccupancyStats {
  double occupancy_pct = 0.0;    // Theoretical occupancy, in percent.
  int min_grid_size = 0;         // Minimum grid size reaching max occupancy.
  int suggested_block_size = 0;  // Block size that maximizes occupancy.
};
93 
94 struct CorrelationInfo {
CorrelationInfotensorflow::profiler::__anon2da8ba0c0111::CorrelationInfo95   CorrelationInfo(uint32 t, uint32 e) : thread_id(t), enqueue_time_ns(e) {}
96   uint32 thread_id;
97   uint64 enqueue_time_ns;
98 };
99 
// Per-GPU event buffer. AddEvent() accumulates CuptiTracerEvents under a
// mutex; the two Flush() overloads drain the buffer into legacy StepStats or
// into XPlanes, respectively. GetDeviceCapabilities() queries the CUDA driver
// and populates both the device XPlane stats and the inputs for the
// occupancy calculator.
class PerDeviceCollector {
 private:
  // Computes theoretical occupancy for one kernel configuration via the CUDA
  // occupancy calculator. Returns zeroed stats when device properties have
  // not been populated yet (computeMajor == 0) or when either occupancy call
  // fails.
  OccupancyStats GetOccupancy(const DeviceOccupancyParams& params) const {
    OccupancyStats stats;
    if (device_properties_.computeMajor == 0) {
      return {};
    }

    const cudaOccDeviceState state = {};
    cudaOccResult occ_result;
    cudaOccError status = cudaOccMaxActiveBlocksPerMultiprocessor(
        &occ_result, &device_properties_, &params.attributes, &state,
        params.block_size, params.dynamic_smem_size);
    if (status != CUDA_OCC_SUCCESS) {
      return {};
    }

    // Occupancy % = (active blocks per SM * threads per block) /
    //               max threads per SM.
    stats.occupancy_pct =
        occ_result.activeBlocksPerMultiprocessor * params.block_size * 100;
    stats.occupancy_pct /= device_properties_.maxThreadsPerMultiprocessor;

    status = cudaOccMaxPotentialOccupancyBlockSize(
        &stats.min_grid_size, &stats.suggested_block_size, &device_properties_,
        &params.attributes, &state, NULL, params.dynamic_smem_size);
    if (status != CUDA_OCC_SUCCESS) {
      return {};
    }

    return stats;
  }

  // Converts one CuptiTracerEvent into an XEvent on `line`, attaching
  // type-specific stats (kernel occupancy/details, memcpy / memalloc /
  // memfree / memset / memory-residency details) plus any parsed annotation
  // metadata. Events with timestamps outside [start_gpu_ns, end_gpu_ns] are
  // dropped with a VLOG.
  void CreateXEvent(const CuptiTracerEvent& event, XPlaneBuilder* plane,
                    uint64 start_gpu_ns, uint64 end_gpu_ns,
                    XLineBuilder* line) {
    if (event.start_time_ns < start_gpu_ns || event.end_time_ns > end_gpu_ns ||
        event.start_time_ns > event.end_time_ns) {
      VLOG(2) << "events have abnormal timestamps:" << event.name
              << " start time(ns): " << event.start_time_ns
              << " end time(ns): " << event.end_time_ns;
      return;
    }
    std::string kernel_name = port::MaybeAbiDemangle(event.name.c_str());
    if (kernel_name.empty()) {
      kernel_name = GetTraceEventTypeName(event.type);
    }
    XEventMetadata* event_metadata =
        plane->GetOrCreateEventMetadata(std::move(kernel_name));
    XEventBuilder xevent = line->AddEvent(*event_metadata);
    VLOG(7) << "Adding event to line=" << line->Id();
    xevent.SetTimestampNs(event.start_time_ns);
    xevent.SetEndTimestampNs(event.end_time_ns);
    if (event.source == CuptiTracerEventSource::DriverCallback) {
      xevent.AddStatValue(
          *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kDeviceId)),
          event.device_id);
    }
    if (event.correlation_id != CuptiTracerEvent::kInvalidCorrelationId) {
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
                              GetStatTypeStr(StatType::kCorrelationId)),
                          event.correlation_id);
    }
    if (!event.annotation.empty()) {
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
                              GetStatTypeStr(StatType::kKernelAnnotation)),
                          *plane->GetOrCreateStatMetadata(event.annotation));
    }
    if (!event.nvtx_range.empty()) {
      xevent.AddStatValue(
          *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kNVTXRange)),
          *plane->GetOrCreateStatMetadata(event.nvtx_range));
    }
    if (event.context_id != CuptiTracerEvent::kInvalidContextId) {
      xevent.AddStatValue(
          *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kContextId)),
          absl::StrCat("$$", static_cast<uint64>(event.context_id)));
    }

    if (event.type == CuptiTracerEventType::Kernel &&
        event.source == CuptiTracerEventSource::Activity) {
      // Build the occupancy-cache key from the kernel's launch configuration.
      DeviceOccupancyParams params{};
      params.attributes.maxThreadsPerBlock = INT_MAX;
      params.attributes.numRegs =
          static_cast<int>(event.kernel_info.registers_per_thread);
      params.attributes.sharedSizeBytes =
          event.kernel_info.static_shared_memory_usage;
      params.attributes.partitionedGCConfig = PARTITIONED_GC_OFF;
      params.attributes.shmemLimitConfig = FUNC_SHMEM_LIMIT_DEFAULT;
      params.attributes.maxDynamicSharedSizeBytes = 0;
      params.block_size = static_cast<int>(event.kernel_info.block_x *
                                           event.kernel_info.block_y *
                                           event.kernel_info.block_z);

      params.dynamic_smem_size = event.kernel_info.dynamic_shared_memory_usage;

      // 0.0 doubles as the "not yet computed" sentinel for a cache entry.
      OccupancyStats& occ_stats = occupancy_cache_[params];
      if (occ_stats.occupancy_pct == 0.0) {
        occ_stats = GetOccupancy(params);
      }
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(GetStatTypeStr(
                              StatType::kTheoreticalOccupancyPct)),
                          occ_stats.occupancy_pct);
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
                              GetStatTypeStr(StatType::kOccupancyMinGridSize)),
                          static_cast<int32>(occ_stats.min_grid_size));
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(GetStatTypeStr(
                              StatType::kOccupancySuggestedBlockSize)),
                          static_cast<int32>(occ_stats.suggested_block_size));
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
                              GetStatTypeStr(StatType::kKernelDetails)),
                          *plane->GetOrCreateStatMetadata(ToXStat(
                              event.kernel_info, occ_stats.occupancy_pct)));
    } else if (event.type == CuptiTracerEventType::MemcpyH2D ||
               event.type == CuptiTracerEventType::MemcpyD2H ||
               event.type == CuptiTracerEventType::MemcpyD2D ||
               event.type == CuptiTracerEventType::MemcpyP2P ||
               event.type == CuptiTracerEventType::MemcpyOther) {
      VLOG(7) << "Add Memcpy stat";
      const auto& memcpy_info = event.memcpy_info;
      std::string memcpy_details = absl::StrCat(
          "kind:", GetMemoryKindName(event.memcpy_info.kind),
          " size:", memcpy_info.num_bytes, " dest:", memcpy_info.destination,
          " async:", memcpy_info.async);
      xevent.AddStatValue(
          *plane->GetOrCreateStatMetadata(
              GetStatTypeStr(StatType::kMemcpyDetails)),
          *plane->GetOrCreateStatMetadata(std::move(memcpy_details)));
    } else if (event.type == CuptiTracerEventType::MemoryAlloc) {
      VLOG(7) << "Add MemAlloc stat";
      std::string value =
          absl::StrCat("kind:", GetMemoryKindName(event.memalloc_info.kind),
                       " num_bytes:", event.memalloc_info.num_bytes);
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
                              GetStatTypeStr(StatType::kMemallocDetails)),
                          *plane->GetOrCreateStatMetadata(std::move(value)));
    } else if (event.type == CuptiTracerEventType::MemoryFree) {
      VLOG(7) << "Add MemFree stat";
      std::string value =
          absl::StrCat("kind:", GetMemoryKindName(event.memfree_info.kind),
                       " num_bytes:", event.memfree_info.num_bytes);
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
                              GetStatTypeStr(StatType::kMemFreeDetails)),
                          *plane->GetOrCreateStatMetadata(std::move(value)));
    } else if (event.type == CuptiTracerEventType::Memset) {
      VLOG(7) << "Add Memset stat";
      auto value =
          absl::StrCat("kind:", GetMemoryKindName(event.memset_info.kind),
                       " num_bytes:", event.memset_info.num_bytes,
                       " async:", event.memset_info.async);
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
                              GetStatTypeStr(StatType::kMemsetDetails)),
                          *plane->GetOrCreateStatMetadata(std::move(value)));
    } else if (event.type == CuptiTracerEventType::MemoryResidency) {
      VLOG(7) << "Add MemoryResidency stat";
      std::string value = absl::StrCat(
          "kind:", GetMemoryKindName(event.memory_residency_info.kind),
          " num_bytes:", event.memory_residency_info.num_bytes,
          " addr:", event.memory_residency_info.address);
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(GetStatTypeStr(
                              StatType::kMemoryResidencyDetails)),
                          *plane->GetOrCreateStatMetadata(std::move(value)));
    }

    std::vector<Annotation> annotation_stack =
        ParseAnnotationStack(event.annotation);
    if (!annotation_stack.empty()) {
      // The outermost annotation is taken to be the TF op name.
      xevent.AddStatValue(
          *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kTfOp)),
          *plane->GetOrCreateStatMetadata(annotation_stack.begin()->name));
    }
    // If multiple metadata have the same key name, show the values from the top
    // of the stack (innermost annotation). Concatenate the values from
    // "hlo_op".
    absl::flat_hash_set<absl::string_view> key_set;

    for (auto annotation = annotation_stack.rbegin();
         annotation != annotation_stack.rend(); ++annotation) {
      for (const Annotation::Metadata& metadata : annotation->metadata) {
        if (key_set.insert(metadata.key).second) {
          xevent.ParseAndAddStatValue(
              *plane->GetOrCreateStatMetadata(metadata.key), metadata.value);
        }
      }
    }
  }

  // Queries a single CUDA device attribute. Returns nullopt on driver error.
  absl::optional<int> GetDeviceAttribute(CUdevice device,
                                         CUdevice_attribute attrib) {
    int ret_val;
    CUresult err = cuDeviceGetAttribute(&ret_val, attrib, device);
    if (err != CUDA_SUCCESS) return absl::nullopt;
    return ret_val;
  }

  // Builds a display name for a device-plane line from the stream id and the
  // set of event types observed on that line. NOTE: erases Unsupported from
  // the caller's set as a side effect.
  std::string GetDeviceXLineName(
      int64 stream_id, absl::flat_hash_set<CuptiTracerEventType>& event_types) {
    std::string line_name = absl::StrCat("Stream #", stream_id);
    event_types.erase(CuptiTracerEventType::Unsupported);
    if (event_types.empty()) return line_name;
    if (event_types.count(CuptiTracerEventType::Overhead))
      return "CUPTI overhead";
    std::vector<const char*> type_names;
    for (const auto event_type : event_types) {
      type_names.emplace_back(GetTraceEventTypeName(event_type));
    }
    return absl::StrCat(line_name, "(", absl::StrJoin(type_names, ","), ")");
  }

 public:
  PerDeviceCollector() = default;

  // Buffers one event. Driver-callback events that carry a correlation id
  // additionally record the launching thread and enqueue time so that the
  // matching activity event can be annotated later.
  void AddEvent(CuptiTracerEvent&& event) {
    mutex_lock l(m_);
    if (event.source == CuptiTracerEventSource::DriverCallback) {
      // Cupti api callback events were used to populate launch times etc.
      if (event.correlation_id != CuptiTracerEvent::kInvalidCorrelationId) {
        correlation_info_.insert(
            {event.correlation_id,
             CorrelationInfo(event.thread_id, event.start_time_ns)});
      }
      events_.emplace_back(std::move(event));
    } else {
      // Cupti activity events measure device times etc.
      events_.emplace_back(std::move(event));
    }
  }

  // Drains buffered events into legacy StepStats, grouping node stats into
  // per-stream, all-streams, memcpy, sync, and unknown-stream DeviceStepStats.
  // Clears the event buffer when done.
  void Flush(int32 device_ordinal, uint64 start_walltime_ns,
             uint64 start_gpu_ns, StepStats* step_stats) {
    mutex_lock l(m_);
    absl::flat_hash_map<std::pair<int64 /*stream_id*/, CuptiTracerEventType>,
                        DeviceStepStats*>
        stream_dev_stats_map;
    DeviceStepStats* unknown_stream_dev_stats = nullptr;
    DeviceStepStats* all_streams_dev_stats = nullptr;
    DeviceStepStats* memcpy_dev_stats = nullptr;
    DeviceStepStats* sync_dev_stats = nullptr;
    for (const CuptiTracerEvent& event : events_) {
      // `ns` is either Swap()ed into a DeviceStepStats (which takes ownership)
      // or copied then Swap()ed; it is not leaked on the paths below.
      NodeExecStats* ns = new NodeExecStats;
      // Convert GPU-clock timestamps to wall-clock microseconds.
      ns->set_all_start_micros(
          (start_walltime_ns + (event.start_time_ns - start_gpu_ns)) / 1000);
      ns->set_op_start_rel_micros(0);
      auto elapsed_ns = event.end_time_ns - event.start_time_ns;
      ns->set_op_end_rel_micros(elapsed_ns / 1000);
      ns->set_all_end_rel_micros(elapsed_ns / 1000);

      if (event.source == CuptiTracerEventSource::DriverCallback) {
        // Legacy code ignore all other launch events except
        // cuStreamSynchronize.
        if (event.name == "cuStreamSynchronize") {
          ns->set_node_name(event.name);
          ns->set_timeline_label(absl::StrCat("ThreadId ", event.thread_id));
          ns->set_thread_id(event.thread_id);
          if (sync_dev_stats == nullptr) {
            sync_dev_stats = step_stats->add_dev_stats();
            sync_dev_stats->set_device(
                absl::StrCat("/device:GPU:", device_ordinal, "/sync"));
          }
          sync_dev_stats->add_node_stats()->Swap(ns);
        }
      } else {  // CuptiTracerEventSource::Activity
        // Get launch information if available.
        if (event.correlation_id != CuptiTracerEvent::kInvalidCorrelationId) {
          auto it = correlation_info_.find(event.correlation_id);
          if (it != correlation_info_.end()) {
            ns->set_scheduled_micros(it->second.enqueue_time_ns / 1000);
            ns->set_thread_id(it->second.thread_id);
          }
        }

        auto annotation_stack = ParseAnnotationStack(event.annotation);
        std::string kernel_name = port::MaybeAbiDemangle(event.name.c_str());
        // Prefer the innermost annotation as the display name; fall back to
        // the demangled kernel name.
        std::string activity_name =
            !annotation_stack.empty()
                ? std::string(annotation_stack.back().name)
                : kernel_name;
        ns->set_node_name(activity_name);
        switch (event.type) {
          case CuptiTracerEventType::Kernel: {
            ns->set_timeline_label(absl::StrCat(
                kernel_name, " regs:", event.kernel_info.registers_per_thread,
                " shm:", event.kernel_info.static_shared_memory_usage,
                " grid: ", event.kernel_info.grid_x, ",",
                event.kernel_info.grid_y, ",", event.kernel_info.grid_z,
                " block:", event.kernel_info.block_x, ",",
                event.kernel_info.block_y, ",", event.kernel_info.block_z, "@@",
                event.annotation));
            DeviceStepStats*& stream_dev_stats =
                stream_dev_stats_map[std::make_pair(event.stream_id,
                                                    event.type)];
            if (stream_dev_stats == nullptr) {
              stream_dev_stats = step_stats->add_dev_stats();
              stream_dev_stats->set_device(absl::StrCat(
                  "/device:GPU:", device_ordinal, "/stream:", event.stream_id));
            }
            // Copy to the per-stream timeline, then move to the merged one.
            *stream_dev_stats->add_node_stats() = *ns;
            if (all_streams_dev_stats == nullptr) {
              all_streams_dev_stats = step_stats->add_dev_stats();
              all_streams_dev_stats->set_device(
                  absl::StrCat("/device:GPU:", device_ordinal, "/stream:all"));
            }
            all_streams_dev_stats->add_node_stats()->Swap(ns);
            break;
          }
          case CuptiTracerEventType::MemcpyH2D:
          case CuptiTracerEventType::MemcpyD2H:
          case CuptiTracerEventType::MemcpyD2D:
          case CuptiTracerEventType::MemcpyP2P: {
            std::string details = absl::StrCat(
                activity_name, " bytes:", event.memcpy_info.num_bytes);
            if (event.memcpy_info.async) {
              absl::StrAppend(&details, " async");
            }
            if (event.memcpy_info.destination != event.device_id) {
              absl::StrAppend(&details,
                              " to device:", event.memcpy_info.destination);
            }
            ns->set_timeline_label(std::move(details));
            DeviceStepStats*& stream_dev_stats =
                stream_dev_stats_map[std::make_pair(event.stream_id,
                                                    event.type)];
            if (stream_dev_stats == nullptr) {
              stream_dev_stats = step_stats->add_dev_stats();
              stream_dev_stats->set_device(absl::StrCat(
                  "/device:GPU:", device_ordinal, "/stream:", event.stream_id,
                  "<", GetTraceEventTypeName(event.type), ">"));
            }
            // Copy to the per-stream timeline, then move to the memcpy one.
            *stream_dev_stats->add_node_stats() = *ns;
            if (memcpy_dev_stats == nullptr) {
              memcpy_dev_stats = step_stats->add_dev_stats();
              memcpy_dev_stats->set_device(
                  absl::StrCat("/device:GPU:", device_ordinal, "/memcpy"));
            }
            memcpy_dev_stats->add_node_stats()->Swap(ns);
            break;
          }
          default:
            ns->set_timeline_label(activity_name);
            if (unknown_stream_dev_stats == nullptr) {
              unknown_stream_dev_stats = step_stats->add_dev_stats();
              unknown_stream_dev_stats->set_device(
                  absl::StrCat("/device:GPU:", device_ordinal, "/stream:"));
            }
            unknown_stream_dev_stats->add_node_stats()->Swap(ns);
            break;
        }
      }
    }
    events_.clear();
  }

  // Drains buffered events into the given device/host XPlanes, then names the
  // lines (device lines by stream + observed event types, host lines by
  // thread id). Returns the number of events drained and clears the buffer.
  size_t Flush(uint64 start_gpu_ns, uint64 end_gpu_ns,
               XPlaneBuilder* device_plane, XPlaneBuilder* host_plane) {
    mutex_lock l(m_);
    // Tracking event types per line.
    absl::flat_hash_map<int64, absl::flat_hash_set<CuptiTracerEventType>>
        events_types_per_line;
    for (auto& event : events_) {
      int64 line_id = CuptiTracerEvent::kInvalidThreadId;
      bool is_host_event = IsHostEvent(event, &line_id);
      if (line_id == CuptiTracerEvent::kInvalidThreadId ||
          line_id == CuptiTracerEvent::kInvalidStreamId) {
        VLOG(9) << "Ignoring event, type=" << static_cast<int>(event.type);
        continue;
      }
      auto* plane = is_host_event ? host_plane : device_plane;
      VLOG(9) << "Event"
              << " type=" << static_cast<int>(event.type)
              << " line_id=" << line_id
              << (is_host_event ? " host plane=" : " device plane=")
              << plane->Name();
      XLineBuilder line = plane->GetOrCreateLine(line_id);
      line.SetTimestampNs(start_gpu_ns);
      CreateXEvent(event, plane, start_gpu_ns, end_gpu_ns, &line);
      events_types_per_line[line_id].emplace(event.type);
    }
    device_plane->ForEachLine([&](XLineBuilder line) {
      line.SetName(
          GetDeviceXLineName(line.Id(), events_types_per_line[line.Id()]));
    });
    host_plane->ForEachLine([&](XLineBuilder line) {
      line.SetName(absl::StrCat("Host Threads/", line.Id()));
    });
    size_t num_events = events_.size();
    events_.clear();
    return num_events;
  }

  // Queries the CUDA driver for device capabilities, records them as stats on
  // the device XPlane, and — when every occupancy-relevant attribute is
  // available — fills device_properties_ for GetOccupancy().
  void GetDeviceCapabilities(int32 device_ordinal,
                             XPlaneBuilder* device_plane) {
    CUdevice device;
    if (cuDeviceGet(&device, device_ordinal) != CUDA_SUCCESS) return;

    auto clock_rate_in_khz =
        GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_CLOCK_RATE);
    if (clock_rate_in_khz) {
      device_plane->AddStatValue(
          *device_plane->GetOrCreateStatMetadata(
              GetStatTypeStr(StatType::kDevCapClockRateKHz)),
          *clock_rate_in_khz);
    }

    auto core_count =
        GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
    if (core_count) {
      device_plane->AddStatValue(
          *device_plane->GetOrCreateStatMetadata(
              GetStatTypeStr(StatType::kDevCapCoreCount)),
          *core_count);
    }

    auto mem_clock_khz =
        GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE);
    auto mem_bus_width_bits =
        GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH);
    if (mem_clock_khz && mem_bus_width_bits) {
      // Times 2 because HBM is DDR memory; it gets two data bits per each
      // data lane.
      auto memory_bandwidth =
          uint64{2} * (*mem_clock_khz) * 1000 * (*mem_bus_width_bits) / 8;
      device_plane->AddStatValue(
          *device_plane->GetOrCreateStatMetadata(
              GetStatTypeStr(StatType::kDevCapMemoryBandwidth)),
          memory_bandwidth);
    }

    size_t total_memory = 0;
    if (cuDeviceTotalMem(&total_memory, device) == CUDA_SUCCESS) {
      device_plane->AddStatValue(
          *device_plane->GetOrCreateStatMetadata(
              GetStatTypeStr(StatType::kDevCapMemorySize)),
          static_cast<uint64>(total_memory));
    }

    auto compute_capability_major = GetDeviceAttribute(
        device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
    if (compute_capability_major) {
      device_plane->AddStatValue(
          *device_plane->GetOrCreateStatMetadata(
              GetStatTypeStr(StatType::kDevCapComputeCapMajor)),
          *compute_capability_major);
    }
    auto compute_capability_minor = GetDeviceAttribute(
        device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
    if (compute_capability_minor) {
      device_plane->AddStatValue(
          *device_plane->GetOrCreateStatMetadata(
              GetStatTypeStr(StatType::kDevCapComputeCapMinor)),
          *compute_capability_minor);
    }

    auto max_threads_per_block =
        GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
    auto max_threads_per_sm = GetDeviceAttribute(
        device, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR);
    auto regs_per_block =
        GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK);
    auto regs_per_sm = GetDeviceAttribute(
        device, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR);
    auto warp_size = GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_WARP_SIZE);
    auto shared_mem_per_block = GetDeviceAttribute(
        device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK);
    auto shared_mem_per_sm = GetDeviceAttribute(
        device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR);
    auto shared_mem_per_block_optin = GetDeviceAttribute(
        device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN);

    // Precondition for calculating GPU occupancy is to have all of these
    // inputs. Otherwise, GPU occupancy will be left unset as 0%.
    if (core_count && compute_capability_major && compute_capability_minor &&
        max_threads_per_block && max_threads_per_sm && regs_per_block &&
        regs_per_sm && warp_size && shared_mem_per_block && shared_mem_per_sm &&
        shared_mem_per_block_optin) {
      device_properties_.computeMajor = *compute_capability_major;
      device_properties_.computeMinor = *compute_capability_minor;
      device_properties_.numSms = *core_count;
      device_properties_.maxThreadsPerBlock = *max_threads_per_block;
      device_properties_.maxThreadsPerMultiprocessor = *max_threads_per_sm;
      device_properties_.regsPerBlock = *regs_per_block;
      device_properties_.regsPerMultiprocessor = *regs_per_sm;
      device_properties_.warpSize = *warp_size;
      device_properties_.sharedMemPerBlock = *shared_mem_per_block;
      device_properties_.sharedMemPerMultiprocessor = *shared_mem_per_sm;
      device_properties_.sharedMemPerBlockOptin = *shared_mem_per_block_optin;
    }
  }

 private:
  mutex m_;
  // Buffered events for this device, drained by Flush().
  std::vector<CuptiTracerEvent> events_ TF_GUARDED_BY(m_);
  // correlation_id -> host launch info, recorded from driver callbacks.
  absl::flat_hash_map<uint32, CorrelationInfo> correlation_info_
      TF_GUARDED_BY(m_);
  // Inputs to the CUDA occupancy calculator; zeroed until
  // GetDeviceCapabilities() succeeds.
  cudaOccDeviceProp device_properties_;
  // Memoizes occupancy results per kernel launch configuration.
  absl::flat_hash_map<DeviceOccupancyParams, OccupancyStats> occupancy_cache_;
};
594 
595 }  // namespace
596 
Add(uint32 device_id,uint32 correlation_id,const absl::string_view annotation,const absl::string_view nvtx_range)597 void AnnotationMap::Add(uint32 device_id, uint32 correlation_id,
598                         const absl::string_view annotation,
599                         const absl::string_view nvtx_range) {
600   if (annotation.empty() && nvtx_range.empty()) return;
601   VLOG(3) << "Add annotation: device_id: " << device_id
602           << " correlation_id: " << correlation_id
603           << " annotation: " << annotation;
604   if (device_id >= per_device_map_.size()) return;
605   auto& per_device_map = per_device_map_[device_id];
606   absl::MutexLock lock(&per_device_map.mutex);
607   if (per_device_map.annotations.size() < max_size_) {
608     AnnotationInfo info;
609     info.annotation = *per_device_map.annotations.emplace(annotation).first;
610     if (!nvtx_range.empty())
611       info.nvtx_range = *per_device_map.nvtx_ranges.emplace(nvtx_range).first;
612     per_device_map.correlation_map.emplace(correlation_id, info);
613   }
614 }
615 
LookUp(uint32 device_id,uint32 correlation_id)616 AnnotationMap::AnnotationInfo AnnotationMap::LookUp(uint32 device_id,
617                                                     uint32 correlation_id) {
618   if (device_id >= per_device_map_.size()) return AnnotationInfo();
619   auto& per_device_map = per_device_map_[device_id];
620   absl::MutexLock lock(&per_device_map.mutex);
621   auto it = per_device_map.correlation_map.find(correlation_id);
622   return it != per_device_map.correlation_map.end() ? it->second
623                                                     : AnnotationInfo();
624 }
625 
// CuptiTraceCollectorImpl store the CuptiTracerEvents from CuptiTracer and
// eventually convert and filter them to StepStats or XSpace.
class CuptiTraceCollectorImpl : public CuptiTraceCollector {
 public:
  CuptiTraceCollectorImpl(const CuptiTracerCollectorOptions& option,
                          uint64 start_walltime_ns, uint64 start_gpu_ns)
      : CuptiTraceCollector(option),
        num_callback_events_(0),
        num_activity_events_(0),
        start_walltime_ns_(start_walltime_ns),
        start_gpu_ns_(start_gpu_ns),
        num_gpus_(option.num_gpus),
        per_device_collector_(option.num_gpus) {}

  // Routes one event to its device's collector. Events for out-of-range
  // device ids are silently ignored. Once the configured per-category cap is
  // exceeded, the event is tallied as dropped instead of stored. Note: the
  // caps are checked with '>', so one event past each cap is still admitted.
  void AddEvent(CuptiTracerEvent&& event) override {
    if (event.device_id >= num_gpus_) return;
    if (event.source == CuptiTracerEventSource::DriverCallback) {
      if (num_callback_events_ > options_.max_callback_api_events) {
        OnEventsDropped("total driver(callback) events reaches max", 1);
        return;
      }
      num_callback_events_++;
    } else {
      if (num_activity_events_ > options_.max_activity_api_events) {
        OnEventsDropped("total device(activity) events reaches max", 1);
        return;
      }
      num_activity_events_++;
    }
    per_device_collector_[event.device_id].AddEvent(std::move(event));
  }
  // Accumulates a per-reason tally of dropped events for later reporting.
  void OnEventsDropped(const std::string& reason, uint32 num_events) override {
    absl::MutexLock lock(&mutex_);
    dropped_events_[reason] += num_events;
  }
  void Flush() override {}
  // Exports collected events in the legacy StepStats format, flushing each
  // per-device collector in turn.
  void Export(StepStats* step_stats) override {
    LOG(INFO) << " GpuTracer has collected " << num_callback_events_
              << " callback api events and " << num_activity_events_
              << " activity events. " << ReportDroppedEvents();
    for (int i = 0; i < num_gpus_; ++i) {
      per_device_collector_[i].Flush(i, start_walltime_ns_, start_gpu_ns_,
                                     step_stats);
    }
  }
  // Returns true if some GPU events are captured.
  bool Export(XSpace* space, uint64 end_gpu_ns) override {
    LOG(INFO) << " GpuTracer has collected " << num_callback_events_
              << " callback api events and " << num_activity_events_
              << " activity events. " << ReportDroppedEvents();
    size_t num_events = 0;
    // Host-side (driver callback) events all land in a single CUPTI driver
    // API plane; device events get one plane per GPU.
    XPlaneBuilder host_plane(
        FindOrAddMutablePlaneWithName(space, kCuptiDriverApiPlaneName));
    for (int device_ordinal = 0; device_ordinal < num_gpus_; ++device_ordinal) {
      std::string name = GpuPlaneName(device_ordinal);
      XPlaneBuilder device_plane(FindOrAddMutablePlaneWithName(space, name));
      device_plane.SetId(device_ordinal);
      VLOG(4) << "Creating plane for"
              << " name=" << name << " ordinal=" << device_ordinal;

      // Calculate device capabilities before flushing, so that device
      // properties are available to the occupancy calculator in Flush().
      per_device_collector_[device_ordinal].GetDeviceCapabilities(
          device_ordinal, &device_plane);
      num_events += per_device_collector_[device_ordinal].Flush(
          start_gpu_ns_, end_gpu_ns, &device_plane, &host_plane);
      NormalizeTimeStamps(&device_plane, start_walltime_ns_);
    }
    NormalizeTimeStamps(&host_plane, start_walltime_ns_);
    return num_events > 0;
  }

  // Renders the dropped-event tallies as a single human-readable string,
  // e.g. " 3 events dropped because <reason>." Empty when nothing dropped.
  std::string ReportDroppedEvents() {
    absl::MutexLock lock(&mutex_);
    string result;
    for (const auto& dropped : dropped_events_) {
      absl::StrAppend(&result, " ", dropped.second, " events dropped because ",
                      dropped.first, ";");
    }
    // Replace the trailing ';' of the last entry with a period.
    if (!result.empty()) result.back() = '.';
    return result;
  }
  // Returns a hostname-tagged summary of collected/dropped event counts, or
  // the empty string when no events were dropped.
  std::string ReportNumEventsIfDropped() override {
    std::string events_dropped = ReportDroppedEvents();
    if (events_dropped.empty()) return "";
    return absl::StrCat("Detected GPU events dropped on ", port::Hostname(),
                        ": Profiler has collected ",
                        num_callback_events_.load(), " driver events and ",
                        num_activity_events_.load(), " device events.",
                        events_dropped);
  }

 private:
  // Event counters are atomic because AddEvent may run on multiple threads.
  std::atomic<int> num_callback_events_;
  std::atomic<int> num_activity_events_;
  absl::Mutex mutex_;
  absl::flat_hash_map<std::string, uint64> dropped_events_
      ABSL_GUARDED_BY(mutex_);
  uint64 start_walltime_ns_;
  uint64 start_gpu_ns_;
  int num_gpus_;

  // Set the all XLines of specified XPlane to starting walltime.
  // Events time in both host and device planes are CUPTI timestamps.
  // We set initial CUPTI timestamp as start time for all lines to reflect
  // this fact. Eventually we change line start time to corresponding
  // start_walltime_ns to normalize with CPU wall time.
  static void NormalizeTimeStamps(XPlaneBuilder* plane,
                                  uint64 start_walltime_ns) {
    plane->ForEachLine(
        [&](XLineBuilder line) { line.SetTimestampNs(start_walltime_ns); });
  }

  // One collector per GPU; sized once at construction.
  absl::FixedArray<PerDeviceCollector> per_device_collector_;

  TF_DISALLOW_COPY_AND_ASSIGN(CuptiTraceCollectorImpl);
};
743 
CreateCuptiCollector(const CuptiTracerCollectorOptions & options,const uint64 start_walltime_ns,const uint64 start_gputime_ns)744 std::unique_ptr<CuptiTraceCollector> CreateCuptiCollector(
745     const CuptiTracerCollectorOptions& options, const uint64 start_walltime_ns,
746     const uint64 start_gputime_ns) {
747   return absl::make_unique<CuptiTraceCollectorImpl>(options, start_walltime_ns,
748                                                     start_gputime_ns);
749 }
750 
751 // The strings are parser friendly and have no whitespaces in them.
GetMemoryKindName(int8 kind)752 absl::string_view GetMemoryKindName(int8 kind) {
753   auto memory_kind = static_cast<CUpti_ActivityMemoryKind>(kind);
754   switch (memory_kind) {
755     case CUPTI_ACTIVITY_MEMORY_KIND_ARRAY:
756       return "array";
757     case CUPTI_ACTIVITY_MEMORY_KIND_DEVICE:
758       return "device";
759     case CUPTI_ACTIVITY_MEMORY_KIND_DEVICE_STATIC:
760       return "device_static";
761     case CUPTI_ACTIVITY_MEMORY_KIND_MANAGED:
762       return "managed";
763     case CUPTI_ACTIVITY_MEMORY_KIND_MANAGED_STATIC:
764       return "managed_static";
765     case CUPTI_ACTIVITY_MEMORY_KIND_PAGEABLE:
766       return "pageable";
767     case CUPTI_ACTIVITY_MEMORY_KIND_PINNED:
768       return "pinned";
769     case CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN:
770     default:
771       return "unknown";
772   }
773 }
774 
775 }  // namespace profiler
776 }  // namespace tensorflow
777