• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/profiler/internal/gpu/cupti_collector.h"
17 
18 #include "absl/container/flat_hash_map.h"
19 #include "absl/container/flat_hash_set.h"
20 #include "absl/hash/hash.h"
21 #include "absl/strings/str_cat.h"
22 #include "absl/strings/str_join.h"
23 #include "third_party/gpus/cuda/extras/CUPTI/include/cupti_activity.h"
24 #include "third_party/gpus/cuda/include/cuda.h"
25 #include "third_party/gpus/cuda/include/cuda_occupancy.h"
26 #include "tensorflow/core/platform/abi.h"
27 #include "tensorflow/core/platform/host_info.h"
28 #include "tensorflow/core/platform/mutex.h"
29 #include "tensorflow/core/profiler/utils/parse_annotation.h"
30 #include "tensorflow/core/profiler/utils/xplane_builder.h"
31 #include "tensorflow/core/profiler/utils/xplane_schema.h"
32 #include "tensorflow/core/profiler/utils/xplane_utils.h"
33 
34 namespace tensorflow {
35 namespace profiler {
36 
37 namespace {
38 
IsHostEvent(const CuptiTracerEvent & event,int64 * line_id)39 bool IsHostEvent(const CuptiTracerEvent& event, int64* line_id) {
40   // DriverCallback(i.e. kernel launching) events are host events.
41   if (event.source == CuptiTracerEventSource::DriverCallback) {
42     *line_id = event.thread_id;
43     return true;
44   }
45   // Non-overhead activity events are device events.
46   if (event.type != CuptiTracerEventType::Overhead) {
47     *line_id = event.stream_id;
48     return false;
49   }
50   // Overhead events can be associated with a thread or a stream, etc.
51   // If a valid thread id is specified, we consider it as a host event.
52   //
53   if (event.stream_id != CuptiTracerEvent::kInvalidStreamId) {
54     *line_id = event.stream_id;
55     return false;
56   } else if (event.thread_id != CuptiTracerEvent::kInvalidThreadId &&
57              event.thread_id != 0) {
58     *line_id = event.thread_id;
59     return true;
60   } else {
61     *line_id = kThreadIdOverhead;
62     return false;
63   }
64 }
65 
66 struct DeviceOccupancyParams {
67   cudaOccFuncAttributes attributes = {};
68   int block_size = 0;
69   size_t dynamic_smem_size = 0;
70 
operator ==(const DeviceOccupancyParams & lhs,const DeviceOccupancyParams & rhs)71   friend bool operator==(const DeviceOccupancyParams& lhs,
72                          const DeviceOccupancyParams& rhs) {
73     return 0 == memcmp(&lhs, &rhs, sizeof(lhs));
74   }
75 
76   template <typename H>
AbslHashValue(H hash_state,const DeviceOccupancyParams & params)77   friend H AbslHashValue(H hash_state, const DeviceOccupancyParams& params) {
78     return H::combine(
79         std::move(hash_state), params.attributes.maxThreadsPerBlock,
80         params.attributes.numRegs, params.attributes.sharedSizeBytes,
81         static_cast<uint32_t>(params.attributes.partitionedGCConfig),
82         static_cast<uint32_t>(params.attributes.shmemLimitConfig),
83         params.attributes.maxDynamicSharedSizeBytes, params.block_size,
84         params.dynamic_smem_size);
85   }
86 };
87 
// Result of one cached occupancy query (see PerDeviceCollector::GetOccupancy).
struct OccupancyStats {
  double occupancy_pct = 0.0;    // Theoretical occupancy, in percent.
  int min_grid_size = 0;         // Minimum grid size to reach max occupancy.
  int suggested_block_size = 0;  // Block size suggested by the calculator.
};
93 
// Accumulates CUPTI events for a single GPU and converts them into XPlane
// lines/events when Flush() is called. AddEvent() may be called from multiple
// threads; the event buffer is guarded by m_.
class PerDeviceCollector {
 private:
  // Computes theoretical occupancy stats for a kernel launch described by
  // `params`, using the device properties captured by GetDeviceCapabilities().
  // Returns zeroed stats when device properties are not yet populated or when
  // the CUDA occupancy calculator reports an error.
  OccupancyStats GetOccupancy(const DeviceOccupancyParams& params) const {
    OccupancyStats stats;
    // computeMajor == 0 means GetDeviceCapabilities() never filled in
    // device_properties_, so occupancy cannot be computed.
    if (device_properties_.computeMajor == 0) {
      return {};
    }

    const cudaOccDeviceState state = {};
    cudaOccResult occ_result;
    cudaOccError status = cudaOccMaxActiveBlocksPerMultiprocessor(
        &occ_result, &device_properties_, &params.attributes, &state,
        params.block_size, params.dynamic_smem_size);
    if (status != CUDA_OCC_SUCCESS) {
      return {};
    }

    // occupancy % = (active blocks per SM * threads per block)
    //               / max threads per SM * 100.
    stats.occupancy_pct =
        occ_result.activeBlocksPerMultiprocessor * params.block_size * 100;
    stats.occupancy_pct /= device_properties_.maxThreadsPerMultiprocessor;

    // Also ask the calculator which launch configuration would maximize
    // occupancy for this kernel.
    status = cudaOccMaxPotentialOccupancyBlockSize(
        &stats.min_grid_size, &stats.suggested_block_size, &device_properties_,
        &params.attributes, &state, NULL, params.dynamic_smem_size);
    if (status != CUDA_OCC_SUCCESS) {
      return {};
    }

    return stats;
  }

  // Translates one CuptiTracerEvent into an XEvent on `line`, attaching
  // type-specific stats (kernel occupancy/details, memcpy/memalloc/memfree/
  // memset/residency details, annotation metadata). Events whose timestamps
  // fall outside [start_gpu_ns, end_gpu_ns] or are inverted are dropped.
  void CreateXEvent(const CuptiTracerEvent& event, XPlaneBuilder* plane,
                    uint64 start_gpu_ns, uint64 end_gpu_ns,
                    XLineBuilder* line) {
    if (event.start_time_ns < start_gpu_ns || event.end_time_ns > end_gpu_ns ||
        event.start_time_ns > event.end_time_ns) {
      VLOG(2) << "events have abnormal timestamps:" << event.name
              << " start time(ns): " << event.start_time_ns
              << " end time(ns): " << event.end_time_ns;
      return;
    }
    // Demangle the kernel symbol for display; fall back to the generic event
    // type name when the event carries no name.
    std::string kernel_name = port::MaybeAbiDemangle(event.name.c_str());
    if (kernel_name.empty()) {
      kernel_name = GetTraceEventTypeName(event.type);
    }
    XEventMetadata* event_metadata =
        plane->GetOrCreateEventMetadata(std::move(kernel_name));
    XEventBuilder xevent = line->AddEvent(*event_metadata);
    VLOG(7) << "Adding event to line=" << line->Id();
    xevent.SetTimestampNs(event.start_time_ns);
    xevent.SetEndTimestampNs(event.end_time_ns);
    // Driver callbacks land on host-thread lines, so record which device the
    // call targeted.
    if (event.source == CuptiTracerEventSource::DriverCallback) {
      xevent.AddStatValue(
          *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kDeviceId)),
          event.device_id);
    }
    if (event.correlation_id != CuptiTracerEvent::kInvalidCorrelationId) {
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
                              GetStatTypeStr(StatType::kCorrelationId)),
                          event.correlation_id);
    }
    if (!event.annotation.empty()) {
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
                              GetStatTypeStr(StatType::kKernelAnnotation)),
                          *plane->GetOrCreateStatMetadata(event.annotation));
    }
    if (!event.nvtx_range.empty()) {
      xevent.AddStatValue(
          *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kNVTXRange)),
          *plane->GetOrCreateStatMetadata(event.nvtx_range));
    }
    if (event.context_id != CuptiTracerEvent::kInvalidContextId) {
      // NOTE(review): the context id is stored with a "$$" prefix — presumably
      // a marker for downstream consumers; confirm before changing.
      xevent.AddStatValue(
          *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kContextId)),
          absl::StrCat("$$", static_cast<uint64>(event.context_id)));
    }

    if (event.type == CuptiTracerEventType::Kernel &&
        event.source == CuptiTracerEventSource::Activity) {
      // Build the occupancy-cache key from the kernel's resource usage and
      // launch configuration.
      DeviceOccupancyParams params{};
      params.attributes.maxThreadsPerBlock = INT_MAX;
      params.attributes.numRegs =
          static_cast<int>(event.kernel_info.registers_per_thread);
      params.attributes.sharedSizeBytes =
          event.kernel_info.static_shared_memory_usage;
      params.attributes.partitionedGCConfig = PARTITIONED_GC_OFF;
      params.attributes.shmemLimitConfig = FUNC_SHMEM_LIMIT_DEFAULT;
      params.attributes.maxDynamicSharedSizeBytes = 0;
      params.block_size = static_cast<int>(event.kernel_info.block_x *
                                           event.kernel_info.block_y *
                                           event.kernel_info.block_z);

      params.dynamic_smem_size = event.kernel_info.dynamic_shared_memory_usage;

      // A 0% cache entry is treated as "not computed yet" and recomputed; a
      // kernel with genuinely 0% occupancy is recomputed on every event.
      OccupancyStats& occ_stats = occupancy_cache_[params];
      if (occ_stats.occupancy_pct == 0.0) {
        occ_stats = GetOccupancy(params);
      }
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(GetStatTypeStr(
                              StatType::kTheoreticalOccupancyPct)),
                          occ_stats.occupancy_pct);
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
                              GetStatTypeStr(StatType::kOccupancyMinGridSize)),
                          static_cast<int32>(occ_stats.min_grid_size));
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(GetStatTypeStr(
                              StatType::kOccupancySuggestedBlockSize)),
                          static_cast<int32>(occ_stats.suggested_block_size));
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
                              GetStatTypeStr(StatType::kKernelDetails)),
                          *plane->GetOrCreateStatMetadata(ToXStat(
                              event.kernel_info, occ_stats.occupancy_pct)));
    } else if (event.type == CuptiTracerEventType::MemcpyH2D ||
               event.type == CuptiTracerEventType::MemcpyD2H ||
               event.type == CuptiTracerEventType::MemcpyD2D ||
               event.type == CuptiTracerEventType::MemcpyP2P ||
               event.type == CuptiTracerEventType::MemcpyOther) {
      const auto& memcpy_info = event.memcpy_info;
      std::string value = absl::StrCat(
          "kind_src:", GetMemoryKindName(event.memcpy_info.src_mem_kind),
          " kind_dst:", GetMemoryKindName(event.memcpy_info.dst_mem_kind),
          " size:", memcpy_info.num_bytes, " dest:", memcpy_info.destination,
          " async:", memcpy_info.async);
      VLOG(7) << "Add Memcpy stat. " << value;
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
                              GetStatTypeStr(StatType::kMemcpyDetails)),
                          *plane->GetOrCreateStatMetadata(std::move(value)));
    } else if (event.type == CuptiTracerEventType::MemoryAlloc) {
      std::string value =
          absl::StrCat("kind:", GetMemoryKindName(event.memalloc_info.mem_kind),
                       " num_bytes:", event.memalloc_info.num_bytes);
      VLOG(7) << "Add MemAlloc stat. " << value;
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
                              GetStatTypeStr(StatType::kMemallocDetails)),
                          *plane->GetOrCreateStatMetadata(std::move(value)));
    } else if (event.type == CuptiTracerEventType::MemoryFree) {
      std::string value =
          absl::StrCat("kind:", GetMemoryKindName(event.memfree_info.mem_kind),
                       " num_bytes:", event.memfree_info.num_bytes);
      VLOG(7) << "Add MemFree stat. " << value;
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
                              GetStatTypeStr(StatType::kMemFreeDetails)),
                          *plane->GetOrCreateStatMetadata(std::move(value)));
    } else if (event.type == CuptiTracerEventType::Memset) {
      std::string value =
          absl::StrCat("kind:", GetMemoryKindName(event.memset_info.mem_kind),
                       " num_bytes:", event.memset_info.num_bytes,
                       " async:", event.memset_info.async);
      VLOG(7) << "Add Memset stat. " << value;
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
                              GetStatTypeStr(StatType::kMemsetDetails)),
                          *plane->GetOrCreateStatMetadata(std::move(value)));
    } else if (event.type == CuptiTracerEventType::MemoryResidency) {
      std::string value = absl::StrCat(
          "kind:", GetMemoryKindName(event.memory_residency_info.mem_kind),
          " num_bytes:", event.memory_residency_info.num_bytes, " addr:0x",
          absl::Hex(event.memory_residency_info.address, absl::kZeroPad16));
      VLOG(7) << "Add MemoryResidency stat. " << value;
      xevent.AddStatValue(*plane->GetOrCreateStatMetadata(GetStatTypeStr(
                              StatType::kMemoryResidencyDetails)),
                          *plane->GetOrCreateStatMetadata(std::move(value)));
    }

    std::vector<Annotation> annotation_stack =
        ParseAnnotationStack(event.annotation);
    // The outermost annotation (stack bottom) names the TF op.
    if (!annotation_stack.empty()) {
      xevent.AddStatValue(
          *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kTfOp)),
          *plane->GetOrCreateStatMetadata(annotation_stack.begin()->name));
    }
    // If multiple metadata have the same key name, show the values from the top
    // of the stack (innermost annotation). Concatenate the values from
    // "hlo_op".
    absl::flat_hash_set<absl::string_view> key_set;

    // Iterate innermost-first so the first insert for each key wins.
    for (auto annotation = annotation_stack.rbegin();
         annotation != annotation_stack.rend(); ++annotation) {
      for (const Annotation::Metadata& metadata : annotation->metadata) {
        if (key_set.insert(metadata.key).second) {
          xevent.ParseAndAddStatValue(
              *plane->GetOrCreateStatMetadata(metadata.key), metadata.value);
        }
      }
    }
  }

  // Queries a single CUDA device attribute; returns nullopt on driver error.
  absl::optional<int> GetDeviceAttribute(CUdevice device,
                                         CUdevice_attribute attrib) {
    int ret_val;
    CUresult err = cuDeviceGetAttribute(&ret_val, attrib, device);
    if (err != CUDA_SUCCESS) return absl::nullopt;
    return ret_val;
  }

  // Builds a display name for a device line: "Stream #<id>", optionally
  // suffixed with the event types seen on it, or "CUPTI overhead" for the
  // overhead line. Note: erases Unsupported from `event_types` in place.
  std::string GetDeviceXLineName(
      int64_t stream_id,
      absl::flat_hash_set<CuptiTracerEventType>& event_types) {
    std::string line_name = absl::StrCat("Stream #", stream_id);
    event_types.erase(CuptiTracerEventType::Unsupported);
    if (event_types.empty()) return line_name;
    if (event_types.count(CuptiTracerEventType::Overhead))
      return "CUPTI overhead";
    std::vector<const char*> type_names;
    for (const auto event_type : event_types) {
      type_names.emplace_back(GetTraceEventTypeName(event_type));
    }
    return absl::StrCat(line_name, "(", absl::StrJoin(type_names, ","), ")");
  }

 public:
  PerDeviceCollector() = default;

  // Buffers one event until Flush(). Thread-safe.
  void AddEvent(CuptiTracerEvent&& event) {
    mutex_lock l(m_);
    events_.emplace_back(std::move(event));
  }

  // Converts all buffered events into XEvents on the host/device planes and
  // clears the buffer. Returns the number of events that were buffered
  // (including any dropped for invalid line ids or abnormal timestamps).
  size_t Flush(uint64 start_gpu_ns, uint64 end_gpu_ns,
               XPlaneBuilder* device_plane, XPlaneBuilder* host_plane) {
    mutex_lock l(m_);
    // Tracking event types per line.
    absl::flat_hash_map<int64, absl::flat_hash_set<CuptiTracerEventType>>
        events_types_per_line;
    for (auto& event : events_) {
      int64_t line_id = CuptiTracerEvent::kInvalidThreadId;
      bool is_host_event = IsHostEvent(event, &line_id);
      if (line_id == CuptiTracerEvent::kInvalidThreadId ||
          line_id == CuptiTracerEvent::kInvalidStreamId) {
        VLOG(9) << "Ignoring event, type=" << static_cast<int>(event.type);
        continue;
      }
      auto* plane = is_host_event ? host_plane : device_plane;
      VLOG(9) << "Event"
              << " type=" << static_cast<int>(event.type)
              << " line_id=" << line_id
              << (is_host_event ? " host plane=" : " device plane=")
              << plane->Name();
      XLineBuilder line = plane->GetOrCreateLine(line_id);
      line.SetTimestampNs(start_gpu_ns);
      CreateXEvent(event, plane, start_gpu_ns, end_gpu_ns, &line);
      events_types_per_line[line_id].emplace(event.type);
    }
    // Name the lines now that every line's set of event types is known.
    device_plane->ForEachLine([&](XLineBuilder line) {
      line.SetName(
          GetDeviceXLineName(line.Id(), events_types_per_line[line.Id()]));
    });
    host_plane->ForEachLine([&](XLineBuilder line) {
      line.SetName(absl::StrCat("Host Threads/", line.Id()));
    });
    size_t num_events = events_.size();
    events_.clear();
    return num_events;
  }

  // Queries device attributes, records them as plane stats, and fills in
  // device_properties_ for the occupancy calculator. Must run before Flush()
  // for occupancy stats to be available.
  void GetDeviceCapabilities(int32_t device_ordinal,
                             XPlaneBuilder* device_plane) {
    CUdevice device;
    if (cuDeviceGet(&device, device_ordinal) != CUDA_SUCCESS) return;

    auto clock_rate_in_khz =
        GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_CLOCK_RATE);
    if (clock_rate_in_khz) {
      device_plane->AddStatValue(
          *device_plane->GetOrCreateStatMetadata(
              GetStatTypeStr(StatType::kDevCapClockRateKHz)),
          *clock_rate_in_khz);
    }

    auto core_count =
        GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
    if (core_count) {
      device_plane->AddStatValue(
          *device_plane->GetOrCreateStatMetadata(
              GetStatTypeStr(StatType::kDevCapCoreCount)),
          *core_count);
    }

    auto mem_clock_khz =
        GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE);
    auto mem_bus_width_bits =
        GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH);
    if (mem_clock_khz && mem_bus_width_bits) {
      // Times 2 because HBM is DDR memory; it gets two data bits per each
      // data lane.
      auto memory_bandwidth =
          uint64{2} * (*mem_clock_khz) * 1000 * (*mem_bus_width_bits) / 8;
      device_plane->AddStatValue(
          *device_plane->GetOrCreateStatMetadata(
              GetStatTypeStr(StatType::kDevCapMemoryBandwidth)),
          memory_bandwidth);
    }

    size_t total_memory = 0;
    if (cuDeviceTotalMem(&total_memory, device) == CUDA_SUCCESS) {
      device_plane->AddStatValue(
          *device_plane->GetOrCreateStatMetadata(
              GetStatTypeStr(StatType::kDevCapMemorySize)),
          static_cast<uint64>(total_memory));
    }

    auto compute_capability_major = GetDeviceAttribute(
        device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
    if (compute_capability_major) {
      device_plane->AddStatValue(
          *device_plane->GetOrCreateStatMetadata(
              GetStatTypeStr(StatType::kDevCapComputeCapMajor)),
          *compute_capability_major);
    }
    auto compute_capability_minor = GetDeviceAttribute(
        device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
    if (compute_capability_minor) {
      device_plane->AddStatValue(
          *device_plane->GetOrCreateStatMetadata(
              GetStatTypeStr(StatType::kDevCapComputeCapMinor)),
          *compute_capability_minor);
    }

    auto max_threads_per_block =
        GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
    auto max_threads_per_sm = GetDeviceAttribute(
        device, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR);
    auto regs_per_block =
        GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK);
    auto regs_per_sm = GetDeviceAttribute(
        device, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR);
    auto warp_size = GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_WARP_SIZE);
    auto shared_mem_per_block = GetDeviceAttribute(
        device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK);
    auto shared_mem_per_sm = GetDeviceAttribute(
        device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR);
    auto shared_mem_per_block_optin = GetDeviceAttribute(
        device, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN);

    // Precondition for calculating GPU occupancy is to have all of these
    // inputs. Otherwise, GPU occupancy will be left unset as 0%.
    if (core_count && compute_capability_major && compute_capability_minor &&
        max_threads_per_block && max_threads_per_sm && regs_per_block &&
        regs_per_sm && warp_size && shared_mem_per_block && shared_mem_per_sm &&
        shared_mem_per_block_optin) {
      device_properties_.computeMajor = *compute_capability_major;
      device_properties_.computeMinor = *compute_capability_minor;
      device_properties_.numSms = *core_count;
      device_properties_.maxThreadsPerBlock = *max_threads_per_block;
      device_properties_.maxThreadsPerMultiprocessor = *max_threads_per_sm;
      device_properties_.regsPerBlock = *regs_per_block;
      device_properties_.regsPerMultiprocessor = *regs_per_sm;
      device_properties_.warpSize = *warp_size;
      device_properties_.sharedMemPerBlock = *shared_mem_per_block;
      device_properties_.sharedMemPerMultiprocessor = *shared_mem_per_sm;
      device_properties_.sharedMemPerBlockOptin = *shared_mem_per_block_optin;
    }
  }

 private:
  mutex m_;
  // Events buffered by AddEvent() until the next Flush().
  std::vector<CuptiTracerEvent> events_ TF_GUARDED_BY(m_);
  // Zero-initialized until GetDeviceCapabilities() succeeds; GetOccupancy()
  // checks computeMajor to detect the unpopulated state.
  cudaOccDeviceProp device_properties_;
  // Memoizes occupancy results per (kernel attributes, launch config) key.
  absl::flat_hash_map<DeviceOccupancyParams, OccupancyStats> occupancy_cache_;
};
452 
453 }  // namespace
454 
Add(uint32 device_id,uint32 correlation_id,const absl::string_view annotation,const absl::string_view nvtx_range)455 void AnnotationMap::Add(uint32 device_id, uint32 correlation_id,
456                         const absl::string_view annotation,
457                         const absl::string_view nvtx_range) {
458   if (annotation.empty() && nvtx_range.empty()) return;
459   VLOG(3) << "Add annotation: device_id: " << device_id
460           << " correlation_id: " << correlation_id
461           << " annotation: " << annotation;
462   if (device_id >= per_device_map_.size()) return;
463   auto& per_device_map = per_device_map_[device_id];
464   absl::MutexLock lock(&per_device_map.mutex);
465   if (per_device_map.annotations.size() < max_size_) {
466     AnnotationInfo info;
467     info.annotation = *per_device_map.annotations.emplace(annotation).first;
468     if (!nvtx_range.empty())
469       info.nvtx_range = *per_device_map.nvtx_ranges.emplace(nvtx_range).first;
470     per_device_map.correlation_map.emplace(correlation_id, info);
471   }
472 }
473 
LookUp(uint32 device_id,uint32 correlation_id)474 AnnotationMap::AnnotationInfo AnnotationMap::LookUp(uint32 device_id,
475                                                     uint32 correlation_id) {
476   if (device_id >= per_device_map_.size()) return AnnotationInfo();
477   auto& per_device_map = per_device_map_[device_id];
478   absl::MutexLock lock(&per_device_map.mutex);
479   auto it = per_device_map.correlation_map.find(correlation_id);
480   return it != per_device_map.correlation_map.end() ? it->second
481                                                     : AnnotationInfo();
482 }
483 
// CuptiTraceCollectorImpl stores the CuptiTracerEvents from CuptiTracer and
// eventually converts and filters them to XSpace.
class CuptiTraceCollectorImpl : public CuptiTraceCollector {
 public:
  CuptiTraceCollectorImpl(const CuptiTracerCollectorOptions& option,
                          uint64 start_walltime_ns, uint64 start_gpu_ns)
      : CuptiTraceCollector(option),
        num_callback_events_(0),
        num_activity_events_(0),
        start_walltime_ns_(start_walltime_ns),
        start_gpu_ns_(start_gpu_ns),
        num_gpus_(option.num_gpus),
        per_device_collector_(option.num_gpus) {}

  // Routes an event to its device's PerDeviceCollector, dropping it when the
  // per-source event budget (callback vs. activity) has been exhausted.
  void AddEvent(CuptiTracerEvent&& event) override {
    if (event.device_id >= num_gpus_) return;
    if (event.source == CuptiTracerEventSource::DriverCallback) {
      if (num_callback_events_ > options_.max_callback_api_events) {
        OnEventsDropped("total driver(callback) events reaches max", 1);
        return;
      }
      num_callback_events_++;
    } else {
      if (num_activity_events_ > options_.max_activity_api_events) {
        OnEventsDropped("total device(activity) events reaches max", 1);
        return;
      }
      num_activity_events_++;
    }
    per_device_collector_[event.device_id].AddEvent(std::move(event));
  }
  // Tallies dropped events by reason for later reporting.
  void OnEventsDropped(const std::string& reason, uint32 num_events) override {
    absl::MutexLock lock(&mutex_);
    dropped_events_[reason] += num_events;
  }
  // No-op: events are flushed per device during Export().
  void Flush() override {}
  // Returns true if some GPU events are captured.
  bool Export(XSpace* space, uint64 end_gpu_ns) override {
    LOG(INFO) << " GpuTracer has collected " << num_callback_events_
              << " callback api events and " << num_activity_events_
              << " activity events. " << ReportDroppedEvents();
    size_t num_events = 0;
    XPlaneBuilder host_plane(
        FindOrAddMutablePlaneWithName(space, kCuptiDriverApiPlaneName));
    for (int device_ordinal = 0; device_ordinal < num_gpus_; ++device_ordinal) {
      std::string name = GpuPlaneName(device_ordinal);
      XPlaneBuilder device_plane(FindOrAddMutablePlaneWithName(space, name));
      device_plane.SetId(device_ordinal);
      VLOG(4) << "Creating plane for"
              << " name=" << name << " ordinal=" << device_ordinal;

      // Calculate device capabilities before flushing, so that device
      // properties are available to the occupancy calculator in Flush().
      per_device_collector_[device_ordinal].GetDeviceCapabilities(
          device_ordinal, &device_plane);
      num_events += per_device_collector_[device_ordinal].Flush(
          start_gpu_ns_, end_gpu_ns, &device_plane, &host_plane);
      NormalizeTimeStamps(&device_plane, start_walltime_ns_);
    }
    NormalizeTimeStamps(&host_plane, start_walltime_ns_);
    return num_events > 0;
  }

  // Formats the dropped-event tallies as a human-readable sentence
  // (empty string when nothing was dropped).
  std::string ReportDroppedEvents() {
    absl::MutexLock lock(&mutex_);
    string result;
    for (const auto& dropped : dropped_events_) {
      absl::StrAppend(&result, " ", dropped.second, " events dropped because ",
                      dropped.first, ";");
    }
    // Replace the trailing ';' with a '.'.
    if (!result.empty()) result.back() = '.';
    return result;
  }
  // Returns a host-tagged summary of collected/dropped event counts, or an
  // empty string when no events were dropped.
  std::string ReportNumEventsIfDropped() override {
    std::string events_dropped = ReportDroppedEvents();
    if (events_dropped.empty()) return "";
    return absl::StrCat("Detected GPU events dropped on ", port::Hostname(),
                        ": Profiler has collected ",
                        num_callback_events_.load(), " driver events and ",
                        num_activity_events_.load(), " device events.",
                        events_dropped);
  }

 private:
  std::atomic<int> num_callback_events_;
  std::atomic<int> num_activity_events_;
  absl::Mutex mutex_;
  // Dropped-event counts keyed by human-readable reason.
  absl::flat_hash_map<std::string, uint64> dropped_events_
      ABSL_GUARDED_BY(mutex_);
  uint64 start_walltime_ns_;
  uint64 start_gpu_ns_;
  int num_gpus_;

  // Set the all XLines of specified XPlane to starting walltime.
  // Events time in both host and device planes are CUTPI timestamps.
  // We set initial CUPTI timestamp as start time for all lines to reflect
  // this fact. Eventually we change line start time to corresponding
  // start_walltime_ns to normalize with CPU wall time.
  static void NormalizeTimeStamps(XPlaneBuilder* plane,
                                  uint64 start_walltime_ns) {
    plane->ForEachLine(
        [&](XLineBuilder line) { line.SetTimestampNs(start_walltime_ns); });
  }

  // One collector per GPU, indexed by device ordinal.
  absl::FixedArray<PerDeviceCollector> per_device_collector_;

  TF_DISALLOW_COPY_AND_ASSIGN(CuptiTraceCollectorImpl);
};
592 
CreateCuptiCollector(const CuptiTracerCollectorOptions & options,const uint64 start_walltime_ns,const uint64 start_gputime_ns)593 std::unique_ptr<CuptiTraceCollector> CreateCuptiCollector(
594     const CuptiTracerCollectorOptions& options, const uint64 start_walltime_ns,
595     const uint64 start_gputime_ns) {
596   return absl::make_unique<CuptiTraceCollectorImpl>(options, start_walltime_ns,
597                                                     start_gputime_ns);
598 }
599 
600 // The strings are parser friendly and have no whitespaces in them.
GetMemoryKindName(int8_t memory_kind)601 absl::string_view GetMemoryKindName(int8_t memory_kind) {
602   switch (memory_kind) {
603     case CUPTI_ACTIVITY_MEMORY_KIND_ARRAY:
604       return "array";
605     case CUPTI_ACTIVITY_MEMORY_KIND_DEVICE:
606       return "device";
607     case CUPTI_ACTIVITY_MEMORY_KIND_DEVICE_STATIC:
608       return "device_static";
609     case CUPTI_ACTIVITY_MEMORY_KIND_MANAGED:
610       return "managed";
611     case CUPTI_ACTIVITY_MEMORY_KIND_MANAGED_STATIC:
612       return "managed_static";
613     case CUPTI_ACTIVITY_MEMORY_KIND_PAGEABLE:
614       return "pageable";
615     case CUPTI_ACTIVITY_MEMORY_KIND_PINNED:
616       return "pinned";
617     case CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN:
618     default:
619       return "unknown";
620   }
621 }
622 
623 }  // namespace profiler
624 }  // namespace tensorflow
625