• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
15 
16 #if GOOGLE_CUDA
17 
18 #include <stdlib.h>
19 
20 #include <memory>
21 
22 #include "absl/container/fixed_array.h"
23 #include "absl/container/flat_hash_map.h"
24 #include "absl/container/flat_hash_set.h"
25 #include "absl/strings/str_cat.h"
26 #include "absl/strings/str_format.h"
27 #include "absl/strings/str_join.h"
28 #include "tensorflow/core/common_runtime/step_stats_collector.h"
29 #include "tensorflow/core/lib/core/errors.h"
30 #include "tensorflow/core/platform/abi.h"
31 #include "tensorflow/core/platform/macros.h"
32 #include "tensorflow/core/profiler/internal/annotation_stack.h"
33 #include "tensorflow/core/profiler/internal/gpu/cupti_tracer.h"
34 #include "tensorflow/core/profiler/internal/gpu/cupti_wrapper.h"
35 #include "tensorflow/core/profiler/internal/parse_annotation.h"
36 #include "tensorflow/core/profiler/internal/profiler_factory.h"
37 #include "tensorflow/core/profiler/internal/profiler_interface.h"
38 #include "tensorflow/core/profiler/utils/xplane_builder.h"
39 #include "tensorflow/core/profiler/utils/xplane_schema.h"
40 #include "tensorflow/core/profiler/utils/xplane_utils.h"
41 #include "tensorflow/core/util/env_var.h"
42 
43 namespace tensorflow {
44 namespace profiler {
45 
46 namespace {
47 
IsHostEvent(const CuptiTracerEvent & event)48 bool IsHostEvent(const CuptiTracerEvent& event) {
49   // DriverCallback(i.e. kernel launching) events are host events.
50   if (event.source == CuptiTracerEventSource::DriverCallback) return true;
51   // Non-overhead activity events are device events.
52   if (event.type != CuptiTracerEventType::Overhead) return false;
53   // Overhead events can be associated with a thread or a stream, etc.
54   // If a valid thread id is specified, we consider it as a host event.
55   return event.thread_id != CuptiTracerEvent::kInvalidThreadId;
56 }
57 
CreateXEvent(const CuptiTracerEvent & event,uint64 offset_ns,XPlaneBuilder * plane,XLineBuilder * line)58 void CreateXEvent(const CuptiTracerEvent& event, uint64 offset_ns,
59                   XPlaneBuilder* plane, XLineBuilder* line) {
60   std::string kernel_name = port::MaybeAbiDemangle(event.name.c_str());
61   XEventMetadata* event_metadata = plane->GetOrCreateEventMetadata(kernel_name);
62   XEventBuilder xevent = line->AddEvent(*event_metadata);
63   xevent.SetTimestampNs(event.start_time_ns + offset_ns);
64   xevent.SetEndTimestampNs(event.end_time_ns + offset_ns);
65   if (event.correlation_id != CuptiTracerEvent::kInvalidCorrelationId) {
66     xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
67                             GetStatTypeStr(StatType::kCorrelationId)),
68                         event.correlation_id);
69   }
70   if (!event.annotation.empty()) {
71     xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
72                             GetStatTypeStr(StatType::kKernelAnnotation)),
73                         event.annotation);
74   }
75   if (event.context_id != CuptiTracerEvent::kInvalidContextId) {
76     xevent.AddStatValue(
77         *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kContextId)),
78         absl::StrCat("$$", static_cast<uint64>(event.context_id)));
79   }
80   if (event.type == CuptiTracerEventType::Kernel) {
81     const std::string kernel_details =
82         absl::StrFormat("regs:%u shm:%u grid:%u,%u,%u block:%u,%u,%u",
83                         event.kernel_info.registers_per_thread,
84                         event.kernel_info.static_shared_memory_usage,
85                         event.kernel_info.grid_x, event.kernel_info.grid_y,
86                         event.kernel_info.grid_z, event.kernel_info.block_x,
87                         event.kernel_info.block_y, event.kernel_info.block_z);
88     xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
89                             GetStatTypeStr(StatType::kKernelDetails)),
90                         kernel_details);
91   }
92   if (event.type == CuptiTracerEventType::MemcpyH2D ||
93       event.type == CuptiTracerEventType::MemcpyD2H ||
94       event.type == CuptiTracerEventType::MemcpyD2D ||
95       event.type == CuptiTracerEventType::MemcpyP2P ||
96       event.type == CuptiTracerEventType::MemcpyOther) {
97     const auto& memcpy_info = event.memcpy_info;
98     std::string memcpy_details =
99         absl::StrFormat("size:%u dest:%u async:%u", memcpy_info.num_bytes,
100                         memcpy_info.destination, memcpy_info.async);
101     xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
102                             GetStatTypeStr(StatType::kMemcpyDetails)),
103                         memcpy_details);
104   }
105   if (event.type == CuptiTracerEventType::MemoryAlloc) {
106     std::string memalloc_details =
107         absl::StrFormat("num_bytes:%u", event.memalloc_info.num_bytes);
108     xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
109                             GetStatTypeStr(StatType::kMemallocDetails)),
110                         memalloc_details);
111   }
112 
113   std::vector<Annotation> annotation_stack =
114       ParseAnnotationStack(event.annotation);
115   // If multiple metadata have the same key name, show the values from the top
116   // of the stack (innermost annotation). Concatenate the values from "hlo_op".
117   absl::flat_hash_set<absl::string_view> key_set;
118   std::vector<absl::string_view> hlo_op_names;
119   for (auto annotation = annotation_stack.rbegin();
120        annotation != annotation_stack.rend(); ++annotation) {
121     for (const Annotation::Metadata& metadata : annotation->metadata) {
122       if (metadata.key == "tf_op") {
123         continue;  // ignored, obtained from HLO proto via DebugInfoMap
124       } else if (key_set.insert(metadata.key).second) {
125         xevent.ParseAndAddStatValue(
126             *plane->GetOrCreateStatMetadata(metadata.key), metadata.value);
127       }
128     }
129   }
130 }
131 
GetDeviceAttribute(CUdevice device,CUdevice_attribute attrib)132 absl::optional<int> GetDeviceAttribute(CUdevice device,
133                                        CUdevice_attribute attrib) {
134   int ret_val;
135   CUresult err = cuDeviceGetAttribute(&ret_val, attrib, device);
136   if (err != CUDA_SUCCESS) return absl::nullopt;
137   return ret_val;
138 }
139 
GetDeviceXLineName(int64 stream_id,absl::flat_hash_set<CuptiTracerEventType> & event_types)140 std::string GetDeviceXLineName(
141     int64 stream_id, absl::flat_hash_set<CuptiTracerEventType>& event_types) {
142   std::string line_name = absl::StrCat("Stream #", stream_id);
143   event_types.erase(CuptiTracerEventType::Unsupported);
144   if (event_types.empty()) return line_name;
145   std::vector<const char*> type_names;
146   for (const auto event_type : event_types) {
147     type_names.emplace_back(GetTraceEventTypeName(event_type));
148   }
149   return absl::StrCat(line_name, "(", absl::StrJoin(type_names, ","), ")");
150 }
151 
152 }  // namespace
153 
154 // CuptiTraceCollectorImpl store the CuptiTracerEvents from CuptiTracer and
155 // eventually convert and filter them to StepStats or XSpace.
156 class CuptiTraceCollectorImpl : public CuptiTraceCollector {
157  public:
CuptiTraceCollectorImpl(const CuptiTracerCollectorOptions & option,uint64 start_walltime_ns,uint64 start_gpu_ns)158   CuptiTraceCollectorImpl(const CuptiTracerCollectorOptions& option,
159                           uint64 start_walltime_ns, uint64 start_gpu_ns)
160       : CuptiTraceCollector(option),
161         num_callback_events_(0),
162         num_activity_events_(0),
163         start_walltime_ns_(start_walltime_ns),
164         start_gpu_ns_(start_gpu_ns),
165         num_gpus_(option.num_gpus),
166         per_device_collector_(option.num_gpus) {}
167 
AddEvent(CuptiTracerEvent && event)168   void AddEvent(CuptiTracerEvent&& event) override {
169     if (event.device_id >= num_gpus_) return;
170     if (event.source == CuptiTracerEventSource::DriverCallback) {
171       if (num_callback_events_ > options_.max_callback_api_events) {
172         OnEventsDropped("trace collector", 1);
173         return;
174       }
175       num_callback_events_++;
176     } else {
177       if (num_activity_events_ > options_.max_activity_api_events) {
178         OnEventsDropped("trace collector", 1);
179         return;
180       }
181       num_activity_events_++;
182     }
183     per_device_collector_[event.device_id].AddEvent(std::move(event));
184   }
OnEventsDropped(const std::string & reason,uint32 num_events)185   void OnEventsDropped(const std::string& reason, uint32 num_events) override {}
Flush()186   void Flush() override {}
Export(StepStatsCollector * trace_collector)187   void Export(StepStatsCollector* trace_collector) {
188     LOG(INFO) << " GpuTracer has collected " << num_callback_events_
189               << " callback api events and " << num_activity_events_
190               << " activity events.";
191     for (int i = 0; i < num_gpus_; ++i) {
192       per_device_collector_[i].Flush(i, start_walltime_ns_, start_gpu_ns_,
193                                      trace_collector);
194     }
195   }
Export(XSpace * space)196   void Export(XSpace* space) {
197     LOG(INFO) << " GpuTracer has collected " << num_callback_events_
198               << " callback api events and " << num_activity_events_
199               << " activity events.";
200     XPlaneBuilder host_plane(GetOrCreatePlane(space, kHostThreads));
201     host_plane.SetId(kHostPlaneId);
202     for (int device_ordinal = 0; device_ordinal < num_gpus_; ++device_ordinal) {
203       std::string name = absl::StrCat(kGpuPlanePrefix, device_ordinal);
204       XPlaneBuilder device_plane(GetOrCreatePlane(space, name));
205       device_plane.SetId(kGpuPlaneBaseId + device_ordinal);
206       per_device_collector_[device_ordinal].Flush(
207           start_walltime_ns_, start_gpu_ns_, &device_plane, &host_plane);
208       per_device_collector_[device_ordinal].GetDeviceCapabilities(
209           device_ordinal, &device_plane);
210     }
211   }
212 
213  private:
214   std::atomic<int> num_callback_events_;
215   std::atomic<int> num_activity_events_;
216   uint64 start_walltime_ns_;
217   uint64 start_gpu_ns_;
218   int num_gpus_;
219 
220   struct CorrelationInfo {
CorrelationInfotensorflow::profiler::CuptiTraceCollectorImpl::CorrelationInfo221     CorrelationInfo(uint32 t, uint32 e) : thread_id(t), enqueue_time_ns(e) {}
222     uint32 thread_id;
223     uint64 enqueue_time_ns;
224   };
225   struct PerDeviceCollector {
AddEventtensorflow::profiler::CuptiTraceCollectorImpl::PerDeviceCollector226     void AddEvent(CuptiTracerEvent&& event) {
227       absl::MutexLock lock(&mutex);
228       if (event.source == CuptiTracerEventSource::DriverCallback) {
229         // Cupti api callback events were used to populate launch times etc.
230         if (event.correlation_id != CuptiTracerEvent::kInvalidCorrelationId) {
231           correlation_info.insert(
232               {event.correlation_id,
233                CorrelationInfo(event.thread_id, event.start_time_ns)});
234         }
235         events.emplace_back(std::move(event));
236       } else {
237         // Cupti activity events measure device times etc.
238         events.emplace_back(std::move(event));
239       }
240     }
241 
Flushtensorflow::profiler::CuptiTraceCollectorImpl::PerDeviceCollector242     void Flush(int32 device_ordinal, uint64 start_walltime_ns,
243                uint64 start_gpu_ns, StepStatsCollector* collector) {
244       absl::MutexLock lock(&mutex);
245       stream_device = absl::StrCat("/device:GPU:", device_ordinal, "/stream:");
246       memcpy_device = absl::StrCat("/device:GPU:", device_ordinal, "/memcpy");
247       sync_device = absl::StrCat("/device:GPU:", device_ordinal, "/sync");
248       for (auto& event : events) {
249         NodeExecStats* ns = new NodeExecStats;
250         ns->set_all_start_micros(
251             (start_walltime_ns + (event.start_time_ns - start_gpu_ns)) / 1000);
252         ns->set_op_start_rel_micros(0);
253         auto elapsed_ns = event.end_time_ns - event.start_time_ns;
254         ns->set_op_end_rel_micros(elapsed_ns / 1000);
255         ns->set_all_end_rel_micros(elapsed_ns / 1000);
256 
257         if (event.source == CuptiTracerEventSource::DriverCallback) {
258           // Legacy code ignore all other launch events except
259           // cuStreamSynchronize.
260           if (event.name == "cuStreamSynchronize") {
261             ns->set_node_name(event.name);
262             ns->set_timeline_label(absl::StrCat("ThreadId ", event.thread_id));
263             ns->set_thread_id(event.thread_id);
264             collector->Save(sync_device, ns);
265           }
266         } else {  // CuptiTracerEventSource::Activity
267           // Get launch information if available.
268           if (event.correlation_id != CuptiTracerEvent::kInvalidCorrelationId) {
269             auto it = correlation_info.find(event.correlation_id);
270             if (it != correlation_info.end()) {
271               ns->set_scheduled_micros(it->second.enqueue_time_ns / 1000);
272               ns->set_thread_id(it->second.thread_id);
273             }
274           }
275 
276           auto annotation_stack = ParseAnnotationStack(event.annotation);
277           std::string kernel_name = port::MaybeAbiDemangle(event.name.c_str());
278           std::string activity_name =
279               !annotation_stack.empty()
280                   ? std::string(annotation_stack.back().name)
281                   : kernel_name;
282           ns->set_node_name(activity_name);
283           switch (event.type) {
284             case CuptiTracerEventType::Kernel: {
285               const std::string details = absl::StrFormat(
286                   "regs:%u shm:%u grid:%u,%u,%u block:%u,%u,%u",
287                   event.kernel_info.registers_per_thread,
288                   event.kernel_info.static_shared_memory_usage,
289                   event.kernel_info.grid_x, event.kernel_info.grid_y,
290                   event.kernel_info.grid_z, event.kernel_info.block_x,
291                   event.kernel_info.block_y, event.kernel_info.block_z);
292               ns->set_timeline_label(absl::StrCat(kernel_name, " ", details,
293                                                   "@@", event.annotation));
294               auto nscopy = new NodeExecStats(*ns);
295               collector->Save(absl::StrCat(stream_device, "all"), ns);
296               collector->Save(absl::StrCat(stream_device, event.stream_id),
297                               nscopy);
298               break;
299             }
300             case CuptiTracerEventType::MemcpyH2D:
301             case CuptiTracerEventType::MemcpyD2H:
302             case CuptiTracerEventType::MemcpyD2D:
303             case CuptiTracerEventType::MemcpyP2P: {
304               std::string details = absl::StrCat(
305                   activity_name, " bytes:", event.memcpy_info.num_bytes);
306               if (event.memcpy_info.async) {
307                 absl::StrAppend(&details, " aync");
308               }
309               if (event.memcpy_info.destination != event.device_id) {
310                 absl::StrAppend(&details,
311                                 " to device:", event.memcpy_info.destination);
312               }
313               ns->set_timeline_label(std::move(details));
314               auto nscopy = new NodeExecStats(*ns);
315               collector->Save(memcpy_device, ns);
316               collector->Save(
317                   absl::StrCat(stream_device, event.stream_id, "<",
318                                GetTraceEventTypeName(event.type), ">"),
319                   nscopy);
320               break;
321             }
322             default:
323               ns->set_timeline_label(activity_name);
324               collector->Save(stream_device, ns);
325           }
326         }
327       }
328       events.clear();
329     }
330 
Flushtensorflow::profiler::CuptiTraceCollectorImpl::PerDeviceCollector331     void Flush(uint64 start_walltime_ns, uint64 start_gpu_ns,
332                XPlaneBuilder* device_plane, XPlaneBuilder* host_plane) {
333       absl::MutexLock lock(&mutex);
334 
335       // Tracking event types per line.
336       absl::flat_hash_map<int64, absl::flat_hash_set<CuptiTracerEventType>>
337           events_types_per_line;
338       const uint64 offset_ns = start_walltime_ns - start_gpu_ns;
339       for (auto& event : events) {
340         bool is_host_event = IsHostEvent(event);
341         int64 line_id = is_host_event ? static_cast<int64>(event.thread_id)
342                                       : event.stream_id;
343         if (line_id == CuptiTracerEvent::kInvalidThreadId ||
344             line_id == CuptiTracerEvent::kInvalidStreamId)
345           continue;
346         auto* plane = is_host_event ? host_plane : device_plane;
347         XLineBuilder line = plane->GetOrCreateLine(line_id);
348         if (!is_host_event) line.SetTimestampNs(start_gpu_ns);
349         CreateXEvent(event, offset_ns, plane, &line);
350         events_types_per_line[line_id].emplace(event.type);
351       }
352       device_plane->ForEachLine([&](tensorflow::profiler::XLineBuilder line) {
353         line.SetName(
354             GetDeviceXLineName(line.Id(), events_types_per_line[line.Id()]));
355       });
356       events.clear();
357     }
358 
GetDeviceCapabilitiestensorflow::profiler::CuptiTraceCollectorImpl::PerDeviceCollector359     void GetDeviceCapabilities(int32 device_ordinal,
360                                XPlaneBuilder* device_plane) {
361       CUdevice device;
362       if (cuDeviceGet(&device, device_ordinal) != CUDA_SUCCESS) return;
363 
364       auto clock_rate_in_khz =
365           GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_CLOCK_RATE);
366       if (clock_rate_in_khz) {
367         device_plane->AddStatValue(
368             *device_plane->GetOrCreateStatMetadata(
369                 GetStatTypeStr(StatType::kDevCapClockRateKHz)),
370             *clock_rate_in_khz);
371       }
372 
373       auto core_count =
374           GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
375       if (core_count) {
376         device_plane->AddStatValue(
377             *device_plane->GetOrCreateStatMetadata(
378                 GetStatTypeStr(StatType::kDevCapCoreCount)),
379             *core_count);
380       }
381 
382       auto mem_clock_khz =
383           GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE);
384       auto mem_bus_width_bits = GetDeviceAttribute(
385           device, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH);
386       if (mem_clock_khz && mem_bus_width_bits) {
387         // Times 2 because HBM is DDR memory; it gets two data bits per each
388         // data lane.
389         auto memory_bandwidth =
390             2ULL * (*mem_clock_khz) * 1000 * (*mem_bus_width_bits) / 8;
391         device_plane->AddStatValue(
392             *device_plane->GetOrCreateStatMetadata(
393                 GetStatTypeStr(StatType::kDevCapMemoryBandwidth)),
394             memory_bandwidth);
395       }
396 
397       size_t total_memory = 0;
398       if (cuDeviceTotalMem(&total_memory, device) == CUDA_SUCCESS) {
399         device_plane->AddStatValue(
400             *device_plane->GetOrCreateStatMetadata(
401                 GetStatTypeStr(StatType::kDevCapMemorySize)),
402             static_cast<uint64>(total_memory));
403       }
404 
405       auto compute_capability_major = GetDeviceAttribute(
406           device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
407       if (compute_capability_major) {
408         device_plane->AddStatValue(
409             *device_plane->GetOrCreateStatMetadata(
410                 GetStatTypeStr(StatType::kDevCapComputeCapMajor)),
411             *compute_capability_major);
412       }
413       auto compute_capability_minor = GetDeviceAttribute(
414           device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
415       if (compute_capability_minor) {
416         device_plane->AddStatValue(
417             *device_plane->GetOrCreateStatMetadata(
418                 GetStatTypeStr(StatType::kDevCapComputeCapMinor)),
419             *compute_capability_minor);
420       }
421     }
422 
423     absl::Mutex mutex;
424     std::string stream_device GUARDED_BY(mutex);
425     std::string memcpy_device GUARDED_BY(mutex);
426     std::string sync_device GUARDED_BY(mutex);
427     std::vector<CuptiTracerEvent> events GUARDED_BY(mutex);
428     absl::flat_hash_map<uint32, CorrelationInfo> correlation_info
429         GUARDED_BY(mutex);
430   };
431   absl::FixedArray<PerDeviceCollector> per_device_collector_;
432 
433   TF_DISALLOW_COPY_AND_ASSIGN(CuptiTraceCollectorImpl);
434 };
435 
436 // GpuTracer for GPU.
437 class GpuTracer : public profiler::ProfilerInterface {
438  public:
GpuTracer(CuptiTracer * cupti_tracer,CuptiInterface * cupti_interface)439   GpuTracer(CuptiTracer* cupti_tracer, CuptiInterface* cupti_interface)
440       : cupti_tracer_(cupti_tracer) {
441     VLOG(1) << "GpuTracer created.";
442   }
~GpuTracer()443   ~GpuTracer() override {}
444 
445   // GpuTracer interface:
446   Status Start() override;
447   Status Stop() override;
448   Status CollectData(RunMetadata* run_metadata) override;
449   Status CollectData(XSpace* space) override;
GetDeviceType()450   profiler::DeviceType GetDeviceType() override {
451     return profiler::DeviceType::kGpu;
452   }
453 
454  private:
455   Status DoStart();
456   Status DoStop();
457 
458   enum State {
459     kNotStarted,
460     kStartedOk,
461     kStartedError,
462     kStoppedOk,
463     kStoppedError
464   };
465   State profiling_state_ = State::kNotStarted;
466 
467   CuptiTracer* cupti_tracer_;
468   CuptiTracerOptions options_;
469   StepStats step_stats_;
470   std::unique_ptr<CuptiTraceCollectorImpl> cupti_collector_;
471 };
472 
DoStart()473 Status GpuTracer::DoStart() {
474   if (!cupti_tracer_->IsAvailable()) {
475     return errors::Unavailable("Another profile session running.");
476   }
477 
478   options_.cbids_selected = {
479       // KERNEL
480       CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel,
481       // MEMCPY
482       CUPTI_DRIVER_TRACE_CBID_cuMemcpy,
483       CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync,
484       CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2,
485       CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2,
486       CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2,
487       CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2,
488       CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2,
489       CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2,
490       CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2,
491       CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2,
492       CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2,
493       CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2,
494       CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2,
495       CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2,
496       CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2,
497       CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2,
498       CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2,
499       CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2,
500       CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2,
501       CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2,
502       // GENERIC
503       CUPTI_DRIVER_TRACE_CBID_cuStreamSynchronize,
504   };
505 
506   bool use_cupti_activity_api = true;
507   ReadBoolFromEnvVar("TF_GPU_CUPTI_USE_ACTIVITY_API", true,
508                      &use_cupti_activity_api)
509       .IgnoreError();
510   options_.enable_event_based_activity = !use_cupti_activity_api;
511 
512   bool trace_concurrent_kernels = false;
513   ReadBoolFromEnvVar("TF_GPU_CUPTI_FORCE_CONCURRENT_KERNEL", false,
514                      &trace_concurrent_kernels)
515       .IgnoreError();
516   options_.activities_selected.push_back(
517       trace_concurrent_kernels ? CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL
518                                : CUPTI_ACTIVITY_KIND_KERNEL);
519   options_.activities_selected.push_back(CUPTI_ACTIVITY_KIND_MEMCPY);
520   options_.activities_selected.push_back(CUPTI_ACTIVITY_KIND_MEMCPY2);
521   options_.activities_selected.push_back(CUPTI_ACTIVITY_KIND_OVERHEAD);
522 
523 #if CUDA_VERSION < 10000
524   if (!trace_concurrent_kernels) options_.cupti_finalize = true;
525 #endif
526 
527   CuptiTracerCollectorOptions collector_options;
528   collector_options.num_gpus = cupti_tracer_->NumGpus();
529   uint64 start_gputime_ns = CuptiTracer::GetTimestamp();
530   uint64 start_walltime_ns = tensorflow::EnvTime::NowNanos();
531   cupti_collector_ = absl::make_unique<CuptiTraceCollectorImpl>(
532       collector_options, start_walltime_ns, start_gputime_ns);
533 
534   AnnotationStack::Enable(true);
535   cupti_tracer_->Enable(options_, cupti_collector_.get());
536   return Status::OK();
537 }
538 
Start()539 Status GpuTracer::Start() {
540   Status status = DoStart();
541   if (status.ok()) {
542     profiling_state_ = State::kStartedOk;
543     return Status::OK();
544   } else {
545     profiling_state_ = State::kStartedError;
546     return status;
547   }
548 }
549 
DoStop()550 Status GpuTracer::DoStop() {
551   cupti_tracer_->Disable();
552   AnnotationStack::Enable(false);
553   return Status::OK();
554 }
555 
Stop()556 Status GpuTracer::Stop() {
557   if (profiling_state_ == State::kStartedOk) {
558     Status status = DoStop();
559     profiling_state_ = status.ok() ? State::kStoppedOk : State::kStoppedError;
560   }
561   return Status::OK();
562 }
563 
CollectData(RunMetadata * run_metadata)564 Status GpuTracer::CollectData(RunMetadata* run_metadata) {
565   switch (profiling_state_) {
566     case State::kNotStarted:
567       VLOG(1) << "No trace data collected, session wasn't started";
568       return Status::OK();
569     case State::kStartedOk:
570       return errors::FailedPrecondition("Cannot collect trace before stopping");
571     case State::kStartedError:
572       LOG(ERROR) << "Cannot collect, xprof failed to start";
573       return Status::OK();
574     case State::kStoppedError:
575       VLOG(1) << "No trace data collected";
576       return Status::OK();
577     case State::kStoppedOk: {
578       // Input run_metadata is shared by profiler interfaces, we need append.
579       StepStatsCollector step_stats_collector(&step_stats_);
580       if (cupti_collector_) {
581         cupti_collector_->Export(&step_stats_collector);
582       }
583       step_stats_collector.Finalize();
584       for (auto& dev_stats : *step_stats_.mutable_dev_stats()) {
585         run_metadata->mutable_step_stats()->add_dev_stats()->Swap(&dev_stats);
586       }
587       return Status::OK();
588     }
589   }
590   return errors::Internal("Invalid profiling state: ", profiling_state_);
591 }
592 
CollectData(XSpace * space)593 Status GpuTracer::CollectData(XSpace* space) {
594   switch (profiling_state_) {
595     case State::kNotStarted:
596       VLOG(1) << "No trace data collected, session wasn't started";
597       return Status::OK();
598     case State::kStartedOk:
599       return errors::FailedPrecondition("Cannot collect trace before stopping");
600     case State::kStartedError:
601       LOG(ERROR) << "Cannot collect, xprof failed to start";
602       return Status::OK();
603     case State::kStoppedError:
604       VLOG(1) << "No trace data collected";
605       return Status::OK();
606     case State::kStoppedOk: {
607       if (cupti_collector_) {
608         cupti_collector_->Export(space);
609       }
610       return Status::OK();
611     }
612   }
613   return errors::Internal("Invalid profiling state: ", profiling_state_);
614 }
615 
616 // Not in anonymous namespace for testing purposes.
CreateGpuTracer(const profiler::ProfilerOptions & options)617 std::unique_ptr<profiler::ProfilerInterface> CreateGpuTracer(
618     const profiler::ProfilerOptions& options) {
619   if (options.device_type != profiler::DeviceType::kGpu &&
620       options.device_type != profiler::DeviceType::kUnspecified)
621     return nullptr;
622   profiler::CuptiTracer* cupti_tracer =
623       profiler::CuptiTracer::GetCuptiTracerSingleton();
624   if (!cupti_tracer->IsAvailable()) {
625     return nullptr;
626   }
627   profiler::CuptiInterface* cupti_interface = profiler::GetCuptiInterface();
628   return absl::make_unique<profiler::GpuTracer>(cupti_tracer, cupti_interface);
629 }
630 
__anon260cf8950302null631 auto register_gpu_tracer_factory = [] {
632   RegisterProfilerFactory(&CreateGpuTracer);
633   return 0;
634 }();
635 
636 }  // namespace profiler
637 }  // namespace tensorflow
638 
639 #endif  // GOOGLE_CUDA
640