1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #if GOOGLE_CUDA
17
18 #include <stdlib.h>
19
20 #include <memory>
21
22 #include "absl/container/fixed_array.h"
23 #include "absl/container/flat_hash_map.h"
24 #include "absl/container/flat_hash_set.h"
25 #include "absl/strings/str_cat.h"
26 #include "absl/strings/str_format.h"
27 #include "absl/strings/str_join.h"
28 #include "tensorflow/core/common_runtime/step_stats_collector.h"
29 #include "tensorflow/core/lib/core/errors.h"
30 #include "tensorflow/core/platform/abi.h"
31 #include "tensorflow/core/platform/macros.h"
32 #include "tensorflow/core/profiler/internal/annotation_stack.h"
33 #include "tensorflow/core/profiler/internal/gpu/cupti_tracer.h"
34 #include "tensorflow/core/profiler/internal/gpu/cupti_wrapper.h"
35 #include "tensorflow/core/profiler/internal/parse_annotation.h"
36 #include "tensorflow/core/profiler/internal/profiler_factory.h"
37 #include "tensorflow/core/profiler/internal/profiler_interface.h"
38 #include "tensorflow/core/profiler/utils/xplane_builder.h"
39 #include "tensorflow/core/profiler/utils/xplane_schema.h"
40 #include "tensorflow/core/profiler/utils/xplane_utils.h"
41 #include "tensorflow/core/util/env_var.h"
42
43 namespace tensorflow {
44 namespace profiler {
45
46 namespace {
47
IsHostEvent(const CuptiTracerEvent & event)48 bool IsHostEvent(const CuptiTracerEvent& event) {
49 // DriverCallback(i.e. kernel launching) events are host events.
50 if (event.source == CuptiTracerEventSource::DriverCallback) return true;
51 // Non-overhead activity events are device events.
52 if (event.type != CuptiTracerEventType::Overhead) return false;
53 // Overhead events can be associated with a thread or a stream, etc.
54 // If a valid thread id is specified, we consider it as a host event.
55 return event.thread_id != CuptiTracerEvent::kInvalidThreadId;
56 }
57
CreateXEvent(const CuptiTracerEvent & event,uint64 offset_ns,XPlaneBuilder * plane,XLineBuilder * line)58 void CreateXEvent(const CuptiTracerEvent& event, uint64 offset_ns,
59 XPlaneBuilder* plane, XLineBuilder* line) {
60 std::string kernel_name = port::MaybeAbiDemangle(event.name.c_str());
61 XEventMetadata* event_metadata = plane->GetOrCreateEventMetadata(kernel_name);
62 XEventBuilder xevent = line->AddEvent(*event_metadata);
63 xevent.SetTimestampNs(event.start_time_ns + offset_ns);
64 xevent.SetEndTimestampNs(event.end_time_ns + offset_ns);
65 if (event.correlation_id != CuptiTracerEvent::kInvalidCorrelationId) {
66 xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
67 GetStatTypeStr(StatType::kCorrelationId)),
68 event.correlation_id);
69 }
70 if (!event.annotation.empty()) {
71 xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
72 GetStatTypeStr(StatType::kKernelAnnotation)),
73 event.annotation);
74 }
75 if (event.context_id != CuptiTracerEvent::kInvalidContextId) {
76 xevent.AddStatValue(
77 *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kContextId)),
78 absl::StrCat("$$", static_cast<uint64>(event.context_id)));
79 }
80 if (event.type == CuptiTracerEventType::Kernel) {
81 const std::string kernel_details =
82 absl::StrFormat("regs:%u shm:%u grid:%u,%u,%u block:%u,%u,%u",
83 event.kernel_info.registers_per_thread,
84 event.kernel_info.static_shared_memory_usage,
85 event.kernel_info.grid_x, event.kernel_info.grid_y,
86 event.kernel_info.grid_z, event.kernel_info.block_x,
87 event.kernel_info.block_y, event.kernel_info.block_z);
88 xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
89 GetStatTypeStr(StatType::kKernelDetails)),
90 kernel_details);
91 }
92 if (event.type == CuptiTracerEventType::MemcpyH2D ||
93 event.type == CuptiTracerEventType::MemcpyD2H ||
94 event.type == CuptiTracerEventType::MemcpyD2D ||
95 event.type == CuptiTracerEventType::MemcpyP2P ||
96 event.type == CuptiTracerEventType::MemcpyOther) {
97 const auto& memcpy_info = event.memcpy_info;
98 std::string memcpy_details =
99 absl::StrFormat("size:%u dest:%u async:%u", memcpy_info.num_bytes,
100 memcpy_info.destination, memcpy_info.async);
101 xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
102 GetStatTypeStr(StatType::kMemcpyDetails)),
103 memcpy_details);
104 }
105 if (event.type == CuptiTracerEventType::MemoryAlloc) {
106 std::string memalloc_details =
107 absl::StrFormat("num_bytes:%u", event.memalloc_info.num_bytes);
108 xevent.AddStatValue(*plane->GetOrCreateStatMetadata(
109 GetStatTypeStr(StatType::kMemallocDetails)),
110 memalloc_details);
111 }
112
113 std::vector<Annotation> annotation_stack =
114 ParseAnnotationStack(event.annotation);
115 // If multiple metadata have the same key name, show the values from the top
116 // of the stack (innermost annotation). Concatenate the values from "hlo_op".
117 absl::flat_hash_set<absl::string_view> key_set;
118 std::vector<absl::string_view> hlo_op_names;
119 for (auto annotation = annotation_stack.rbegin();
120 annotation != annotation_stack.rend(); ++annotation) {
121 for (const Annotation::Metadata& metadata : annotation->metadata) {
122 if (metadata.key == "tf_op") {
123 continue; // ignored, obtained from HLO proto via DebugInfoMap
124 } else if (key_set.insert(metadata.key).second) {
125 xevent.ParseAndAddStatValue(
126 *plane->GetOrCreateStatMetadata(metadata.key), metadata.value);
127 }
128 }
129 }
130 }
131
GetDeviceAttribute(CUdevice device,CUdevice_attribute attrib)132 absl::optional<int> GetDeviceAttribute(CUdevice device,
133 CUdevice_attribute attrib) {
134 int ret_val;
135 CUresult err = cuDeviceGetAttribute(&ret_val, attrib, device);
136 if (err != CUDA_SUCCESS) return absl::nullopt;
137 return ret_val;
138 }
139
GetDeviceXLineName(int64 stream_id,absl::flat_hash_set<CuptiTracerEventType> & event_types)140 std::string GetDeviceXLineName(
141 int64 stream_id, absl::flat_hash_set<CuptiTracerEventType>& event_types) {
142 std::string line_name = absl::StrCat("Stream #", stream_id);
143 event_types.erase(CuptiTracerEventType::Unsupported);
144 if (event_types.empty()) return line_name;
145 std::vector<const char*> type_names;
146 for (const auto event_type : event_types) {
147 type_names.emplace_back(GetTraceEventTypeName(event_type));
148 }
149 return absl::StrCat(line_name, "(", absl::StrJoin(type_names, ","), ")");
150 }
151
152 } // namespace
153
154 // CuptiTraceCollectorImpl store the CuptiTracerEvents from CuptiTracer and
155 // eventually convert and filter them to StepStats or XSpace.
156 class CuptiTraceCollectorImpl : public CuptiTraceCollector {
157 public:
CuptiTraceCollectorImpl(const CuptiTracerCollectorOptions & option,uint64 start_walltime_ns,uint64 start_gpu_ns)158 CuptiTraceCollectorImpl(const CuptiTracerCollectorOptions& option,
159 uint64 start_walltime_ns, uint64 start_gpu_ns)
160 : CuptiTraceCollector(option),
161 num_callback_events_(0),
162 num_activity_events_(0),
163 start_walltime_ns_(start_walltime_ns),
164 start_gpu_ns_(start_gpu_ns),
165 num_gpus_(option.num_gpus),
166 per_device_collector_(option.num_gpus) {}
167
AddEvent(CuptiTracerEvent && event)168 void AddEvent(CuptiTracerEvent&& event) override {
169 if (event.device_id >= num_gpus_) return;
170 if (event.source == CuptiTracerEventSource::DriverCallback) {
171 if (num_callback_events_ > options_.max_callback_api_events) {
172 OnEventsDropped("trace collector", 1);
173 return;
174 }
175 num_callback_events_++;
176 } else {
177 if (num_activity_events_ > options_.max_activity_api_events) {
178 OnEventsDropped("trace collector", 1);
179 return;
180 }
181 num_activity_events_++;
182 }
183 per_device_collector_[event.device_id].AddEvent(std::move(event));
184 }
OnEventsDropped(const std::string & reason,uint32 num_events)185 void OnEventsDropped(const std::string& reason, uint32 num_events) override {}
Flush()186 void Flush() override {}
Export(StepStatsCollector * trace_collector)187 void Export(StepStatsCollector* trace_collector) {
188 LOG(INFO) << " GpuTracer has collected " << num_callback_events_
189 << " callback api events and " << num_activity_events_
190 << " activity events.";
191 for (int i = 0; i < num_gpus_; ++i) {
192 per_device_collector_[i].Flush(i, start_walltime_ns_, start_gpu_ns_,
193 trace_collector);
194 }
195 }
Export(XSpace * space)196 void Export(XSpace* space) {
197 LOG(INFO) << " GpuTracer has collected " << num_callback_events_
198 << " callback api events and " << num_activity_events_
199 << " activity events.";
200 XPlaneBuilder host_plane(GetOrCreatePlane(space, kHostThreads));
201 host_plane.SetId(kHostPlaneId);
202 for (int device_ordinal = 0; device_ordinal < num_gpus_; ++device_ordinal) {
203 std::string name = absl::StrCat(kGpuPlanePrefix, device_ordinal);
204 XPlaneBuilder device_plane(GetOrCreatePlane(space, name));
205 device_plane.SetId(kGpuPlaneBaseId + device_ordinal);
206 per_device_collector_[device_ordinal].Flush(
207 start_walltime_ns_, start_gpu_ns_, &device_plane, &host_plane);
208 per_device_collector_[device_ordinal].GetDeviceCapabilities(
209 device_ordinal, &device_plane);
210 }
211 }
212
213 private:
214 std::atomic<int> num_callback_events_;
215 std::atomic<int> num_activity_events_;
216 uint64 start_walltime_ns_;
217 uint64 start_gpu_ns_;
218 int num_gpus_;
219
220 struct CorrelationInfo {
CorrelationInfotensorflow::profiler::CuptiTraceCollectorImpl::CorrelationInfo221 CorrelationInfo(uint32 t, uint32 e) : thread_id(t), enqueue_time_ns(e) {}
222 uint32 thread_id;
223 uint64 enqueue_time_ns;
224 };
225 struct PerDeviceCollector {
AddEventtensorflow::profiler::CuptiTraceCollectorImpl::PerDeviceCollector226 void AddEvent(CuptiTracerEvent&& event) {
227 absl::MutexLock lock(&mutex);
228 if (event.source == CuptiTracerEventSource::DriverCallback) {
229 // Cupti api callback events were used to populate launch times etc.
230 if (event.correlation_id != CuptiTracerEvent::kInvalidCorrelationId) {
231 correlation_info.insert(
232 {event.correlation_id,
233 CorrelationInfo(event.thread_id, event.start_time_ns)});
234 }
235 events.emplace_back(std::move(event));
236 } else {
237 // Cupti activity events measure device times etc.
238 events.emplace_back(std::move(event));
239 }
240 }
241
Flushtensorflow::profiler::CuptiTraceCollectorImpl::PerDeviceCollector242 void Flush(int32 device_ordinal, uint64 start_walltime_ns,
243 uint64 start_gpu_ns, StepStatsCollector* collector) {
244 absl::MutexLock lock(&mutex);
245 stream_device = absl::StrCat("/device:GPU:", device_ordinal, "/stream:");
246 memcpy_device = absl::StrCat("/device:GPU:", device_ordinal, "/memcpy");
247 sync_device = absl::StrCat("/device:GPU:", device_ordinal, "/sync");
248 for (auto& event : events) {
249 NodeExecStats* ns = new NodeExecStats;
250 ns->set_all_start_micros(
251 (start_walltime_ns + (event.start_time_ns - start_gpu_ns)) / 1000);
252 ns->set_op_start_rel_micros(0);
253 auto elapsed_ns = event.end_time_ns - event.start_time_ns;
254 ns->set_op_end_rel_micros(elapsed_ns / 1000);
255 ns->set_all_end_rel_micros(elapsed_ns / 1000);
256
257 if (event.source == CuptiTracerEventSource::DriverCallback) {
258 // Legacy code ignore all other launch events except
259 // cuStreamSynchronize.
260 if (event.name == "cuStreamSynchronize") {
261 ns->set_node_name(event.name);
262 ns->set_timeline_label(absl::StrCat("ThreadId ", event.thread_id));
263 ns->set_thread_id(event.thread_id);
264 collector->Save(sync_device, ns);
265 }
266 } else { // CuptiTracerEventSource::Activity
267 // Get launch information if available.
268 if (event.correlation_id != CuptiTracerEvent::kInvalidCorrelationId) {
269 auto it = correlation_info.find(event.correlation_id);
270 if (it != correlation_info.end()) {
271 ns->set_scheduled_micros(it->second.enqueue_time_ns / 1000);
272 ns->set_thread_id(it->second.thread_id);
273 }
274 }
275
276 auto annotation_stack = ParseAnnotationStack(event.annotation);
277 std::string kernel_name = port::MaybeAbiDemangle(event.name.c_str());
278 std::string activity_name =
279 !annotation_stack.empty()
280 ? std::string(annotation_stack.back().name)
281 : kernel_name;
282 ns->set_node_name(activity_name);
283 switch (event.type) {
284 case CuptiTracerEventType::Kernel: {
285 const std::string details = absl::StrFormat(
286 "regs:%u shm:%u grid:%u,%u,%u block:%u,%u,%u",
287 event.kernel_info.registers_per_thread,
288 event.kernel_info.static_shared_memory_usage,
289 event.kernel_info.grid_x, event.kernel_info.grid_y,
290 event.kernel_info.grid_z, event.kernel_info.block_x,
291 event.kernel_info.block_y, event.kernel_info.block_z);
292 ns->set_timeline_label(absl::StrCat(kernel_name, " ", details,
293 "@@", event.annotation));
294 auto nscopy = new NodeExecStats(*ns);
295 collector->Save(absl::StrCat(stream_device, "all"), ns);
296 collector->Save(absl::StrCat(stream_device, event.stream_id),
297 nscopy);
298 break;
299 }
300 case CuptiTracerEventType::MemcpyH2D:
301 case CuptiTracerEventType::MemcpyD2H:
302 case CuptiTracerEventType::MemcpyD2D:
303 case CuptiTracerEventType::MemcpyP2P: {
304 std::string details = absl::StrCat(
305 activity_name, " bytes:", event.memcpy_info.num_bytes);
306 if (event.memcpy_info.async) {
307 absl::StrAppend(&details, " aync");
308 }
309 if (event.memcpy_info.destination != event.device_id) {
310 absl::StrAppend(&details,
311 " to device:", event.memcpy_info.destination);
312 }
313 ns->set_timeline_label(std::move(details));
314 auto nscopy = new NodeExecStats(*ns);
315 collector->Save(memcpy_device, ns);
316 collector->Save(
317 absl::StrCat(stream_device, event.stream_id, "<",
318 GetTraceEventTypeName(event.type), ">"),
319 nscopy);
320 break;
321 }
322 default:
323 ns->set_timeline_label(activity_name);
324 collector->Save(stream_device, ns);
325 }
326 }
327 }
328 events.clear();
329 }
330
Flushtensorflow::profiler::CuptiTraceCollectorImpl::PerDeviceCollector331 void Flush(uint64 start_walltime_ns, uint64 start_gpu_ns,
332 XPlaneBuilder* device_plane, XPlaneBuilder* host_plane) {
333 absl::MutexLock lock(&mutex);
334
335 // Tracking event types per line.
336 absl::flat_hash_map<int64, absl::flat_hash_set<CuptiTracerEventType>>
337 events_types_per_line;
338 const uint64 offset_ns = start_walltime_ns - start_gpu_ns;
339 for (auto& event : events) {
340 bool is_host_event = IsHostEvent(event);
341 int64 line_id = is_host_event ? static_cast<int64>(event.thread_id)
342 : event.stream_id;
343 if (line_id == CuptiTracerEvent::kInvalidThreadId ||
344 line_id == CuptiTracerEvent::kInvalidStreamId)
345 continue;
346 auto* plane = is_host_event ? host_plane : device_plane;
347 XLineBuilder line = plane->GetOrCreateLine(line_id);
348 if (!is_host_event) line.SetTimestampNs(start_gpu_ns);
349 CreateXEvent(event, offset_ns, plane, &line);
350 events_types_per_line[line_id].emplace(event.type);
351 }
352 device_plane->ForEachLine([&](tensorflow::profiler::XLineBuilder line) {
353 line.SetName(
354 GetDeviceXLineName(line.Id(), events_types_per_line[line.Id()]));
355 });
356 events.clear();
357 }
358
GetDeviceCapabilitiestensorflow::profiler::CuptiTraceCollectorImpl::PerDeviceCollector359 void GetDeviceCapabilities(int32 device_ordinal,
360 XPlaneBuilder* device_plane) {
361 CUdevice device;
362 if (cuDeviceGet(&device, device_ordinal) != CUDA_SUCCESS) return;
363
364 auto clock_rate_in_khz =
365 GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_CLOCK_RATE);
366 if (clock_rate_in_khz) {
367 device_plane->AddStatValue(
368 *device_plane->GetOrCreateStatMetadata(
369 GetStatTypeStr(StatType::kDevCapClockRateKHz)),
370 *clock_rate_in_khz);
371 }
372
373 auto core_count =
374 GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
375 if (core_count) {
376 device_plane->AddStatValue(
377 *device_plane->GetOrCreateStatMetadata(
378 GetStatTypeStr(StatType::kDevCapCoreCount)),
379 *core_count);
380 }
381
382 auto mem_clock_khz =
383 GetDeviceAttribute(device, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE);
384 auto mem_bus_width_bits = GetDeviceAttribute(
385 device, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH);
386 if (mem_clock_khz && mem_bus_width_bits) {
387 // Times 2 because HBM is DDR memory; it gets two data bits per each
388 // data lane.
389 auto memory_bandwidth =
390 2ULL * (*mem_clock_khz) * 1000 * (*mem_bus_width_bits) / 8;
391 device_plane->AddStatValue(
392 *device_plane->GetOrCreateStatMetadata(
393 GetStatTypeStr(StatType::kDevCapMemoryBandwidth)),
394 memory_bandwidth);
395 }
396
397 size_t total_memory = 0;
398 if (cuDeviceTotalMem(&total_memory, device) == CUDA_SUCCESS) {
399 device_plane->AddStatValue(
400 *device_plane->GetOrCreateStatMetadata(
401 GetStatTypeStr(StatType::kDevCapMemorySize)),
402 static_cast<uint64>(total_memory));
403 }
404
405 auto compute_capability_major = GetDeviceAttribute(
406 device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
407 if (compute_capability_major) {
408 device_plane->AddStatValue(
409 *device_plane->GetOrCreateStatMetadata(
410 GetStatTypeStr(StatType::kDevCapComputeCapMajor)),
411 *compute_capability_major);
412 }
413 auto compute_capability_minor = GetDeviceAttribute(
414 device, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
415 if (compute_capability_minor) {
416 device_plane->AddStatValue(
417 *device_plane->GetOrCreateStatMetadata(
418 GetStatTypeStr(StatType::kDevCapComputeCapMinor)),
419 *compute_capability_minor);
420 }
421 }
422
423 absl::Mutex mutex;
424 std::string stream_device GUARDED_BY(mutex);
425 std::string memcpy_device GUARDED_BY(mutex);
426 std::string sync_device GUARDED_BY(mutex);
427 std::vector<CuptiTracerEvent> events GUARDED_BY(mutex);
428 absl::flat_hash_map<uint32, CorrelationInfo> correlation_info
429 GUARDED_BY(mutex);
430 };
431 absl::FixedArray<PerDeviceCollector> per_device_collector_;
432
433 TF_DISALLOW_COPY_AND_ASSIGN(CuptiTraceCollectorImpl);
434 };
435
// GpuTracer for GPU.
//
// Drives a CuptiTracer through the ProfilerInterface lifecycle
// (Start/Stop/CollectData) and tracks the session state so that CollectData
// can report precisely why data may be unavailable.
class GpuTracer : public profiler::ProfilerInterface {
 public:
  // Does not take ownership of |cupti_tracer|.
  // NOTE(review): |cupti_interface| is accepted but never stored or used —
  // confirm whether it should be retained or removed from the signature.
  GpuTracer(CuptiTracer* cupti_tracer, CuptiInterface* cupti_interface)
      : cupti_tracer_(cupti_tracer) {
    VLOG(1) << "GpuTracer created.";
  }
  ~GpuTracer() override {}

  // GpuTracer interface:
  Status Start() override;
  Status Stop() override;
  Status CollectData(RunMetadata* run_metadata) override;
  Status CollectData(XSpace* space) override;
  profiler::DeviceType GetDeviceType() override {
    return profiler::DeviceType::kGpu;
  }

 private:
  // Configure CUPTI and begin/stop tracing; wrapped by Start()/Stop(), which
  // record the resulting state transition.
  Status DoStart();
  Status DoStop();

  // Session lifecycle; CollectData only yields data in kStoppedOk.
  enum State {
    kNotStarted,
    kStartedOk,
    kStartedError,
    kStoppedOk,
    kStoppedError
  };
  State profiling_state_ = State::kNotStarted;

  CuptiTracer* cupti_tracer_;        // Not owned.
  CuptiTracerOptions options_;       // Populated in DoStart().
  StepStats step_stats_;             // Staging area for the legacy export.
  std::unique_ptr<CuptiTraceCollectorImpl> cupti_collector_;
};
472
DoStart()473 Status GpuTracer::DoStart() {
474 if (!cupti_tracer_->IsAvailable()) {
475 return errors::Unavailable("Another profile session running.");
476 }
477
478 options_.cbids_selected = {
479 // KERNEL
480 CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel,
481 // MEMCPY
482 CUPTI_DRIVER_TRACE_CBID_cuMemcpy,
483 CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync,
484 CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2,
485 CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2,
486 CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2,
487 CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2,
488 CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2,
489 CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2,
490 CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2,
491 CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2,
492 CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2,
493 CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2,
494 CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2,
495 CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2,
496 CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2,
497 CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2,
498 CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2,
499 CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2,
500 CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2,
501 CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2,
502 // GENERIC
503 CUPTI_DRIVER_TRACE_CBID_cuStreamSynchronize,
504 };
505
506 bool use_cupti_activity_api = true;
507 ReadBoolFromEnvVar("TF_GPU_CUPTI_USE_ACTIVITY_API", true,
508 &use_cupti_activity_api)
509 .IgnoreError();
510 options_.enable_event_based_activity = !use_cupti_activity_api;
511
512 bool trace_concurrent_kernels = false;
513 ReadBoolFromEnvVar("TF_GPU_CUPTI_FORCE_CONCURRENT_KERNEL", false,
514 &trace_concurrent_kernels)
515 .IgnoreError();
516 options_.activities_selected.push_back(
517 trace_concurrent_kernels ? CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL
518 : CUPTI_ACTIVITY_KIND_KERNEL);
519 options_.activities_selected.push_back(CUPTI_ACTIVITY_KIND_MEMCPY);
520 options_.activities_selected.push_back(CUPTI_ACTIVITY_KIND_MEMCPY2);
521 options_.activities_selected.push_back(CUPTI_ACTIVITY_KIND_OVERHEAD);
522
523 #if CUDA_VERSION < 10000
524 if (!trace_concurrent_kernels) options_.cupti_finalize = true;
525 #endif
526
527 CuptiTracerCollectorOptions collector_options;
528 collector_options.num_gpus = cupti_tracer_->NumGpus();
529 uint64 start_gputime_ns = CuptiTracer::GetTimestamp();
530 uint64 start_walltime_ns = tensorflow::EnvTime::NowNanos();
531 cupti_collector_ = absl::make_unique<CuptiTraceCollectorImpl>(
532 collector_options, start_walltime_ns, start_gputime_ns);
533
534 AnnotationStack::Enable(true);
535 cupti_tracer_->Enable(options_, cupti_collector_.get());
536 return Status::OK();
537 }
538
Start()539 Status GpuTracer::Start() {
540 Status status = DoStart();
541 if (status.ok()) {
542 profiling_state_ = State::kStartedOk;
543 return Status::OK();
544 } else {
545 profiling_state_ = State::kStartedError;
546 return status;
547 }
548 }
549
// Stops tracing: detaches the CUPTI tracer, then disables collection of TF
// op annotations.  Always reports success.
Status GpuTracer::DoStop() {
  cupti_tracer_->Disable();
  AnnotationStack::Enable(false);
  return Status::OK();
}
555
Stop()556 Status GpuTracer::Stop() {
557 if (profiling_state_ == State::kStartedOk) {
558 Status status = DoStop();
559 profiling_state_ = status.ok() ? State::kStoppedOk : State::kStoppedError;
560 }
561 return Status::OK();
562 }
563
CollectData(RunMetadata * run_metadata)564 Status GpuTracer::CollectData(RunMetadata* run_metadata) {
565 switch (profiling_state_) {
566 case State::kNotStarted:
567 VLOG(1) << "No trace data collected, session wasn't started";
568 return Status::OK();
569 case State::kStartedOk:
570 return errors::FailedPrecondition("Cannot collect trace before stopping");
571 case State::kStartedError:
572 LOG(ERROR) << "Cannot collect, xprof failed to start";
573 return Status::OK();
574 case State::kStoppedError:
575 VLOG(1) << "No trace data collected";
576 return Status::OK();
577 case State::kStoppedOk: {
578 // Input run_metadata is shared by profiler interfaces, we need append.
579 StepStatsCollector step_stats_collector(&step_stats_);
580 if (cupti_collector_) {
581 cupti_collector_->Export(&step_stats_collector);
582 }
583 step_stats_collector.Finalize();
584 for (auto& dev_stats : *step_stats_.mutable_dev_stats()) {
585 run_metadata->mutable_step_stats()->add_dev_stats()->Swap(&dev_stats);
586 }
587 return Status::OK();
588 }
589 }
590 return errors::Internal("Invalid profiling state: ", profiling_state_);
591 }
592
CollectData(XSpace * space)593 Status GpuTracer::CollectData(XSpace* space) {
594 switch (profiling_state_) {
595 case State::kNotStarted:
596 VLOG(1) << "No trace data collected, session wasn't started";
597 return Status::OK();
598 case State::kStartedOk:
599 return errors::FailedPrecondition("Cannot collect trace before stopping");
600 case State::kStartedError:
601 LOG(ERROR) << "Cannot collect, xprof failed to start";
602 return Status::OK();
603 case State::kStoppedError:
604 VLOG(1) << "No trace data collected";
605 return Status::OK();
606 case State::kStoppedOk: {
607 if (cupti_collector_) {
608 cupti_collector_->Export(space);
609 }
610 return Status::OK();
611 }
612 }
613 return errors::Internal("Invalid profiling state: ", profiling_state_);
614 }
615
616 // Not in anonymous namespace for testing purposes.
CreateGpuTracer(const profiler::ProfilerOptions & options)617 std::unique_ptr<profiler::ProfilerInterface> CreateGpuTracer(
618 const profiler::ProfilerOptions& options) {
619 if (options.device_type != profiler::DeviceType::kGpu &&
620 options.device_type != profiler::DeviceType::kUnspecified)
621 return nullptr;
622 profiler::CuptiTracer* cupti_tracer =
623 profiler::CuptiTracer::GetCuptiTracerSingleton();
624 if (!cupti_tracer->IsAvailable()) {
625 return nullptr;
626 }
627 profiler::CuptiInterface* cupti_interface = profiler::GetCuptiInterface();
628 return absl::make_unique<profiler::GpuTracer>(cupti_tracer, cupti_interface);
629 }
630
// Registers CreateGpuTracer with the profiler framework at static
// initialization time; the immediately-invoked lambda exists only for this
// side effect (its int result is discarded).
auto register_gpu_tracer_factory = [] {
  RegisterProfilerFactory(&CreateGpuTracer);
  return 0;
}();
635
636 } // namespace profiler
637 } // namespace tensorflow
638
639 #endif // GOOGLE_CUDA
640