• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/profiler/backends/gpu/cupti_tracer.h"
17 
18 #include "absl/container/flat_hash_map.h"
19 #include "absl/container/flat_hash_set.h"
20 #include "absl/container/node_hash_map.h"
21 #include "absl/container/node_hash_set.h"
22 #include "third_party/gpus/cuda/extras/CUPTI/include/cupti_activity.h"
23 #include "third_party/gpus/cuda/extras/CUPTI/include/generated_nvtx_meta.h"
24 #include "tensorflow/core/lib/gtl/cleanup.h"
25 #include "tensorflow/core/platform/env.h"
26 #include "tensorflow/core/platform/errors.h"
27 #include "tensorflow/core/platform/host_info.h"
28 #include "tensorflow/core/platform/logging.h"
29 #include "tensorflow/core/platform/macros.h"
30 #include "tensorflow/core/profiler/backends/cpu/annotation_stack.h"
31 #include "tensorflow/core/profiler/backends/gpu/cupti_collector.h"
32 #include "tensorflow/core/profiler/backends/gpu/nvtx_utils.h"
33 
34 namespace tensorflow {
35 namespace profiler {
36 
37 namespace {
38 
// CUPTI from CUDA 11.6 adds information about the hardware channel that ops
// run on; this makes its way into the channel_id and channel_type fields in the
// structs we export.
//
// Define some type aliases so we can access the hardware channel id if it's
// available.
#if CUDA_VERSION >= 11060  // CUDA 11.6
// Only defined on this branch. Code further down tests it with `#if`, so on
// older toolkits (where it stays undefined) the channel-field accesses are
// compiled out.
#define TF_CUPTI_HAS_CHANNEL_ID 1
using CuptiActivityKernelTy = CUpti_ActivityKernel7;
using CuptiActivityMemcpyTy = CUpti_ActivityMemcpy5;
using CuptiActivityMemcpyP2PTy = CUpti_ActivityMemcpyPtoP4;
using CuptiActivityMemsetTy = CUpti_ActivityMemset4;
#else
// Pre-11.6 record types; presumably these lack the channelID/channelType
// members, which is why they are never read on this branch.
using CuptiActivityKernelTy = CUpti_ActivityKernel4;
using CuptiActivityMemcpyTy = CUpti_ActivityMemcpy;
using CuptiActivityMemcpyP2PTy = CUpti_ActivityMemcpy2;
using CuptiActivityMemsetTy = CUpti_ActivityMemset;
#endif
57 
// Per-thread count of CuptiApiTracingDisabler objects currently alive.
static thread_local int internalCuCall = 0;

// Temporarily disables CUPTI API tracing for this thread during the life scope
// of this class. Used for the API calls that are initiated by us.
//
// Construction increments the per-thread counter and destruction decrements
// it, so scopes may nest. The type is non-copyable: a copy would be destroyed
// without a matching increment and leave the counter unbalanced.
class CuptiApiTracingDisabler {
 public:
  CuptiApiTracingDisabler() { ++internalCuCall; }
  ~CuptiApiTracingDisabler() { --internalCuCall; }

  CuptiApiTracingDisabler(const CuptiApiTracingDisabler &) = delete;
  CuptiApiTracingDisabler &operator=(const CuptiApiTracingDisabler &) = delete;
};
67 
ToStatus(CUptiResult result)68 Status ToStatus(CUptiResult result) {
69   if (result == CUPTI_SUCCESS) {
70     return OkStatus();
71   }
72   const char *str = nullptr;
73   cuptiGetResultString(result, &str);
74   return errors::Unavailable("CUPTI error: ", str ? str : "<unknown>");
75 }
76 
ToStatus(CUresult result)77 Status ToStatus(CUresult result) {
78   if (result == CUDA_SUCCESS) {
79     return OkStatus();
80   }
81   const char *str = nullptr;
82   cuGetErrorName(result, &str);
83   return errors::Unavailable("CUDA error: ", str ? str : "<unknown>");
84 }
85 
LogIfError(const Status & status)86 inline void LogIfError(const Status &status) {
87   if (status.ok()) return;
88   LOG(ERROR) << status.error_message();
89 }
90 
91 // Maps an OverheadKind enum to a const string.
getActivityOverheadKindString(CUpti_ActivityOverheadKind kind)92 const char *getActivityOverheadKindString(CUpti_ActivityOverheadKind kind) {
93   switch (kind) {
94     case CUPTI_ACTIVITY_OVERHEAD_DRIVER_COMPILER:
95       return "COMPILER";
96     case CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH:
97       return "BUFFER_FLUSH";
98     case CUPTI_ACTIVITY_OVERHEAD_CUPTI_INSTRUMENTATION:
99       return "INSTRUMENTATION";
100     case CUPTI_ACTIVITY_OVERHEAD_CUPTI_RESOURCE:
101       return "RESOURCE";
102     default:
103       break;
104   }
105   return "<UNKNOWN>";
106 }
107 
getActivityUnifiedMemoryKindString(CUpti_ActivityUnifiedMemoryCounterKind kind)108 const char *getActivityUnifiedMemoryKindString(
109     CUpti_ActivityUnifiedMemoryCounterKind kind) {
110   switch (kind) {
111     case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD:
112       return "UM_BYTES_TRANSFER_HTOD";
113     case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH:
114       return "UM_BYTES_TRANSFER_DTOH";
115     case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT:
116       return "UM_CPU_PAGE_FAULT";
117     case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT:
118       return "UM_GPU_PAGE_FAULT";
119     case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING:
120       return "UM_THRASHING";
121     case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING:
122       return "UM_THROTTLING";
123     case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP:
124       return "UM_REMOTE_MAP";
125     case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOD:
126       return "UM_BYTES_TRANSFER_DTOD";
127     default:
128       break;
129   }
130   return "<UNKNOWN>";
131 }
132 
// CUPTI_ERROR_INSUFFICIENT_PRIVILEGES is introduced at CUDA 10.1.
#if CUDA_VERSION <= 10000
#define CUPTI_ERROR_INSUFFICIENT_PRIVILEGES 35
#endif

// Evaluates the CUPTI call `expr` exactly once. On any result other than
// CUPTI_SUCCESS it logs the failing expression together with CUPTI's result
// string and returns from the enclosing function: PermissionDenied for
// CUPTI_ERROR_INSUFFICIENT_PRIVILEGES, Internal for everything else.
// Requires a `cupti_interface_` member to be in scope at the expansion site.
#define RETURN_IF_CUPTI_ERROR(expr)                                          \
  do {                                                                       \
    CUptiResult status = (expr);                                             \
    if (ABSL_PREDICT_FALSE(status != CUPTI_SUCCESS)) {                       \
      const char *errstr = "";                                               \
      cupti_interface_->GetResultString(status, &errstr);                    \
      /* Streaming a null C string is undefined; mirror ToStatus(). */       \
      if (errstr == nullptr) errstr = "<unknown>";                           \
      LOG(ERROR) << "function " << #expr << " failed with error " << errstr; \
      if (status == CUPTI_ERROR_INSUFFICIENT_PRIVILEGES) {                   \
        return errors::PermissionDenied("CUPTI need root access!");          \
      } else {                                                               \
        return errors::Internal("CUPTI call error: ", errstr);               \
      }                                                                      \
    }                                                                        \
  } while (false)
152 
Bytes2D(const CUDA_MEMCPY2D * p)153 size_t Bytes2D(const CUDA_MEMCPY2D *p) { return p->Height * p->WidthInBytes; }
154 
Bytes3D(const CUDA_MEMCPY3D * p)155 size_t Bytes3D(const CUDA_MEMCPY3D *p) {
156   return p->Depth * p->Height * p->WidthInBytes;
157 }
158 
159 template <typename CudaMemcpy>
MemcpyKind(const CudaMemcpy * p)160 CuptiTracerEventType MemcpyKind(const CudaMemcpy *p) {
161   if (p->srcMemoryType == CU_MEMORYTYPE_HOST &&
162       p->dstMemoryType == CU_MEMORYTYPE_DEVICE) {
163     return CuptiTracerEventType::MemcpyH2D;
164   }
165   if (p->srcMemoryType == CU_MEMORYTYPE_DEVICE &&
166       p->dstMemoryType == CU_MEMORYTYPE_HOST) {
167     return CuptiTracerEventType::MemcpyD2H;
168   }
169   if (p->srcMemoryType == CU_MEMORYTYPE_DEVICE &&
170       p->dstMemoryType == CU_MEMORYTYPE_DEVICE) {
171     return CuptiTracerEventType::MemcpyD2D;
172   }
173   return CuptiTracerEventType::Unsupported;
174 }
175 
176 std::tuple<size_t /*bytes*/, CuptiTracerEventType, bool /*async*/>
DecodeDriverMemcpy(CUpti_CallbackId cbid,const void * params)177 DecodeDriverMemcpy(CUpti_CallbackId cbid, const void *params) {
178   switch (cbid) {
179     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2: {
180       const auto *p = reinterpret_cast<const cuMemcpyHtoD_v2_params *>(params);
181       return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyH2D,
182                              false);
183     }
184     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2: {
185       const auto *p =
186           reinterpret_cast<const cuMemcpyHtoDAsync_v2_params *>(params);
187       return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyH2D,
188                              true);
189     }
190     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2: {
191       const auto *p = reinterpret_cast<const cuMemcpyDtoH_v2_params *>(params);
192       return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyD2H,
193                              false);
194     }
195     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2: {
196       const auto *p =
197           reinterpret_cast<const cuMemcpyDtoHAsync_v2_params *>(params);
198       return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyD2H,
199                              true);
200     }
201     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2: {
202       const auto *p = reinterpret_cast<const cuMemcpyDtoD_v2_params *>(params);
203       return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyD2D,
204                              false);
205     }
206     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2: {
207       const auto *p =
208           reinterpret_cast<const cuMemcpyDtoDAsync_v2_params *>(params);
209       return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyD2D,
210                              true);
211     }
212     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy: {
213       const auto *p = reinterpret_cast<const cuMemcpy_params *>(params);
214       return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyOther,
215                              false);
216     }
217     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync: {
218       const auto *p = reinterpret_cast<const cuMemcpyAsync_params *>(params);
219       return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyOther,
220                              true);
221     }
222     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2: {
223       const auto *p = reinterpret_cast<const cuMemcpy2D_v2_params *>(params);
224       return std::make_tuple(Bytes2D(p->pCopy), MemcpyKind(p->pCopy), false);
225     }
226     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2: {
227       const auto *p =
228           reinterpret_cast<const cuMemcpy2DAsync_v2_params *>(params);
229       return std::make_tuple(Bytes2D(p->pCopy), MemcpyKind(p->pCopy), true);
230     }
231     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2: {
232       const auto *p = reinterpret_cast<const cuMemcpy3D_v2_params *>(params);
233       return std::make_tuple(Bytes3D(p->pCopy), MemcpyKind(p->pCopy), true);
234     }
235     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2: {
236       const auto *p =
237           reinterpret_cast<const cuMemcpy3DAsync_v2_params *>(params);
238       return std::make_tuple(Bytes3D(p->pCopy), MemcpyKind(p->pCopy), true);
239     }
240     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer: {
241       const auto *p2p_params =
242           reinterpret_cast<const cuMemcpyPeer_params *>(params);
243       return std::make_tuple(p2p_params->ByteCount,
244                              CuptiTracerEventType::MemcpyP2P, false);
245     }
246     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync: {
247       const auto *p2p_params =
248           reinterpret_cast<const cuMemcpyPeerAsync_params *>(params);
249       return std::make_tuple(p2p_params->ByteCount,
250                              CuptiTracerEventType::MemcpyP2P, true);
251     }
252     default: {
253       LOG(ERROR) << "Unsupported memcpy activity observed: " << cbid;
254       return std::make_tuple(0, CuptiTracerEventType::Unsupported, false);
255     }
256   }
257 }
258 
259 std::tuple<size_t /*bytes*/, CuptiTracerEventType, bool /*async*/>
DecodeDriverMemset(CUpti_CallbackId cbid,const void * params)260 DecodeDriverMemset(CUpti_CallbackId cbid, const void *params) {
261   switch (cbid) {
262     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD8_v2: {
263       const auto *p = reinterpret_cast<const cuMemsetD8_v2_params *>(params);
264       return std::make_tuple(p->N, CuptiTracerEventType::Memset, false);
265     }
266     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD16_v2: {
267       const auto *p = reinterpret_cast<const cuMemsetD16_v2_params *>(params);
268       return std::make_tuple(p->N, CuptiTracerEventType::Memset, false);
269     }
270     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD32_v2: {
271       const auto *p = reinterpret_cast<const cuMemsetD32_v2_params *>(params);
272       return std::make_tuple(p->N, CuptiTracerEventType::Memset, false);
273     }
274     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8_v2: {
275       const auto *p = reinterpret_cast<const cuMemsetD2D8_v2_params *>(params);
276       return std::make_tuple(p->dstPitch * p->Height,
277                              CuptiTracerEventType::Memset, false);
278     }
279     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16_v2: {
280       const auto *p = reinterpret_cast<const cuMemsetD2D16_v2_params *>(params);
281       return std::make_tuple(p->dstPitch * p->Height,
282                              CuptiTracerEventType::Memset, false);
283     }
284     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32_v2: {
285       const auto *p = reinterpret_cast<const cuMemsetD2D32_v2_params *>(params);
286       return std::make_tuple(p->dstPitch * p->Height,
287                              CuptiTracerEventType::Memset, false);
288     }
289     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD8Async: {
290       const auto *p = reinterpret_cast<const cuMemsetD8Async_params *>(params);
291       return std::make_tuple(p->N, CuptiTracerEventType::Memset, true);
292     }
293     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD16Async: {
294       const auto *p = reinterpret_cast<const cuMemsetD16Async_params *>(params);
295       return std::make_tuple(p->N, CuptiTracerEventType::Memset, true);
296     }
297     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD32Async: {
298       const auto *p = reinterpret_cast<const cuMemsetD32Async_params *>(params);
299       return std::make_tuple(p->N, CuptiTracerEventType::Memset, true);
300     }
301     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8Async: {
302       const auto *p =
303           reinterpret_cast<const cuMemsetD2D8Async_params *>(params);
304       return std::make_tuple(p->dstPitch * p->Height,
305                              CuptiTracerEventType::Memset, true);
306     }
307     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16Async: {
308       const auto *p =
309           reinterpret_cast<const cuMemsetD2D16Async_params *>(params);
310       return std::make_tuple(p->dstPitch * p->Height,
311                              CuptiTracerEventType::Memset, true);
312     }
313     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32Async: {
314       const auto *p =
315           reinterpret_cast<const cuMemsetD2D32Async_params *>(params);
316       return std::make_tuple(p->dstPitch * p->Height,
317                              CuptiTracerEventType::Memset, true);
318     }
319     default: {
320       LOG(ERROR) << "Unsupported memset activity observed: " << cbid;
321       return std::make_tuple(0, CuptiTracerEventType::Unsupported, false);
322     }
323   }
324 }
325 
326 // Cupti callback corresponding to a driver or runtime API. This global function
327 // is invoked twice for each API: at entry and at exit. The cbdata
328 // parameter is guaranteed by Cupti to be thread-safe. Most invocations are
329 // dropped to the floor and entry/exit is tracked for the APIs we deem
330 // performance-relevant.
ApiCallback(void * user_data,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)331 void CUPTIAPI ApiCallback(void *user_data, CUpti_CallbackDomain domain,
332                           CUpti_CallbackId cbid,
333                           const CUpti_CallbackData *cbdata) {
334   CuptiTracer *tracer = reinterpret_cast<CuptiTracer *>(user_data);
335   tracer->HandleCallback(domain, cbid, cbdata).IgnoreError();
336 }
337 
338 // Callback which is invoked when an empty buffer is requested by CUPTI.
339 // Allocates an empty aligned-memory buffer. The buffer is used by CUPTI as a
340 // ring buffer where device maintains activity profiles that have been
341 // collected.
RequestCuptiActivityBuffer(uint8_t ** buffer,size_t * size,size_t * maxNumRecords)342 void CUPTIAPI RequestCuptiActivityBuffer(uint8_t **buffer, size_t *size,
343                                          size_t *maxNumRecords) {
344   CuptiTracer::GetCuptiTracerSingleton()->RequestActivityBuffer(buffer, size);
345   VLOG(3) << "Requested CUPTI Buffer, buffer=" << std::hex
346           << reinterpret_cast<uintptr_t>(*buffer) << std::dec
347           << " size=" << *size;
348   // Request CUPTI to fill as many records as possible in the buffer.
349   *maxNumRecords = 0;
350 }
351 
352 // Callback which is invoked when a buffer containing activity records is
353 // available from CUPTI. Processes the buffer after reading activity records
354 // from it.
ProcessCuptiActivityBuffer(CUcontext context,uint32_t stream_id,uint8_t * buffer,size_t size,size_t valid_size)355 void CUPTIAPI ProcessCuptiActivityBuffer(CUcontext context, uint32_t stream_id,
356                                          uint8_t *buffer, size_t size,
357                                          size_t valid_size) {
358   VLOG(3) << "Processing CUPTI Buffer, buffer:" << std::hex
359           << reinterpret_cast<uintptr_t>(buffer) << std::dec
360           << " size: " << size << " valid_size: " << valid_size;
361   VLOG(3) << "Activity profile for stream " << stream_id;
362 
363   Status status = CuptiTracer::GetCuptiTracerSingleton()->ProcessActivityBuffer(
364       context, stream_id, buffer, valid_size);
365   if (!status.ok()) {
366     LOG(ERROR) << status;
367   }
368 }
369 
AddKernelEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)370 void AddKernelEventUponApiExit(CuptiTraceCollector *collector, uint32 device_id,
371                                const CUpti_CallbackData *cbdata,
372                                uint64 start_time, uint64 end_time) {
373   CuptiTracerEvent event{};
374   event.type = CuptiTracerEventType::Kernel;
375   event.source = CuptiTracerEventSource::DriverCallback;
376   event.name = cbdata->symbolName ? cbdata->symbolName : cbdata->functionName;
377   event.start_time_ns = start_time;
378   event.end_time_ns = end_time;
379   event.thread_id = Env::Default()->GetCurrentThreadId();
380   event.device_id = device_id;
381   event.context_id = cbdata->contextUid;
382   event.correlation_id = cbdata->correlationId;
383   VLOG(3) << "Cuda Kernel launch API exit. name=" << event.name;
384   collector->AddEvent(std::move(event));
385 }
386 
387 // Performs the actual callback for both normal and P2P memcpy operations.
PopulateMemcpyCallbackEvent(CuptiTracerEventType type,const CUpti_CallbackData * cbdata,size_t num_bytes,uint32 src_device,uint32 dst_device,bool async,uint64 start_time,uint64 end_time)388 CuptiTracerEvent PopulateMemcpyCallbackEvent(
389     CuptiTracerEventType type, const CUpti_CallbackData *cbdata,
390     size_t num_bytes, uint32 src_device, uint32 dst_device, bool async,
391     uint64 start_time, uint64 end_time) {
392   CuptiTracerEvent event{};
393   event.type = type;
394   event.source = CuptiTracerEventSource::DriverCallback;
395   event.start_time_ns = start_time;
396   event.end_time_ns = end_time;
397   event.thread_id = Env::Default()->GetCurrentThreadId();
398   event.device_id = src_device;
399   event.context_id = cbdata->contextUid;
400   event.correlation_id = cbdata->correlationId;
401   event.memcpy_info.num_bytes = num_bytes;
402   event.memcpy_info.destination = dst_device;
403   event.memcpy_info.async = async;
404   // These are not populated during callback for API activities.
405   event.memcpy_info.copy_kind = CUPTI_ACTIVITY_MEMCPY_KIND_UNKNOWN;
406   event.memcpy_info.dst_mem_kind = CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN;
407   event.memcpy_info.src_mem_kind = CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN;
408   return event;
409 }
410 
AddNormalMemcpyEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)411 void AddNormalMemcpyEventUponApiExit(CuptiTraceCollector *collector,
412                                      uint32 device_id, CUpti_CallbackId cbid,
413                                      const CUpti_CallbackData *cbdata,
414                                      uint64 start_time, uint64 end_time) {
415   size_t num_bytes;
416   CuptiTracerEventType type;
417   bool async;
418   std::tie(num_bytes, type, async) =
419       DecodeDriverMemcpy(cbid, cbdata->functionParams);
420 
421   VLOG(3) << "Cuda Memcpy API exit. sz=" << num_bytes;
422   CuptiTracerEvent event =
423       PopulateMemcpyCallbackEvent(type, cbdata, num_bytes, device_id, device_id,
424                                   async, start_time, end_time);
425   collector->AddEvent(std::move(event));
426 }
427 
AddCuMemsetEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)428 void AddCuMemsetEventUponApiExit(CuptiTraceCollector *collector,
429                                  uint32 device_id, CUpti_CallbackId cbid,
430                                  const CUpti_CallbackData *cbdata,
431                                  uint64 start_time, uint64 end_time) {
432   // We are casting all variants of cuMemset to cuMemsetD8 for accessing the
433   // first member attribute, a CUdeviceptr.
434   const auto *params =
435       static_cast<const cuMemsetD8_v2_params *>(cbdata->functionParams);
436   size_t num_bytes;
437   bool async;
438   CuptiTracerEventType type;
439   std::tie(num_bytes, type, async) =
440       DecodeDriverMemset(cbid, cbdata->functionParams);
441 
442   CuptiTracerEvent event{};
443   event.type = type;
444   event.source = CuptiTracerEventSource::DriverCallback;
445   event.start_time_ns = start_time;
446   event.end_time_ns = end_time;
447   event.thread_id = Env::Default()->GetCurrentThreadId();
448   event.device_id = device_id;
449   event.context_id = cbdata->contextUid;
450   event.correlation_id = cbdata->correlationId;
451   event.memset_info.num_bytes = num_bytes;
452   // memset_info.kind cannot be determined from API.
453   event.memset_info.async = async;
454   VLOG(3) << "Cuda Memset API exit."
455           << " dptr=" << reinterpret_cast<void *>(params->dstDevice)
456           << " sz=" << num_bytes;
457   collector->AddEvent(std::move(event));
458 }
459 
AddP2PMemcpyEventUponApiExit(CuptiTraceCollector * collector,CuptiInterface * cupti_interface,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)460 void AddP2PMemcpyEventUponApiExit(CuptiTraceCollector *collector,
461                                   CuptiInterface *cupti_interface,
462                                   uint32 device_id, CUpti_CallbackId cbid,
463                                   const CUpti_CallbackData *cbdata,
464                                   uint64 start_time, uint64 end_time) {
465   size_t num_bytes;
466   CuptiTracerEventType type;
467   bool async;
468   std::tie(num_bytes, type, async) =
469       DecodeDriverMemcpy(cbid, cbdata->functionParams);
470 
471   uint32 dst_device = -1, src_device = -1;
472   const auto *p2p_params =
473       static_cast<const cuMemcpyPeer_params *>(cbdata->functionParams);
474   cupti_interface->GetDeviceId(p2p_params->srcContext, &src_device);
475   cupti_interface->GetDeviceId(p2p_params->dstContext, &dst_device);
476   VLOG(3) << "Cuda P2P Memcpy API exit, src: " << src_device
477           << " dst: " << dst_device << " size:" << num_bytes;
478   CuptiTracerEvent event =
479       PopulateMemcpyCallbackEvent(type, cbdata, num_bytes, src_device,
480                                   dst_device, async, start_time, end_time);
481   collector->AddEvent(std::move(event));
482 }
483 
AddCuMemAllocEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)484 void AddCuMemAllocEventUponApiExit(CuptiTraceCollector *collector,
485                                    uint32 device_id, CUpti_CallbackId cbid,
486                                    const CUpti_CallbackData *cbdata,
487                                    uint64 start_time, uint64 end_time) {
488   const auto *params =
489       static_cast<const cuMemAlloc_v2_params *>(cbdata->functionParams);
490   CuptiTracerEvent event{};
491   event.type = CuptiTracerEventType::MemoryAlloc;
492   event.source = CuptiTracerEventSource::DriverCallback;
493   event.name = cbdata->functionName;
494   event.start_time_ns = start_time;
495   event.end_time_ns = end_time;
496   event.thread_id = Env::Default()->GetCurrentThreadId();
497   event.device_id = device_id;
498   event.context_id = cbdata->contextUid;
499   event.correlation_id = cbdata->correlationId;
500   event.memalloc_info.num_bytes = params->bytesize;
501   VLOG(3) << "Cuda MemAlloc API exit."
502           << " dptr=" << reinterpret_cast<void *>(*params->dptr)
503           << " sz=" << params->bytesize;
504   collector->AddEvent(std::move(event));
505 }
506 
AddCuMemAllocPitchEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)507 void AddCuMemAllocPitchEventUponApiExit(CuptiTraceCollector *collector,
508                                         uint32 device_id, CUpti_CallbackId cbid,
509                                         const CUpti_CallbackData *cbdata,
510                                         uint64 start_time, uint64 end_time) {
511   const auto *params =
512       static_cast<const cuMemAllocPitch_v2_params *>(cbdata->functionParams);
513   CuptiTracerEvent event{};
514   event.type = CuptiTracerEventType::MemoryAlloc;
515   event.source = CuptiTracerEventSource::DriverCallback;
516   event.name = cbdata->functionName;
517   event.start_time_ns = start_time;
518   event.end_time_ns = end_time;
519   event.thread_id = Env::Default()->GetCurrentThreadId();
520   event.device_id = device_id;
521   event.context_id = cbdata->contextUid;
522   event.correlation_id = cbdata->correlationId;
523   const size_t size_in_bytes = *params->pPitch * params->Height;
524   event.memalloc_info.num_bytes = size_in_bytes;
525   VLOG(3) << "Cuda MemAllocPitch API exit."
526           << " dptr=" << reinterpret_cast<void *>(*params->dptr)
527           << " sz=" << size_in_bytes;
528   collector->AddEvent(std::move(event));
529 }
530 
AddCuMemFreeEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)531 void AddCuMemFreeEventUponApiExit(CuptiTraceCollector *collector,
532                                   uint32 device_id, CUpti_CallbackId cbid,
533                                   const CUpti_CallbackData *cbdata,
534                                   uint64 start_time, uint64 end_time) {
535   const auto *params =
536       static_cast<const cuMemFree_v2_params *>(cbdata->functionParams);
537   CuptiTracerEvent event{};
538   event.type = CuptiTracerEventType::MemoryFree;
539   event.source = CuptiTracerEventSource::DriverCallback;
540   event.name = cbdata->functionName;
541   event.start_time_ns = start_time;
542   event.end_time_ns = end_time;
543   event.thread_id = Env::Default()->GetCurrentThreadId();
544   event.device_id = device_id;
545   event.context_id = cbdata->contextUid;
546   event.correlation_id = cbdata->correlationId;
547   VLOG(3) << "Cuda MemFree API exit."
548           << " dptr=" << reinterpret_cast<void *>(params->dptr);
549   collector->AddEvent(std::move(event));
550 }
551 
AddGenericEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)552 void AddGenericEventUponApiExit(CuptiTraceCollector *collector,
553                                 uint32 device_id, CUpti_CallbackId cbid,
554                                 const CUpti_CallbackData *cbdata,
555                                 uint64 start_time, uint64 end_time) {
556   CuptiTracerEvent event{};
557   event.type = CuptiTracerEventType::Generic;
558   event.source = CuptiTracerEventSource::DriverCallback;
559   event.name = cbdata->functionName;
560   event.start_time_ns = start_time;
561   event.end_time_ns = end_time;
562   event.thread_id = Env::Default()->GetCurrentThreadId();
563   event.device_id = device_id;
564   event.context_id = cbdata->contextUid;
565   event.correlation_id = cbdata->correlationId;
566   VLOG(3) << "Observed generic API exit."
567           << " name=" << cbdata->functionName;
568   collector->AddEvent(std::move(event));
569 }
570 
AddKernelActivityEvent(CuptiTraceCollector * collector,const CuptiActivityKernelTy * kernel)571 void AddKernelActivityEvent(CuptiTraceCollector *collector,
572                             const CuptiActivityKernelTy *kernel) {
573   CuptiTracerEvent event{};
574   event.type = CuptiTracerEventType::Kernel;
575   event.source = CuptiTracerEventSource::Activity;
576   event.name = kernel->name;
577   event.start_time_ns = kernel->start;
578   event.end_time_ns = kernel->end;
579   event.device_id = kernel->deviceId;
580   event.context_id = kernel->contextId;
581   event.stream_id = kernel->streamId;
582   event.correlation_id = kernel->correlationId;
583   AnnotationMap::AnnotationInfo info = collector->annotation_map()->LookUp(
584       event.device_id, event.correlation_id);
585   event.annotation = info.annotation;
586   event.nvtx_range = info.nvtx_range;
587   event.kernel_info.registers_per_thread = kernel->registersPerThread;
588   event.kernel_info.static_shared_memory_usage = kernel->staticSharedMemory;
589   event.kernel_info.dynamic_shared_memory_usage = kernel->dynamicSharedMemory;
590   event.kernel_info.block_x = kernel->blockX;
591   event.kernel_info.block_y = kernel->blockY;
592   event.kernel_info.block_z = kernel->blockZ;
593   event.kernel_info.grid_x = kernel->gridX;
594   event.kernel_info.grid_y = kernel->gridY;
595   event.kernel_info.grid_z = kernel->gridZ;
596 #if TF_CUPTI_HAS_CHANNEL_ID
597   event.kernel_info.channel_id = kernel->channelID;
598   event.kernel_info.channel_type = kernel->channelType;
599 #endif
600   collector->AddEvent(std::move(event));
601 }
602 
AddMemcpyActivityEvent(CuptiTraceCollector * collector,const CuptiActivityMemcpyTy * memcpy)603 void AddMemcpyActivityEvent(CuptiTraceCollector *collector,
604                             const CuptiActivityMemcpyTy *memcpy) {
605   CuptiTracerEvent event{};
606   switch (memcpy->copyKind) {
607     case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
608       event.type = CuptiTracerEventType::MemcpyH2D;
609       event.name = "MemcpyH2D";
610       break;
611     case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
612       event.type = CuptiTracerEventType::MemcpyD2H;
613       event.name = "MemcpyD2H";
614       break;
615     case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
616       event.type = CuptiTracerEventType::MemcpyD2D;
617       event.name = "MemcpyD2D";
618       break;
619     case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP:
620       event.type = CuptiTracerEventType::MemcpyP2P;
621       event.name = "MemcpyP2P";
622       break;
623     default:
624       event.type = CuptiTracerEventType::MemcpyOther;
625       event.name = "MemcpyOther";
626       break;
627   }
628 
629   event.source = CuptiTracerEventSource::Activity;
630   event.start_time_ns = memcpy->start;
631   event.end_time_ns = memcpy->end;
632   event.device_id = memcpy->deviceId;
633   event.context_id = memcpy->contextId;
634   event.stream_id = memcpy->streamId;
635   event.correlation_id = memcpy->correlationId;
636   AnnotationMap::AnnotationInfo info = collector->annotation_map()->LookUp(
637       event.device_id, event.correlation_id);
638   event.annotation = info.annotation;
639   event.memcpy_info.copy_kind = memcpy->copyKind;
640   event.memcpy_info.num_bytes = memcpy->bytes;
641   event.memcpy_info.destination = memcpy->deviceId;
642   event.memcpy_info.async = memcpy->flags & CUPTI_ACTIVITY_FLAG_MEMCPY_ASYNC;
643   event.memcpy_info.src_mem_kind = memcpy->srcKind;
644   event.memcpy_info.dst_mem_kind = memcpy->dstKind;
645 #if TF_CUPTI_HAS_CHANNEL_ID
646   event.memcpy_info.channel_id = memcpy->channelID;
647   event.memcpy_info.channel_type = memcpy->channelType;
648 #endif
649   collector->AddEvent(std::move(event));
650 }
651 
652 // Invokes callback upon peer-2-peer memcpy between different GPU devices.
AddMemcpyP2PActivityEvent(CuptiTraceCollector * collector,const CuptiActivityMemcpyP2PTy * memcpy)653 void AddMemcpyP2PActivityEvent(CuptiTraceCollector *collector,
654                                const CuptiActivityMemcpyP2PTy *memcpy) {
655   CuptiTracerEvent event{};
656   event.type = CuptiTracerEventType::MemcpyP2P;
657   event.name = "MemcpyP2P";
658   event.source = CuptiTracerEventSource::Activity;
659   event.start_time_ns = memcpy->start;
660   event.end_time_ns = memcpy->end;
661   event.device_id = memcpy->srcDeviceId;
662   event.context_id = memcpy->contextId;
663   event.stream_id = memcpy->streamId;
664   event.correlation_id = memcpy->correlationId;
665   AnnotationMap::AnnotationInfo info = collector->annotation_map()->LookUp(
666       event.device_id, event.correlation_id);
667   event.annotation = info.annotation;
668   event.memcpy_info.copy_kind = CUPTI_ACTIVITY_MEMCPY_KIND_PTOP;
669   event.memcpy_info.num_bytes = memcpy->bytes;
670   event.memcpy_info.destination = memcpy->dstDeviceId;
671   event.memcpy_info.async = memcpy->flags & CUPTI_ACTIVITY_FLAG_MEMCPY_ASYNC;
672   event.memcpy_info.src_mem_kind = memcpy->srcKind;
673   event.memcpy_info.dst_mem_kind = memcpy->dstKind;
674 #if TF_CUPTI_HAS_CHANNEL_ID
675   event.memcpy_info.channel_id = memcpy->channelID;
676   event.memcpy_info.channel_type = memcpy->channelType;
677 #endif
678   collector->AddEvent(std::move(event));
679 }
680 
AddCuptiOverheadActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityOverhead * overhead)681 void AddCuptiOverheadActivityEvent(CuptiTraceCollector *collector,
682                                    const CUpti_ActivityOverhead *overhead) {
683   CuptiTracerEvent event{};
684   event.type = CuptiTracerEventType::Overhead;
685   event.name = getActivityOverheadKindString(overhead->overheadKind);
686   event.source = CuptiTracerEventSource::Activity;
687   event.start_time_ns = overhead->start;
688   event.end_time_ns = overhead->end;
689   // If the overhead is not related to a device, we assign it to device 0.
690   event.device_id = 0;
691   // NOTE: no correlation id.
692   switch (overhead->objectKind) {
693     case CUPTI_ACTIVITY_OBJECT_UNKNOWN:
694       // Don't know how to deal with such activities because of we need either
695       // attribute it to a GPU stream or a CPU thread.
696       return;
697 
698     case CUPTI_ACTIVITY_OBJECT_THREAD:
699     case CUPTI_ACTIVITY_OBJECT_PROCESS:
700       event.thread_id = overhead->objectId.pt.threadId;
701       break;
702     case CUPTI_ACTIVITY_OBJECT_STREAM:
703       event.stream_id = overhead->objectId.dcs.streamId;
704       TF_FALLTHROUGH_INTENDED;
705     case CUPTI_ACTIVITY_OBJECT_DEVICE:
706     case CUPTI_ACTIVITY_OBJECT_CONTEXT:
707       event.device_id = overhead->objectId.dcs.deviceId;
708       break;
709     default:
710       LOG(ERROR) << "Unexpected object kind: " << overhead->objectKind;
711       return;
712   }
713   collector->AddEvent(std::move(event));
714 }
715 
AddUnifiedMemoryActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityUnifiedMemoryCounter2 * record)716 void AddUnifiedMemoryActivityEvent(
717     CuptiTraceCollector *collector,
718     const CUpti_ActivityUnifiedMemoryCounter2 *record) {
719   VLOG(3) << "Cuda Unified Memory Activity, kind: " << record->counterKind
720           << " src: " << record->srcId << " dst: " << record->dstId;
721   CuptiTracerEvent event{};
722   event.type = CuptiTracerEventType::UnifiedMemory;
723   event.name = getActivityUnifiedMemoryKindString(record->counterKind);
724   event.source = CuptiTracerEventSource::Activity;
725   event.start_time_ns = record->start;
726   if (record->counterKind ==
727           CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT ||
728       record->counterKind ==
729           CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING ||
730       record->counterKind ==
731           CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP ||
732       record->end <= record->start) {
733     // If the end time is not valid, trim it so that it can be shown on the UI.
734     event.end_time_ns = record->start + 1;
735   } else {
736     event.end_time_ns = record->end;
737   }
738   event.device_id = record->srcId;
739   // NOTE: not context id and correlation id.
740 
741   // For visualization purpose, we assign a pseudo stream id for each
742   // record->counterKind of unified memory related events.
743   constexpr int kPseudoStreamId = 0x10000000;
744   event.stream_id = kPseudoStreamId + record->counterKind;
745   event.memcpy_info.copy_kind = CUPTI_ACTIVITY_MEMCPY_KIND_UNKNOWN;
746   // Check whether the activity is byte transfer.
747   if (record->counterKind ==
748           CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD ||
749       record->counterKind ==
750           CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH ||
751       record->counterKind ==
752           CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOD) {
753     event.memcpy_info.num_bytes = record->value;
754   } else {
755     event.memcpy_info.num_bytes = 0;
756   }
757   event.memcpy_info.destination = record->dstId;
758   event.memcpy_info.async = false;
759   collector->AddEvent(std::move(event));
760 }
761 
AddMemoryActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityMemory * memory)762 void AddMemoryActivityEvent(CuptiTraceCollector *collector,
763                             const CUpti_ActivityMemory *memory) {
764   CuptiTracerEvent event{};
765   event.name = absl::StrCat("Memory ", GetMemoryKindName(memory->memoryKind));
766   event.type = CuptiTracerEventType::MemoryResidency;
767   event.source = CuptiTracerEventSource::Activity;
768   event.start_time_ns = memory->start;
769   event.end_time_ns = std::max(memory->end, memory->start + 1);
770   event.device_id = memory->deviceId;
771   event.context_id = memory->contextId;
772   // Assign to default stream (0) so that event is included during Flush().
773   event.stream_id = 0;
774   event.memory_residency_info.num_bytes = memory->bytes;
775   event.memory_residency_info.mem_kind = memory->memoryKind;
776   event.memory_residency_info.address = memory->address;
777   VLOG(5) << "Cuda activity " << event.name
778           << " addr: " << reinterpret_cast<void *>(memory->address)
779           << " bytes: " << memory->bytes;
780   collector->AddEvent(std::move(event));
781 }
782 
AddMemsetActivityEvent(CuptiTraceCollector * collector,const CuptiActivityMemsetTy * memset)783 void AddMemsetActivityEvent(CuptiTraceCollector *collector,
784                             const CuptiActivityMemsetTy *memset) {
785   auto mem_kind = memset->memoryKind;
786   CuptiTracerEvent event{};
787   event.type = CuptiTracerEventType::Memset;
788   event.source = CuptiTracerEventSource::Activity;
789   event.name = absl::StrCat("Memset ", mem_kind);
790   event.start_time_ns = memset->start;
791   event.end_time_ns = std::max(memset->end, memset->start + 1);
792   event.device_id = memset->deviceId;
793   event.correlation_id = memset->correlationId;
794   event.context_id = memset->contextId;
795   event.stream_id = memset->streamId;
796   event.memset_info.num_bytes = memset->bytes;
797   event.memset_info.mem_kind = mem_kind;
798   event.memset_info.async = (memset->flags & CUPTI_ACTIVITY_FLAG_MEMSET_ASYNC);
799 #if TF_CUPTI_HAS_CHANNEL_ID
800   event.memset_info.channel_id = memset->channelID;
801   event.memset_info.channel_type = memset->channelType;
802 #endif
803   VLOG(5) << "Cuda activity " << event.name << " bytes: " << memset->bytes
804           << " async: " << event.memset_info.async;
805   collector->AddEvent(std::move(event));
806 }
807 
AddSynchronizationActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivitySynchronization * sync)808 void AddSynchronizationActivityEvent(
809     CuptiTraceCollector *collector, const CUpti_ActivitySynchronization *sync) {
810   CuptiTracerEvent event{};
811   event.type = CuptiTracerEventType::Generic;
812   event.source = CuptiTracerEventSource::Activity;
813   switch (sync->type) {
814     case CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_EVENT_SYNCHRONIZE:
815       event.name = "cuEventSynchronize";
816       break;
817     case CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_WAIT_EVENT:
818       event.name = "cuStreamWaitEvent";
819       break;
820     case CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_SYNCHRONIZE:
821       event.name = "cuStreamSynchronize";
822       break;
823     case CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_CONTEXT_SYNCHRONIZE:
824       event.name = "cuCtxSynchronize";
825       break;
826     default:
827       event.name = "unknown synchronization event";
828       break;
829   }
830   event.start_time_ns = sync->start;
831   event.end_time_ns = std::max(sync->end, sync->start + 1);
832   event.correlation_id = sync->correlationId;
833   event.context_id = sync->contextId;
834   VLOG(5) << "Cuda activity " << event.name;
835   collector->AddEvent(std::move(event));
836 }
837 
838 // This hook uses cupti activity api to measure device side activities.
839 class CuptiDriverApiHookWithActivityApi : public CuptiDriverApiHook {
840  public:
CuptiDriverApiHookWithActivityApi(const CuptiTracerOptions & option,CuptiInterface * cupti_interface,CuptiTraceCollector * collector)841   CuptiDriverApiHookWithActivityApi(const CuptiTracerOptions &option,
842                                     CuptiInterface *cupti_interface,
843                                     CuptiTraceCollector *collector)
844       : option_(option),
845         cupti_interface_(cupti_interface),
846         collector_(collector) {}
847 
OnDriverApiEnter(int device_id,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)848   Status OnDriverApiEnter(int device_id, CUpti_CallbackDomain domain,
849                           CUpti_CallbackId cbid,
850                           const CUpti_CallbackData *cbdata) override {
851     // Stash away the current Cupti timestamp into cbdata.
852     *cbdata->correlationData =
853         option_.required_callback_api_events ? CuptiTracer::GetTimestamp() : 0;
854     return OkStatus();
855   }
OnDriverApiExit(int device_id,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)856   Status OnDriverApiExit(int device_id, CUpti_CallbackDomain domain,
857                          CUpti_CallbackId cbid,
858                          const CUpti_CallbackData *cbdata) override {
859     // If we are not collecting CPU events from Callback API, we can return now.
860     if (!option_.required_callback_api_events) {
861       return OkStatus();
862     }
863 
864     // Grab timestamp for API exit. API entry timestamp saved in cbdata.
865     uint64 end_tsc = CuptiTracer::GetTimestamp();
866     uint64 start_tsc = *cbdata->correlationData;
867     TrackContext(cbid, cbdata->context);
868     return AddDriverApiCallbackEvent(collector_, cupti_interface_, device_id,
869                                      start_tsc, end_tsc, domain, cbid, cbdata);
870   }
SyncAndFlush()871   Status SyncAndFlush() override {
872     if (option_.sync_devices_before_stop) {
873       CuptiApiTracingDisabler disabler;
874       absl::MutexLock lock(&mutex_);
875       for (auto &ctx : contexts_) {
876         cuCtxPushCurrent(ctx);
877         cuCtxSynchronize();  // Ignore error here for best effort.
878         CUcontext current;
879         cuCtxPopCurrent(&current);
880       }
881     }
882     return OkStatus();
883   }
884 
885  private:
TrackContext(CUpti_CallbackId cbid,CUcontext ctx)886   void TrackContext(CUpti_CallbackId cbid, CUcontext ctx) {
887     if (!option_.sync_devices_before_stop) return;
888     if (ctx == nullptr) return;
889     absl::MutexLock lock(&mutex_);
890     if (cbid == CUPTI_DRIVER_TRACE_CBID_cuCtxDestroy_v2 ||
891         cbid == CUPTI_DRIVER_TRACE_CBID_cuCtxDestroy) {
892       contexts_.erase(ctx);
893     } else {
894       contexts_.emplace(ctx);
895     }
896   }
897 
898   const CuptiTracerOptions option_;
899   CuptiInterface *cupti_interface_;
900   CuptiTraceCollector *collector_;
901   absl::Mutex mutex_;
902   absl::flat_hash_set<CUcontext> contexts_ TF_GUARDED_BY(mutex_);
903 
904   TF_DISALLOW_COPY_AND_ASSIGN(CuptiDriverApiHookWithActivityApi);
905 };
906 
907 struct KernelRecord {
908   const char *kernel_name;
909   // TODO(csigg): cuStreamGetCtx introduced in CUDA 9.2 would allow us to only
910   // record the stream and infer the context during collection.
911   CUcontext context;
912   CUstream stream;
913   uint32 correlation_id;
914   CUevent start_event;
915   CUevent stop_event;
916   KernelDetails details;
917   uint64 start_timestamp;
918 };
919 
920 struct MemcpyRecord {
921   CuptiTracerEventType type;
922   size_t size_bytes;
923   CUcontext context;
924   CUstream stream;
925   uint32 correlation_id;
926   bool async;
927   CUevent start_event;
928   CUevent stop_event;
929   uint64 start_timestamp;
930 };
931 
CreateAndRecordEvent(CUevent * event,CUstream stream)932 Status CreateAndRecordEvent(CUevent *event, CUstream stream) {
933   CuptiApiTracingDisabler disabler;
934   TF_RETURN_IF_ERROR(ToStatus(cuEventCreate(event, CU_EVENT_DEFAULT)));
935   return ToStatus(cuEventRecord(*event, stream));
936 }
937 
#if CUDA_VERSION >= 10000
// Maintain and restore current thread's CUDA context.
// Note: cuStreamGetCtx only available after CUDA 9.2.
//
// The constructor looks up the context that owns `stream`, resolves its device
// ordinal and pushes the context; the destructor pops it again if the push
// succeeded. The type is non-copyable because a copy would pop a second time.
class ScopedCudaContext {
 public:
  explicit ScopedCudaContext(CUstream stream) {
    CuptiApiTracingDisabler disabler;  // don't trace cuda call in this func.
    CUcontext context;
    if (cuStreamGetCtx(stream, &context) != CUDA_SUCCESS) return;
    context_ = context;
    uint32 device_ordinal;
    if (cuptiGetDeviceId(context, &device_ordinal) != CUPTI_SUCCESS) return;
    device_ordinal_ = device_ordinal;
    context_pushed_ = cuCtxPushCurrent(context) == CUDA_SUCCESS;
  }
  ~ScopedCudaContext() {
    if (!context_pushed_) return;
    CuptiApiTracingDisabler disabler;  // don't trace cuda call in this func.
    // Pop into a scratch variable rather than overwriting the stored context.
    CUcontext popped = nullptr;
    cuCtxPopCurrent(&popped);
  }

  // If successful, return the device ordinal of the relevant cuda stream.
  // Otherwise absl::nullopt;
  absl::optional<uint32> GetDeviceOrdinal() const { return device_ordinal_; }

  // If successful, return the cuda context of the relevant cuda stream.
  // Otherwise absl::nullopt;
  absl::optional<CUcontext> GetContext() const { return context_; }

 private:
  // Set once the stream's context has been looked up successfully.
  absl::optional<CUcontext> context_;
  // Set once the context's device id has been resolved successfully.
  absl::optional<uint32> device_ordinal_;
  // True only if the constructor pushed the context.
  bool context_pushed_ = false;

  TF_DISALLOW_COPY_AND_ASSIGN(ScopedCudaContext);
};
#endif
974 
975 // Stores a series of kernel and memcpy records.
976 class CudaEventRecorder {
977  public:
CudaEventRecorder(CuptiInterface * cupti_interface,CuptiTraceCollector * collector,int ordinal)978   CudaEventRecorder(CuptiInterface *cupti_interface,
979                     CuptiTraceCollector *collector, int ordinal)
980       : cupti_interface_(cupti_interface),
981         collector_(collector),
982         ordinal_(ordinal) {
983     device_name_ = absl::StrCat("gpu ", ordinal);  // default.
984     CUdevice device;
985     if (cuDeviceGet(&device, ordinal) == CUDA_SUCCESS) {
986       char name[100];
987       if (cuDeviceGetName(name, sizeof(name), device) == CUDA_SUCCESS) {
988         device_name_ = name;
989       }
990     }
991   }
992 
993   // Registers the start of a kernel launch. The returned index should be passed
994   // to StopKernel() after the kernel launch has completed.
995   template <typename T>
StartKernel(const char * kernel_name,CUcontext context,uint32 correlation_id,const T * params)996   size_t StartKernel(const char *kernel_name, CUcontext context,
997                      uint32 correlation_id, const T *params) {
998     CUstream stream = params->hStream;
999     KernelRecord record = {kernel_name, context, stream, correlation_id};
1000     record.details.registers_per_thread = 0;  // unknown.
1001     record.details.static_shared_memory_usage = params->sharedMemBytes;
1002     record.details.dynamic_shared_memory_usage = 0;  // unknown
1003     record.details.block_x = params->blockDimX;
1004     record.details.block_y = params->blockDimY;
1005     record.details.block_z = params->blockDimZ;
1006     record.details.grid_x = params->gridDimX;
1007     record.details.grid_y = params->gridDimY;
1008     record.details.grid_z = params->gridDimZ;
1009     record.start_timestamp = CuptiTracer::GetTimestamp();
1010     LogIfError(CreateAndRecordEvent(&record.start_event, stream));
1011     absl::MutexLock lock(&mutex_);
1012     if (stopped_) return -1;
1013     kernel_records_.push_back(record);
1014     return kernel_records_.size() - 1;
1015   }
StopKernel(size_t index)1016   uint64 StopKernel(size_t index) {
1017     absl::MutexLock lock(&mutex_);
1018     if (index >= kernel_records_.size()) return 0;
1019     auto &record = kernel_records_[index];
1020     LogIfError(CreateAndRecordEvent(&record.stop_event, record.stream));
1021     return record.start_timestamp;
1022   }
1023 
1024   // Registers the start of a copy operation. The returned index should be
1025   // passed to StopMemcpy() after the memcpy has completed.
StartMemcpy(CuptiTracerEventType type,size_t size_bytes,CUcontext context,CUstream stream,uint32 correlation_id,bool async)1026   size_t StartMemcpy(CuptiTracerEventType type, size_t size_bytes,
1027                      CUcontext context, CUstream stream, uint32 correlation_id,
1028                      bool async) {
1029     MemcpyRecord record = {type,   size_bytes,     context,
1030                            stream, correlation_id, async};
1031     record.start_timestamp = CuptiTracer::GetTimestamp();
1032     LogIfError(CreateAndRecordEvent(&record.start_event, stream));
1033     absl::MutexLock lock(&mutex_);
1034     if (stopped_) return -1;
1035     memcpy_records_.push_back(record);
1036     return memcpy_records_.size() - 1;
1037   }
StopMemcpy(size_t index)1038   uint64 StopMemcpy(size_t index) {
1039     absl::MutexLock lock(&mutex_);
1040     if (index >= memcpy_records_.size()) return 0;
1041     auto &record = memcpy_records_[index];
1042     LogIfError(CreateAndRecordEvent(&record.stop_event, record.stream));
1043     return record.start_timestamp;
1044   }
1045 
Stop()1046   Status Stop() {
1047     {
1048       absl::MutexLock lock(&mutex_);
1049       stopped_ = true;
1050       LOG(INFO) << "Collecting " << kernel_records_.size()
1051                 << " kernel records, " << memcpy_records_.size()
1052                 << " memcpy records.";
1053 
1054       // Gather all profiled streams and contexts.
1055       for (const auto &record : kernel_records_) {
1056         TF_RETURN_IF_ERROR(
1057             AddStreamInfo(record.context, record.stream, "Kernel"));
1058       }
1059       for (const auto &record : memcpy_records_) {
1060         TF_RETURN_IF_ERROR(AddStreamInfo(record.context, record.stream,
1061                                          GetTraceEventTypeName(record.type)));
1062       }
1063     }
1064 
1065     // Synchronize all contexts, record end events, synchronize again.
1066     // This scheme is an unreliable measure to associate a event with the wall
1067     // time. There are chances that other threads might enque kernels which
1068     // delay the second synchronization.
1069     TF_RETURN_IF_ERROR(Synchronize());
1070     for (auto &pair : context_infos_) {
1071       TF_RETURN_IF_ERROR(ToStatus(cuCtxSetCurrent(pair.first)));
1072       TF_RETURN_IF_ERROR(CreateAndRecordEvent(&pair.second.end_event, nullptr));
1073     }
1074 
1075     TF_RETURN_IF_ERROR(Synchronize());
1076     end_walltime_us_ = Env::Default()->NowMicros();
1077     return OkStatus();
1078   }
1079 
Flush(AnnotationMap * annotation_map)1080   Status Flush(AnnotationMap *annotation_map) {
1081     auto kernel_records = ConsumeKernelRecords();
1082     auto memcpy_records = ConsumeMemcpyRecords();
1083     for (const auto &record : kernel_records) {
1084       TF_RETURN_IF_ERROR(SaveRecord(record, annotation_map));
1085     }
1086     for (const auto &record : memcpy_records) {
1087       TF_RETURN_IF_ERROR(SaveRecord(record, annotation_map));
1088     }
1089     return OkStatus();
1090   }
1091 
ConsumeKernelRecords()1092   std::vector<KernelRecord> ConsumeKernelRecords() {
1093     absl::MutexLock lock(&mutex_);
1094     return std::move(kernel_records_);
1095   }
ConsumeMemcpyRecords()1096   std::vector<MemcpyRecord> ConsumeMemcpyRecords() {
1097     absl::MutexLock lock(&mutex_);
1098     return std::move(memcpy_records_);
1099   }
1100 
1101  private:
1102   struct ContextInfo {
1103     uint32 context_id = 0;
1104     int num_streams = 0;
1105     CUevent end_event;
1106   };
1107 
1108   struct StreamInfo {
1109     uint32 stream_id = 0;
1110     std::string name;
1111     int index;  // 0 is reserved for null stream.
1112     const ContextInfo *ctx_info;
1113   };
1114 
1115   // Synchronizes all contexts.
Synchronize() const1116   Status Synchronize() const {
1117     CuptiApiTracingDisabler disabler;
1118     for (const auto &pair : context_infos_) {
1119       TF_RETURN_IF_ERROR(ToStatus(cuCtxSetCurrent(pair.first)));
1120       TF_RETURN_IF_ERROR(ToStatus(cuCtxSynchronize()));
1121     }
1122     return OkStatus();
1123   }
1124 
1125   // Returns element from context_infos_, adding it if not yet present.
GetContextInfo(CUcontext context,ContextInfo ** ctx_info_ptr)1126   Status GetContextInfo(CUcontext context, ContextInfo **ctx_info_ptr) {
1127     auto it = context_infos_.find(context);
1128 
1129     if (it == context_infos_.end()) {
1130       uint32 context_id = 0;
1131       RETURN_IF_CUPTI_ERROR(
1132           cupti_interface_->GetContextId(context, &context_id));
1133       ContextInfo ctx_info = {context_id};
1134       it = context_infos_.emplace(context, ctx_info).first;
1135     }
1136 
1137     *ctx_info_ptr = &it->second;
1138     return OkStatus();
1139   }
1140 
1141   // Adds element to stream_infos_ if not yet present. If present, clear name
1142   // if it doesn't match parameter.
AddStreamInfo(CUcontext context,CUstream stream,absl::string_view name)1143   Status AddStreamInfo(CUcontext context, CUstream stream,
1144                        absl::string_view name) {
1145     StreamKey key(context, stream);
1146     auto it = stream_infos_.find(key);
1147     if (it != stream_infos_.end()) {
1148       if (it->second.name != name) {
1149         it->second.name.clear();  // Stream with inconsistent names, clear it.
1150       }
1151       return OkStatus();
1152     }
1153 
1154     ContextInfo *ctx_info;
1155     TF_RETURN_IF_ERROR(GetContextInfo(context, &ctx_info));
1156     int index = stream ? ++ctx_info->num_streams : 0;
1157     uint32 stream_id = 0;
1158 #if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
1159     RETURN_IF_CUPTI_ERROR(
1160         cupti_interface_->GetStreamIdEx(context, stream, 1, &stream_id));
1161 #else
1162     RETURN_IF_CUPTI_ERROR(
1163         cupti_interface_->GetStreamIdEx(context, stream, 0, &stream_id));
1164 #endif
1165 
1166     StreamInfo stream_info = {stream_id, static_cast<std::string>(name), index,
1167                               ctx_info};
1168     stream_infos_.emplace(key, stream_info);
1169     return OkStatus();
1170   }
1171 
1172   // Returns time in microseconds between events recorded on the GPU.
GetElapsedTimeUs(CUevent start,CUevent stop)1173   static uint64_t GetElapsedTimeUs(CUevent start, CUevent stop) {
1174     CuptiApiTracingDisabler disabler;
1175     float elapsed_ms = 0.0f;
1176     LogIfError(ToStatus(cuEventElapsedTime(&elapsed_ms, start, stop)));
1177     return static_cast<uint64>(
1178         std::llroundf(1000 * std::max(elapsed_ms, 0.0f)));
1179   }
1180 
SaveRecord(const KernelRecord & record,AnnotationMap * annotation_map) const1181   Status SaveRecord(const KernelRecord &record,
1182                     AnnotationMap *annotation_map) const {
1183     if (!record.start_event || !record.stop_event) {
1184       return OkStatus();
1185     }
1186     const auto &stream_info =
1187         stream_infos_.at(StreamKey(record.context, record.stream));
1188     auto start_us =
1189         GetElapsedTimeUs(record.start_event, stream_info.ctx_info->end_event);
1190     auto elapsed_us = GetElapsedTimeUs(record.start_event, record.stop_event);
1191 
1192     std::string annotation;
1193 
1194     CuptiTracerEvent event{};
1195     event.type = CuptiTracerEventType::Kernel;
1196     event.source = CuptiTracerEventSource::Activity;  // on gpu device.
1197     event.name = record.kernel_name;
1198     event.start_time_ns = (end_walltime_us_ - start_us) * 1000;
1199     event.end_time_ns = event.start_time_ns + elapsed_us * 1000;
1200     event.device_id = ordinal_;
1201     event.context_id = stream_info.ctx_info->context_id;
1202     event.stream_id = stream_info.stream_id;
1203     event.correlation_id = record.correlation_id;
1204     AnnotationMap::AnnotationInfo info = collector_->annotation_map()->LookUp(
1205         event.device_id, event.correlation_id);
1206     event.annotation = info.annotation;
1207     event.kernel_info = record.details;
1208     collector_->AddEvent(std::move(event));
1209     return OkStatus();
1210   }
1211 
SaveRecord(const MemcpyRecord & record,AnnotationMap * annotation_map) const1212   Status SaveRecord(const MemcpyRecord &record,
1213                     AnnotationMap *annotation_map) const {
1214     if (!record.start_event || !record.stop_event) {
1215       return OkStatus();
1216     }
1217     const auto &stream_info =
1218         stream_infos_.at(StreamKey(record.context, record.stream));
1219     auto start_us =
1220         GetElapsedTimeUs(record.start_event, stream_info.ctx_info->end_event);
1221     auto elapsed_us = GetElapsedTimeUs(record.start_event, record.stop_event);
1222 
1223     CuptiTracerEvent event{};
1224     event.type = record.type;
1225     event.name = GetTraceEventTypeName(event.type);
1226     event.source = CuptiTracerEventSource::Activity;
1227     event.start_time_ns = (end_walltime_us_ - start_us) * 1000;
1228     event.end_time_ns = event.start_time_ns + elapsed_us * 1000;
1229     event.device_id = ordinal_;
1230     event.context_id = stream_info.ctx_info->context_id;
1231     event.stream_id = stream_info.stream_id;
1232     event.correlation_id = record.correlation_id;
1233     AnnotationMap::AnnotationInfo info = collector_->annotation_map()->LookUp(
1234         event.device_id, event.correlation_id);
1235     event.annotation = info.annotation;
1236     event.memcpy_info.num_bytes = record.size_bytes;
1237     // TODO: support MemcpyD2D where destination != source;
1238     event.memcpy_info.destination = ordinal_;
1239     event.memcpy_info.async = record.async;
1240     // TODO: set src_mem_kind and dst_mem_kind.
1241     collector_->AddEvent(std::move(event));
1242     return OkStatus();
1243   }
1244 
1245   absl::Mutex mutex_;
1246   bool stopped_ TF_GUARDED_BY(mutex_) = false;
1247   std::vector<KernelRecord> kernel_records_ TF_GUARDED_BY(mutex_);
1248   std::vector<MemcpyRecord> memcpy_records_ TF_GUARDED_BY(mutex_);
1249 
1250   CuptiInterface *cupti_interface_;
1251   CuptiTraceCollector *collector_;
1252   const int ordinal_;
1253   std::string device_name_;
1254   uint64 end_walltime_us_;
1255   // Include context in key to distinguish null streams.
1256   using StreamKey = std::pair<CUcontext, CUstream>;
1257 
1258   absl::node_hash_map<CUcontext, ContextInfo> context_infos_;
1259   absl::flat_hash_map<StreamKey, StreamInfo> stream_infos_;
1260 };
1261 
1262 // This hook uses cuda events to measure device side activities.
1263 class CuptiDriverApiHookWithCudaEvent : public CuptiDriverApiHook {
1264  public:
CuptiDriverApiHookWithCudaEvent(const CuptiTracerOptions & option,CuptiInterface * cupti_interface,CuptiTraceCollector * collector)1265   CuptiDriverApiHookWithCudaEvent(const CuptiTracerOptions &option,
1266                                   CuptiInterface *cupti_interface,
1267                                   CuptiTraceCollector *collector)
1268       : option_(option),
1269         cupti_interface_(cupti_interface),
1270         collector_(collector) {
1271     int num_gpus = CuptiTracer::NumGpus();
1272     cuda_event_recorders_.reserve(num_gpus);
1273     for (int i = 0; i < num_gpus; ++i) {
1274       cuda_event_recorders_.emplace_back(
1275           std::make_unique<CudaEventRecorder>(cupti_interface, collector, i));
1276     }
1277   }
~CuptiDriverApiHookWithCudaEvent()1278   ~CuptiDriverApiHookWithCudaEvent() {
1279     for (auto *callback_context : callback_contexts_) delete callback_context;
1280   }
1281 
OnDriverApiEnter(int device_id,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)1282   Status OnDriverApiEnter(int device_id, CUpti_CallbackDomain domain,
1283                           CUpti_CallbackId cbid,
1284                           const CUpti_CallbackData *cbdata) override {
1285     auto *recorder = cuda_event_recorders_[device_id].get();
1286     switch (cbid) {
1287       case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel: {
1288         DCHECK_NE(cbdata->symbolName, nullptr);
1289         const auto *params =
1290             static_cast<const cuLaunchKernel_params *>(cbdata->functionParams);
1291         *cbdata->correlationData = recorder->StartKernel<cuLaunchKernel_params>(
1292             cbdata->symbolName, cbdata->context, cbdata->correlationId, params);
1293         break;
1294       }
1295       case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel: {
1296         DCHECK_NE(cbdata->symbolName, nullptr);
1297         const auto *params =
1298             static_cast<const cuLaunchCooperativeKernel_params *>(
1299                 cbdata->functionParams);
1300         *cbdata->correlationData =
1301             recorder->StartKernel<cuLaunchCooperativeKernel_params>(
1302                 cbdata->symbolName, cbdata->context, cbdata->correlationId,
1303                 params);
1304         break;
1305       }
1306       case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice: {
1307 #if CUDA_VERSION >= 10000
1308         const auto *params =
1309             static_cast<const cuLaunchCooperativeKernelMultiDevice_params *>(
1310                 cbdata->functionParams);
1311         std::vector<uint32> record_indices;
1312         record_indices.reserve(params->numDevices);
1313         *cbdata->correlationData = -1;  // Invalid value.
1314         const auto &annotation = AnnotationStack::Get();
1315         for (int i = 0; i < params->numDevices; ++i) {
1316           CUstream stream = params->launchParamsList[i].hStream;
1317           ScopedCudaContext scoped_cuda_context(stream);
1318           auto dev_id = scoped_cuda_context.GetDeviceOrdinal();
1319           auto context = scoped_cuda_context.GetContext();
1320           if (!dev_id) return errors::Internal("Invalid CUDA stream");
1321           // Because annotation are per device, therefore we need to populate
1322           // annotation for each device involved.
1323           collector_->annotation_map()->Add(*dev_id, cbdata->correlationId,
1324                                             annotation, "");
1325           record_indices.push_back(
1326               cuda_event_recorders_[*dev_id]->StartKernel<CUDA_LAUNCH_PARAMS>(
1327                   "CooperativeKernelMultiDevice", *context,
1328                   cbdata->correlationId, &(params->launchParamsList[i])));
1329         }
1330         auto *callback_context =
1331             new CuptiApiCallbackContext(std::move(record_indices));
1332         callback_contexts_.insert(callback_context);
1333         *cbdata->correlationData = reinterpret_cast<uint64>(callback_context);
1334 #else
1335         VLOG(1) << "Unhandled cuLaunchCooperativeKernelMultiDevice.";
1336 #endif
1337       } break;
1338       case CUPTI_DRIVER_TRACE_CBID_cuMemcpy: {
1339         const auto *params =
1340             static_cast<const cuMemcpy_params *>(cbdata->functionParams);
1341         StartMemcpy<cuMemcpy_params>(GetMemcpyType(params->src, params->dst),
1342                                      cbdata, recorder);
1343         break;
1344       }
1345       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync: {
1346         const auto *params =
1347             static_cast<const cuMemcpyAsync_params *>(cbdata->functionParams);
1348         StartMemcpyAsync<cuMemcpyAsync_params>(
1349             GetMemcpyType(params->src, params->dst), cbdata, recorder);
1350         break;
1351       }
1352       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2:
1353         StartMemcpy<cuMemcpyHtoD_v2_params>(CuptiTracerEventType::MemcpyH2D,
1354                                             cbdata, recorder);
1355         break;
1356       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2:
1357         StartMemcpyAsync<cuMemcpyHtoDAsync_v2_params>(
1358             CuptiTracerEventType::MemcpyH2D, cbdata, recorder);
1359         break;
1360       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2:
1361         StartMemcpy<cuMemcpyDtoH_v2_params>(CuptiTracerEventType::MemcpyD2H,
1362                                             cbdata, recorder);
1363         break;
1364       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2:
1365         StartMemcpyAsync<cuMemcpyDtoHAsync_v2_params>(
1366             CuptiTracerEventType::MemcpyD2H, cbdata, recorder);
1367         break;
1368       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2:
1369         StartMemcpy<cuMemcpyDtoD_v2_params>(CuptiTracerEventType::MemcpyD2D,
1370                                             cbdata, recorder);
1371         break;
1372       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2:
1373         StartMemcpyAsync<cuMemcpyDtoDAsync_v2_params>(
1374             CuptiTracerEventType::MemcpyD2D, cbdata, recorder);
1375         break;
1376       default:
1377         VLOG(1) << "Unexpected callback id: " << cbid;
1378         break;
1379     }
1380     return OkStatus();
1381   }
1382 
OnDriverApiExit(int device_id,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)1383   Status OnDriverApiExit(int device_id, CUpti_CallbackDomain domain,
1384                          CUpti_CallbackId cbid,
1385                          const CUpti_CallbackData *cbdata) override {
1386     auto *recorder = cuda_event_recorders_[device_id].get();
1387     if (*cbdata->correlationData == static_cast<size_t>(-1)) return OkStatus();
1388     uint64 start_tsc = 0;
1389     switch (cbid) {
1390       case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel:
1391       case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel:
1392         start_tsc = recorder->StopKernel(*cbdata->correlationData);
1393         break;
1394       case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice: {
1395 #if CUDA_VERSION >= 10000
1396         auto *callback_context = reinterpret_cast<CuptiApiCallbackContext *>(
1397             *cbdata->correlationData);
1398         callback_contexts_.erase(callback_context);
1399         auto record_indices = std::move(callback_context->record_indices);
1400         delete callback_context;
1401         const auto *params =
1402             static_cast<const cuLaunchCooperativeKernelMultiDevice_params *>(
1403                 cbdata->functionParams);
1404         if (record_indices.size() != params->numDevices)
1405           return errors::Internal("Invalid correlation data");
1406         for (int i = 0; i < params->numDevices; ++i) {
1407           CUstream stream = params->launchParamsList[i].hStream;
1408           ScopedCudaContext scoped_cuda_context(stream);
1409           auto dev_id = scoped_cuda_context.GetDeviceOrdinal();
1410           if (!dev_id) return errors::Internal("Invalid CUDA stream");
1411           start_tsc =
1412               cuda_event_recorders_[*dev_id]->StopKernel(record_indices[i]);
1413         }
1414 #endif
1415       } break;
1416       case CUPTI_DRIVER_TRACE_CBID_cuMemcpy:
1417       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync:
1418       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2:
1419       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2:
1420       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2:
1421       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2:
1422       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2:
1423       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2:
1424         start_tsc = recorder->StopMemcpy(*cbdata->correlationData);
1425         break;
1426       default:
1427         VLOG(1) << "Unexpected callback id: " << cbid;
1428         // TODO: figure out how to get start timestamp in this case.
1429         return OkStatus();
1430     }
1431     // If we are not collecting CPU events from Callback API, we can return now.
1432     if (!option_.required_callback_api_events) {
1433       return OkStatus();
1434     }
1435 
1436     // Grab timestamp for API exit. API entry timestamp saved in cbdata.
1437     uint64 end_tsc = CuptiTracer::GetTimestamp();
1438     return AddDriverApiCallbackEvent(collector_, cupti_interface_, device_id,
1439                                      start_tsc, end_tsc, domain, cbid, cbdata);
1440   }
SyncAndFlush()1441   Status SyncAndFlush() override {
1442     for (auto &recorder : cuda_event_recorders_) {
1443       TF_RETURN_IF_ERROR(recorder->Stop());
1444     }
1445     for (auto &recorder : cuda_event_recorders_) {
1446       TF_RETURN_IF_ERROR(recorder->Flush(collector_->annotation_map()));
1447     }
1448     return OkStatus();
1449   }
1450 
1451  private:
1452   template <typename T>
StartMemcpy(CuptiTracerEventType type,const CUpti_CallbackData * cbdata,CudaEventRecorder * recorder)1453   static void StartMemcpy(CuptiTracerEventType type,
1454                           const CUpti_CallbackData *cbdata,
1455                           CudaEventRecorder *recorder) {
1456     const auto *params = static_cast<const T *>(cbdata->functionParams);
1457     *cbdata->correlationData =
1458         recorder->StartMemcpy(type, params->ByteCount, cbdata->context, nullptr,
1459                               cbdata->correlationId, /*async*/ false);
1460   }
1461 
1462   template <typename T>
StartMemcpyAsync(CuptiTracerEventType type,const CUpti_CallbackData * cbdata,CudaEventRecorder * recorder)1463   static void StartMemcpyAsync(CuptiTracerEventType type,
1464                                const CUpti_CallbackData *cbdata,
1465                                CudaEventRecorder *recorder) {
1466     const auto *params = static_cast<const T *>(cbdata->functionParams);
1467     *cbdata->correlationData = recorder->StartMemcpy(
1468         type, params->ByteCount, cbdata->context, params->hStream,
1469         cbdata->correlationId, /*async*/ true);
1470   }
1471 
GetMemoryType(CUdeviceptr ptr)1472   static CUmemorytype GetMemoryType(CUdeviceptr ptr) {
1473     CuptiApiTracingDisabler disabler;
1474     CUmemorytype mem_type = CU_MEMORYTYPE_HOST;
1475     auto status =
1476         cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptr);
1477     if (status == CUDA_ERROR_INVALID_VALUE) {
1478       // Pointer not registered with CUDA, must be host memory.
1479       return CU_MEMORYTYPE_HOST;
1480     }
1481     LogIfError(ToStatus(status));
1482     return mem_type;
1483   }
1484 
GetMemcpyType(CUdeviceptr src,CUdeviceptr dst)1485   static CuptiTracerEventType GetMemcpyType(CUdeviceptr src, CUdeviceptr dst) {
1486     CUmemorytype src_type = GetMemoryType(src);
1487     CUmemorytype dst_type = GetMemoryType(dst);
1488     // TODO: handle CU_MEMORYTYPE_ARRAY case
1489     if (src_type == CU_MEMORYTYPE_HOST && dst_type == CU_MEMORYTYPE_DEVICE) {
1490       return CuptiTracerEventType::MemcpyH2D;
1491     } else if (src_type == CU_MEMORYTYPE_DEVICE &&
1492                dst_type == CU_MEMORYTYPE_HOST) {
1493       return CuptiTracerEventType::MemcpyD2H;
1494     } else if (src_type == CU_MEMORYTYPE_DEVICE &&
1495                dst_type == CU_MEMORYTYPE_DEVICE) {
1496       return CuptiTracerEventType::MemcpyD2D;
1497     }
1498     return CuptiTracerEventType::MemcpyOther;
1499   }
1500 
1501   // Each cuLaunchCooperativeKernelMultiDevice will need to add an entry in
1502   // each corresponding device, therefore we need to keep records of all
1503   // the record indices in each device's record array.
1504   // We allocate such data structure during API entry and free during API exit.
1505   // However there is no guarantee that we receive such callbacks in pairs, we
1506   // maintain a on-going API calls to make sure no memory leaks.
1507   struct CuptiApiCallbackContext {
CuptiApiCallbackContexttensorflow::profiler::__anonec3d75710111::CuptiDriverApiHookWithCudaEvent::CuptiApiCallbackContext1508     explicit CuptiApiCallbackContext(std::vector<uint32> &&r)
1509         : record_indices(std::move(r)) {}
1510     std::vector<uint32> record_indices;
1511   };
1512 
1513   const CuptiTracerOptions option_;
1514   CuptiInterface *cupti_interface_;
1515   CuptiTraceCollector *collector_;
1516   absl::node_hash_set<CuptiApiCallbackContext *> callback_contexts_;
1517   std::vector<std::unique_ptr<CudaEventRecorder>> cuda_event_recorders_;
1518   TF_DISALLOW_COPY_AND_ASSIGN(CuptiDriverApiHookWithCudaEvent);
1519 };
1520 
ErrorWithHostname(absl::string_view error_message)1521 /*static*/ std::string ErrorWithHostname(absl::string_view error_message) {
1522   return absl::StrCat(port::Hostname(), ": ", error_message);
1523 }
1524 
1525 }  // namespace
1526 
AddDriverApiCallbackEvent(CuptiTraceCollector * collector,CuptiInterface * cupti_interface,int device_id,uint64 start_tsc,uint64 end_tsc,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)1527 /*static*/ Status CuptiDriverApiHook::AddDriverApiCallbackEvent(
1528     CuptiTraceCollector *collector, CuptiInterface *cupti_interface,
1529     int device_id, uint64 start_tsc, uint64 end_tsc,
1530     CUpti_CallbackDomain domain, CUpti_CallbackId cbid,
1531     const CUpti_CallbackData *cbdata) {
1532   switch (cbid) {
1533     case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel:
1534     case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel:
1535     case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice:
1536       AddKernelEventUponApiExit(collector, device_id, cbdata, start_tsc,
1537                                 end_tsc);
1538       break;
1539     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy:
1540     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync:
1541     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2:
1542     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2:
1543     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2:
1544     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2:
1545     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2:
1546     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2:
1547     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2:
1548     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2:
1549     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2:
1550     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2:
1551     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2:
1552     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2:
1553     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2:
1554     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2:
1555     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2:
1556     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2:
1557     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2:
1558     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2:
1559       // This would be the place to populate the memcpy API activity's src and
1560       // dst memory kind by casting cbdata->functionParams. However, we are not
1561       // doing that because that will incur significant overhead to get the
1562       // memory aperture of each argument.
1563       AddNormalMemcpyEventUponApiExit(collector, device_id, cbid, cbdata,
1564                                       start_tsc, end_tsc);
1565       break;
1566     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer:
1567     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync:
1568       AddP2PMemcpyEventUponApiExit(collector, cupti_interface, device_id, cbid,
1569                                    cbdata, start_tsc, end_tsc);
1570       break;
1571     case CUPTI_DRIVER_TRACE_CBID_cuMemAlloc_v2:
1572       AddCuMemAllocEventUponApiExit(collector, device_id, cbid, cbdata,
1573                                     start_tsc, end_tsc);
1574       break;
1575     case CUPTI_DRIVER_TRACE_CBID_cuMemAllocPitch_v2:
1576       AddCuMemAllocPitchEventUponApiExit(collector, device_id, cbid, cbdata,
1577                                          start_tsc, end_tsc);
1578       break;
1579     case CUPTI_DRIVER_TRACE_CBID_cuMemFree_v2:
1580       AddCuMemFreeEventUponApiExit(collector, device_id, cbid, cbdata,
1581                                    start_tsc, end_tsc);
1582       break;
1583     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD8_v2:
1584     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD16_v2:
1585     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD32_v2:
1586     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8_v2:
1587     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16_v2:
1588     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32_v2:
1589     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD8Async:
1590     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD16Async:
1591     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD32Async:
1592     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8Async:
1593     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16Async:
1594     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32Async:
1595       AddCuMemsetEventUponApiExit(collector, device_id, cbid, cbdata, start_tsc,
1596                                   end_tsc);
1597       break;
1598     default:
1599       AddGenericEventUponApiExit(collector, device_id, cbid, cbdata, start_tsc,
1600                                  end_tsc);
1601       break;
1602   }
1603   return OkStatus();
1604 }
1605 
GetTraceEventTypeName(const CuptiTracerEventType & type)1606 const char *GetTraceEventTypeName(const CuptiTracerEventType &type) {
1607   // Do not use a default so that this gives a build error when
1608   // CuptiTracerEventType is extended but this is not.
1609   switch (type) {
1610     case CuptiTracerEventType::MemcpyH2D:
1611       return "MemcpyH2D";
1612     case CuptiTracerEventType::MemcpyD2H:
1613       return "MemcpyD2H";
1614     case CuptiTracerEventType::MemcpyD2D:
1615       return "MemcpyD2D";
1616     case CuptiTracerEventType::MemcpyP2P:
1617       return "MemcpyP2P";
1618     case CuptiTracerEventType::MemcpyOther:
1619       return "MemcpyOther";
1620     case CuptiTracerEventType::Kernel:
1621       return "Compute";
1622     case CuptiTracerEventType::MemoryAlloc:
1623       return "MemoryAlloc";
1624     case CuptiTracerEventType::MemoryFree:
1625       return "MemoryFree";
1626     case CuptiTracerEventType::Memset:
1627       return "Memset";
1628     case CuptiTracerEventType::Overhead:
1629       return "Overhead";
1630     case CuptiTracerEventType::UnifiedMemory:
1631       return "UnifiedMemory";
1632     case CuptiTracerEventType::Generic:
1633       return "Generic";
1634     case CuptiTracerEventType::MemoryResidency:
1635       return "MemoryResidency";
1636     case CuptiTracerEventType::Unsupported:
1637       return "";
1638   }
1639 }
1640 
CuptiTracer(CuptiInterface * cupti_interface)1641 CuptiTracer::CuptiTracer(CuptiInterface *cupti_interface)
1642     : num_gpus_(NumGpus()),
1643       cupti_interface_(cupti_interface),
1644       buffer_pool_(kBufferSizeInBytes) {}
1645 
GetCuptiTracerSingleton()1646 /* static */ CuptiTracer *CuptiTracer::GetCuptiTracerSingleton() {
1647   static auto *singleton = new CuptiTracer(GetCuptiInterface());
1648   return singleton;
1649 }
1650 
IsAvailable() const1651 bool CuptiTracer::IsAvailable() const {
1652   return NumGpus() && !activity_tracing_enabled_ && !api_tracing_enabled_;
1653 }
1654 
NumGpus()1655 int CuptiTracer::NumGpus() {
1656   static int num_gpus = []() -> int {
1657     if (cuInit(0) != CUDA_SUCCESS) {
1658       return 0;
1659     }
1660     int gpu_count;
1661     if (cuDeviceGetCount(&gpu_count) != CUDA_SUCCESS) {
1662       return 0;
1663     }
1664     LOG(INFO) << "Profiler found " << gpu_count << " GPUs";
1665     return gpu_count;
1666   }();
1667   return num_gpus;
1668 }
1669 
Enable(const CuptiTracerOptions & option,CuptiTraceCollector * collector)1670 void CuptiTracer::Enable(const CuptiTracerOptions &option,
1671                          CuptiTraceCollector *collector) {
1672   option_ = option;
1673   collector_ = collector;
1674   if (option_->enable_event_based_activity) {
1675     option_->enable_activity_api = false;
1676     cupti_driver_api_hook_.reset(new CuptiDriverApiHookWithCudaEvent(
1677         option, cupti_interface_, collector));
1678   } else {
1679     cupti_driver_api_hook_.reset(new CuptiDriverApiHookWithActivityApi(
1680         option, cupti_interface_, collector));
1681   }
1682 
1683   Status status = EnableApiTracing();
1684   need_root_access_ |= status.code() == error::PERMISSION_DENIED;
1685   if (!status.ok()) return;
1686 
1687   if (option_->enable_activity_api) {
1688     EnableActivityTracing().IgnoreError();
1689   }
1690   tensorflow::profiler::AnnotationStack::Enable(true);
1691 }
1692 
Disable()1693 void CuptiTracer::Disable() {
1694   DisableApiTracing().IgnoreError();
1695   if (option_->enable_activity_api) {
1696     DisableActivityTracing().IgnoreError();
1697   }
1698   cupti_interface_->CleanUp();
1699   Finalize().IgnoreError();
1700   cupti_driver_api_hook_->SyncAndFlush().IgnoreError();
1701   collector_->Flush();
1702   collector_ = nullptr;
1703   option_.reset();
1704   cupti_driver_api_hook_.reset();
1705   tensorflow::profiler::AnnotationStack::Enable(false);
1706 }
1707 
EnableApiTracing()1708 Status CuptiTracer::EnableApiTracing() {
1709   if (api_tracing_enabled_) return OkStatus();
1710 
1711   VLOG(1) << "Enable subscriber";
1712   // Subscribe can return CUPTI_ERROR_MAX_LIMIT_REACHED.
1713   // The application which calls CUPTI APIs cannot be used with Nvidia tools
1714   // like nvprof, Nvidia Visual Profiler, Nsight Compute, Nsight Systems.
1715   RETURN_IF_CUPTI_ERROR(cupti_interface_->Subscribe(
1716       &subscriber_, (CUpti_CallbackFunc)ApiCallback, this));
1717   api_tracing_enabled_ = true;
1718 
1719   if (!option_->cbids_selected.empty()) {
1720     for (auto cbid : option_->cbids_selected) {
1721       RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableCallback(
1722           1 /* ENABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API, cbid));
1723     }
1724   } else {  // select all callback ids.
1725     RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
1726         1 /* ENABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API));
1727   }
1728 
1729   if (option_->enable_nvtx_tracking) {
1730     RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
1731         1 /* ENABLE */, subscriber_, CUPTI_CB_DOMAIN_NVTX));
1732   }
1733   return OkStatus();
1734 }
1735 
DisableApiTracing()1736 Status CuptiTracer::DisableApiTracing() {
1737   if (!api_tracing_enabled_) return OkStatus();
1738 
1739   api_tracing_enabled_ = false;
1740 
1741   if (!option_->cbids_selected.empty()) {
1742     for (auto cbid : option_->cbids_selected) {
1743       RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableCallback(
1744           0 /* DISABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API, cbid));
1745     }
1746   } else {
1747     RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
1748         0 /* DISABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API));
1749   }
1750 
1751   if (option_->enable_nvtx_tracking) {
1752     RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
1753         0 /* DISABLE */, subscriber_, CUPTI_CB_DOMAIN_NVTX));
1754   }
1755 
1756   VLOG(1) << "Disable subscriber";
1757   RETURN_IF_CUPTI_ERROR(cupti_interface_->Unsubscribe(subscriber_));
1758   return OkStatus();
1759 }
1760 
EnableActivityTracing()1761 Status CuptiTracer::EnableActivityTracing() {
1762   if (!option_->activities_selected.empty()) {
1763     // Initialize callback functions for Cupti Activity API.
1764     VLOG(1) << "Registering CUPTI activity callbacks";
1765     RETURN_IF_CUPTI_ERROR(cupti_interface_->ActivityRegisterCallbacks(
1766         RequestCuptiActivityBuffer, ProcessCuptiActivityBuffer));
1767 
1768     VLOG(1) << "Enabling activity tracing for "
1769             << option_->activities_selected.size() << " activities";
1770     for (auto activity : option_->activities_selected) {
1771       VLOG(1) << "Enabling activity tracing for: " << activity;
1772       if (activity == CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER) {
1773         ConfigureActivityUnifiedMemoryCounter(true);
1774       }
1775       RETURN_IF_CUPTI_ERROR(cupti_interface_->ActivityEnable(activity));
1776     }
1777   }
1778   activity_tracing_enabled_ = true;
1779   return OkStatus();
1780 }
1781 
DisableActivityTracing()1782 Status CuptiTracer::DisableActivityTracing() {
1783   if (activity_tracing_enabled_) {
1784     VLOG(1) << "Disabling activity tracing for "
1785             << option_->activities_selected.size() << " activities";
1786     for (auto activity : option_->activities_selected) {
1787       VLOG(1) << "Disabling activity tracing for: " << activity;
1788       if (activity == CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER) {
1789         ConfigureActivityUnifiedMemoryCounter(false);
1790       }
1791       RETURN_IF_CUPTI_ERROR(cupti_interface_->ActivityDisable(activity));
1792     }
1793     option_->activities_selected.clear();
1794 
1795     VLOG(1) << "Flushing CUPTI activity buffer";
1796     RETURN_IF_CUPTI_ERROR(
1797         cupti_interface_->ActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED));
1798     LOG(INFO) << "CUPTI activity buffer flushed";
1799   }
1800   activity_tracing_enabled_ = false;
1801   return OkStatus();
1802 }
1803 
Finalize()1804 Status CuptiTracer::Finalize() {
1805   if (option_->cupti_finalize) {
1806     VLOG(1) << "CuptiFinalize";
1807     RETURN_IF_CUPTI_ERROR(cupti_interface_->Finalize());
1808   }
1809   return OkStatus();
1810 }
1811 
GetTimestamp()1812 /*static*/ uint64 CuptiTracer::GetTimestamp() {
1813   uint64_t tsc;
1814   CuptiInterface *cupti_interface = GetCuptiInterface();
1815   if (cupti_interface && cupti_interface->GetTimestamp(&tsc) == CUPTI_SUCCESS) {
1816     return tsc;
1817   }
1818   // Return 0 on error. If an activity timestamp is 0, the activity will be
1819   // dropped during time normalization.
1820   return 0;
1821 }
1822 
HandleNVTXCallback(CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)1823 Status CuptiTracer::HandleNVTXCallback(CUpti_CallbackId cbid,
1824                                        const CUpti_CallbackData *cbdata) {
1825   const CUpti_NvtxData *pdata =
1826       reinterpret_cast<const CUpti_NvtxData *>(cbdata);
1827   if (cbid == CUPTI_CBID_NVTX_nvtxDomainRangePushEx) {
1828     const nvtxDomainRangePushEx_params *params =
1829         reinterpret_cast<const nvtxDomainRangePushEx_params *>(
1830             pdata->functionParams);
1831     // TODO(profiler): The messageType is actually NVTX_MESSAGE_TYPE_REGISTERED
1832     // (which is 3), However it seems to me that we can not get the registered
1833     // string from nvtxDomainRegisterStringA_params. If we reinterpret the
1834     // payload as ascii, it happen to work.
1835     NVTXRangeTracker::EnterRange(params->core.eventAttrib->message.ascii);
1836   } else if (cbid == CUPTI_CBID_NVTX_nvtxDomainRangePop) {
1837     NVTXRangeTracker::ExitRange();
1838   }
1839   return OkStatus();
1840 }
1841 
HandleCallback(CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)1842 Status CuptiTracer::HandleCallback(CUpti_CallbackDomain domain,
1843                                    CUpti_CallbackId cbid,
1844                                    const CUpti_CallbackData *cbdata) {
1845   if (!api_tracing_enabled_) return OkStatus();    // already unsubscribed.
1846   if (!cupti_driver_api_hook_) return OkStatus();  // already unsubscribed.
1847   if (domain == CUPTI_CB_DOMAIN_NVTX) return HandleNVTXCallback(cbid, cbdata);
1848   if (domain != CUPTI_CB_DOMAIN_DRIVER_API) return OkStatus();
1849   if (internalCuCall) return OkStatus();
1850 
1851   if (cbdata->context == nullptr) {
1852     // API callback is called before any CUDA context is created.
1853     // This is expected to be rare, and we ignore this case.
1854     VLOG(3) << "API callback received before creation of CUDA context\n";
1855     return errors::Internal("cutpi callback without context");
1856   }
1857 
1858   // Grab a correct device ID.
1859   uint32 device_id = -1;
1860   RETURN_IF_CUPTI_ERROR(
1861       cupti_interface_->GetDeviceId(cbdata->context, &device_id));
1862   if (device_id >= num_gpus_) {
1863     return errors::Internal("Invalid device id:", device_id);
1864   }
1865 
1866   if (cbdata->callbackSite == CUPTI_API_ENTER) {
1867     TF_RETURN_IF_ERROR(cupti_driver_api_hook_->OnDriverApiEnter(
1868         device_id, domain, cbid, cbdata));
1869   } else if (cbdata->callbackSite == CUPTI_API_EXIT) {
1870     // Set up the map from correlation id to annotation string.
1871     const auto &annotation = AnnotationStack::Get();
1872     if (!annotation.empty()) {
1873       if (cbid ==
1874           CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice) {
1875         // Kernels are launched on different devices by this API call, therefore
1876         // we need to populate per device annotation map respectively.
1877         for (int i = 0; i < num_gpus_; ++i) {
1878           collector_->annotation_map()->Add(i, cbdata->correlationId,
1879                                             annotation, "");
1880         }
1881       } else {
1882         absl::string_view nvtx_range = NVTXRangeTracker::CurrentRange();
1883         collector_->annotation_map()->Add(device_id, cbdata->correlationId,
1884                                           annotation, nvtx_range);
1885       }
1886     }
1887 
1888     TF_RETURN_IF_ERROR(cupti_driver_api_hook_->OnDriverApiExit(
1889         device_id, domain, cbid, cbdata));
1890   }
1891   return OkStatus();
1892 }
1893 
ConfigureActivityUnifiedMemoryCounter(bool enable)1894 void CuptiTracer::ConfigureActivityUnifiedMemoryCounter(bool enable) {
1895   CUpti_ActivityUnifiedMemoryCounterConfig config[2];
1896   // By experiments, currently only measurements from these two activities are
1897   // trustworthy. Others like GPU page fault may be problematic.
1898   config[0].kind =
1899       CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD;
1900   config[1].kind =
1901       CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH;
1902 
1903   for (size_t i = 0; i < 2; i++) {
1904     config[i].enable = enable;
1905   }
1906 
1907   CUptiResult res;
1908 
1909   res = cupti_interface_->ActivityConfigureUnifiedMemoryCounter(config, 2);
1910   if (res == CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED) {
1911     LOG(ERROR) << "Unified memory is not supported on the "
1912                   "underlying platform.\n";
1913   } else if (res == CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_DEVICE) {
1914     LOG(ERROR) << "Unified memory is not supported on the device.\n";
1915   } else if (res == CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_NON_P2P_DEVICES) {
1916     LOG(ERROR) << "Unified memory is not supported on the "
1917                   "non-P2P multi-gpu setup.\n";
1918   } else if (res != CUPTI_SUCCESS) {
1919     const char *errstr = "";
1920     cuptiGetResultString(res, &errstr);
1921     LOG(ERROR) << "Error while enabling unified memory profiling: " << errstr;
1922   } else {
1923     VLOG(1) << "Configuring Unified memory profiling: " << res;
1924   }
1925 }
1926 
RequestActivityBuffer(uint8_t ** buffer,size_t * size)1927 void CuptiTracer::RequestActivityBuffer(uint8_t **buffer, size_t *size) {
1928   *buffer = buffer_pool_.GetOrCreateBuffer();
1929   if (*buffer == nullptr) {
1930     LOG(WARNING)
1931         << "CUPTI Buffer not allocated, activity records will be dropped";
1932     *size = 0;
1933     return;
1934   }
1935   *size = buffer_pool_.GetBufferSizeInBytes();
1936 }
1937 
ProcessActivityBuffer(CUcontext context,uint32_t stream_id,uint8_t * buffer,size_t size)1938 Status CuptiTracer::ProcessActivityBuffer(CUcontext context, uint32_t stream_id,
1939                                           uint8_t *buffer, size_t size) {
1940   auto buffer_cleanup =
1941       gtl::MakeCleanup([&]() { buffer_pool_.ReclaimBuffer(buffer); });
1942   if (size == 0) {
1943     return OkStatus();
1944   }
1945   if (!activity_tracing_enabled_) {
1946     LOG(WARNING) << "CUPTI activity buffer is reclaimed after flush.";
1947     return OkStatus();
1948   }
1949   if (cupti_interface_->Disabled()) return errors::Internal("Disabled.");
1950 
1951   CUpti_Activity *record = nullptr;
1952   while (true) {
1953     CUptiResult status =
1954         cupti_interface_->ActivityGetNextRecord(buffer, size, &record);
1955     if (status == CUPTI_SUCCESS) {
1956       switch (record->kind) {
1957         case CUPTI_ACTIVITY_KIND_KERNEL:  // sequential
1958         case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL:
1959           AddKernelActivityEvent(
1960               collector_, reinterpret_cast<CuptiActivityKernelTy *>(record));
1961           break;
1962         case CUPTI_ACTIVITY_KIND_MEMCPY:
1963           AddMemcpyActivityEvent(
1964               collector_, reinterpret_cast<CuptiActivityMemcpyTy *>(record));
1965           break;
1966         case CUPTI_ACTIVITY_KIND_MEMCPY2:
1967           AddMemcpyP2PActivityEvent(
1968               collector_, reinterpret_cast<CuptiActivityMemcpyP2PTy *>(record));
1969           break;
1970         case CUPTI_ACTIVITY_KIND_OVERHEAD:
1971           AddCuptiOverheadActivityEvent(
1972               collector_, reinterpret_cast<CUpti_ActivityOverhead *>(record));
1973           break;
1974         case CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER:
1975           AddUnifiedMemoryActivityEvent(
1976               collector_,
1977               reinterpret_cast<CUpti_ActivityUnifiedMemoryCounter2 *>(record));
1978           break;
1979         case CUPTI_ACTIVITY_KIND_MEMORY: {
1980           AddMemoryActivityEvent(
1981               collector_, reinterpret_cast<CUpti_ActivityMemory *>(record));
1982         } break;
1983         case CUPTI_ACTIVITY_KIND_MEMSET:
1984           AddMemsetActivityEvent(
1985               collector_, reinterpret_cast<CuptiActivityMemsetTy *>(record));
1986           break;
1987         case CUPTI_ACTIVITY_KIND_SYNCHRONIZATION:
1988           AddSynchronizationActivityEvent(
1989               collector_,
1990               reinterpret_cast<CUpti_ActivitySynchronization *>(record));
1991           break;
1992         default:
1993           VLOG(3) << "Activity type " << record->kind << " is not supported.";
1994           break;
1995       }
1996     } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
1997       break;
1998     } else {
1999       return errors::Internal("Parse cupti activity buffer error.");
2000     }
2001   }
2002 
2003   // Report dropped records.
2004   size_t dropped;
2005   RETURN_IF_CUPTI_ERROR(cupti_interface_->ActivityGetNumDroppedRecords(
2006       context, stream_id, &dropped));
2007   if (dropped != 0) {
2008     uint32 device_id = -1;
2009     RETURN_IF_CUPTI_ERROR(cupti_interface_->GetDeviceId(context, &device_id));
2010     collector_->OnEventsDropped("cupti activity buffer full", dropped);
2011   }
2012   return OkStatus();
2013 }
2014 
ErrorIfAny()2015 /*static*/ std::string CuptiTracer::ErrorIfAny() {
2016   if (CuptiTracer::NumGpus() == 0) {
2017     return ErrorWithHostname("No GPU detected.");
2018   } else if (CuptiTracer::GetCuptiTracerSingleton()->NeedRootAccess()) {
2019     return ErrorWithHostname(
2020         "Insufficient privilege to run libcupti (you need root permission).");
2021   } else if (CuptiTracer::GetTimestamp() == 0) {
2022     return ErrorWithHostname(
2023         "Failed to load libcupti (is it installed and accessible?)");
2024   }
2025   return "";
2026 }
2027 
2028 }  // namespace profiler
2029 }  // namespace tensorflow
2030