• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/profiler/internal/gpu/cupti_tracer.h"
17 
18 #include "absl/container/flat_hash_map.h"
19 #include "absl/container/flat_hash_set.h"
20 #include "absl/container/node_hash_map.h"
21 #include "absl/container/node_hash_set.h"
22 #include "third_party/gpus/cuda/extras/CUPTI/include/cupti_activity.h"
23 #include "third_party/gpus/cuda/extras/CUPTI/include/generated_nvtx_meta.h"
24 #include "tensorflow/core/lib/gtl/cleanup.h"
25 #include "tensorflow/core/platform/env.h"
26 #include "tensorflow/core/platform/errors.h"
27 #include "tensorflow/core/platform/host_info.h"
28 #include "tensorflow/core/platform/logging.h"
29 #include "tensorflow/core/platform/macros.h"
30 #include "tensorflow/core/profiler/internal/cpu/annotation_stack.h"
31 #include "tensorflow/core/profiler/internal/gpu/cupti_collector.h"
32 #include "tensorflow/core/profiler/internal/gpu/nvtx_utils.h"
33 
34 namespace tensorflow {
35 namespace profiler {
36 
37 namespace {
38 
// Per-thread counter adjusted by CuptiApiTracingDisabler below.
static thread_local int internalCuCall = 0;

// Scoped guard intended to temporarily disable CUPTI API tracing on the
// current thread, for API calls that are issued by the profiler itself.
// Construction increments the calling thread's internalCuCall counter and
// destruction decrements it again.
// NOTE(review): the code that reads internalCuCall is not visible in this part
// of the file; presumably the callback handler skips work while it is non-zero.
class CuptiApiTracingDisabler {
 public:
  CuptiApiTracingDisabler() { ++internalCuCall; }
  ~CuptiApiTracingDisabler() { --internalCuCall; }
};
48 
ToStatus(CUptiResult result)49 Status ToStatus(CUptiResult result) {
50   if (result == CUPTI_SUCCESS) {
51     return Status::OK();
52   }
53   const char *str = nullptr;
54   cuptiGetResultString(result, &str);
55   return errors::Unavailable("CUPTI error: ", str ? str : "<unknown>");
56 }
57 
ToStatus(CUresult result)58 Status ToStatus(CUresult result) {
59   if (result == CUDA_SUCCESS) {
60     return Status::OK();
61   }
62   const char *str = nullptr;
63   cuGetErrorName(result, &str);
64   return errors::Unavailable("CUDA error: ", str ? str : "<unknown>");
65 }
66 
LogIfError(const Status & status)67 inline void LogIfError(const Status &status) {
68   if (status.ok()) return;
69   LOG(ERROR) << status.error_message();
70 }
71 
72 // Maps an OverheadKind enum to a const string.
getActivityOverheadKindString(CUpti_ActivityOverheadKind kind)73 const char *getActivityOverheadKindString(CUpti_ActivityOverheadKind kind) {
74   switch (kind) {
75     case CUPTI_ACTIVITY_OVERHEAD_DRIVER_COMPILER:
76       return "COMPILER";
77     case CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH:
78       return "BUFFER_FLUSH";
79     case CUPTI_ACTIVITY_OVERHEAD_CUPTI_INSTRUMENTATION:
80       return "INSTRUMENTATION";
81     case CUPTI_ACTIVITY_OVERHEAD_CUPTI_RESOURCE:
82       return "RESOURCE";
83     default:
84       break;
85   }
86   return "<UNKNOWN>";
87 }
88 
getActivityUnifiedMemoryKindString(CUpti_ActivityUnifiedMemoryCounterKind kind)89 const char *getActivityUnifiedMemoryKindString(
90     CUpti_ActivityUnifiedMemoryCounterKind kind) {
91   switch (kind) {
92     case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD:
93       return "UM_BYTES_TRANSFER_HTOD";
94     case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH:
95       return "UM_BYTES_TRANSFER_DTOH";
96     case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT:
97       return "UM_CPU_PAGE_FAULT";
98     case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT:
99       return "UM_GPU_PAGE_FAULT";
100     case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING:
101       return "UM_THRASHING";
102     case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING:
103       return "UM_THROTTLING";
104     case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP:
105       return "UM_REMOTE_MAP";
106     case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOD:
107       return "UM_BYTES_TRANSFER_DTOD";
108     default:
109       break;
110   }
111   return "<UNKNOWN>";
112 }
113 
// CUPTI_ERROR_INSUFFICIENT_PRIVILEGES was introduced in CUDA 10.1; supply its
// numeric value when building against an older toolkit.
#if CUDA_VERSION <= 10000
#define CUPTI_ERROR_INSUFFICIENT_PRIVILEGES 35
#endif

// Evaluates `expr` (an expression yielding a CUptiResult) exactly once. When
// the result is not CUPTI_SUCCESS, logs the failing expression together with
// the text obtained from cupti_interface_->GetResultString and returns from the
// enclosing function with:
//   * errors::PermissionDenied for CUPTI_ERROR_INSUFFICIENT_PRIVILEGES,
//   * errors::Internal for every other failure.
// Requirements on the call site: a `cupti_interface_` must be in scope and the
// enclosing function must return a Status.
// The temporary is named `cupti_status_` so that an `expr` mentioning a caller
// variable called `status` is not shadowed, and a null result string is
// replaced by "<unknown>" before it is streamed.
#define RETURN_IF_CUPTI_ERROR(expr)                                          \
  do {                                                                       \
    const CUptiResult cupti_status_ = (expr);                                \
    if (ABSL_PREDICT_FALSE(cupti_status_ != CUPTI_SUCCESS)) {                \
      const char *errstr = "";                                               \
      cupti_interface_->GetResultString(cupti_status_, &errstr);             \
      if (errstr == nullptr) errstr = "<unknown>";                           \
      LOG(ERROR) << "function " << #expr << " failed with error " << errstr; \
      if (cupti_status_ == CUPTI_ERROR_INSUFFICIENT_PRIVILEGES) {            \
        return errors::PermissionDenied("CUPTI need root access!");          \
      } else {                                                               \
        return errors::Internal("CUPTI call error: ", errstr);               \
      }                                                                      \
    }                                                                        \
  } while (false)
133 
Bytes2D(const CUDA_MEMCPY2D * p)134 size_t Bytes2D(const CUDA_MEMCPY2D *p) { return p->Height * p->WidthInBytes; }
135 
Bytes3D(const CUDA_MEMCPY3D * p)136 size_t Bytes3D(const CUDA_MEMCPY3D *p) {
137   return p->Depth * p->Height * p->WidthInBytes;
138 }
139 
140 template <typename CudaMemcpy>
MemcpyKind(const CudaMemcpy * p)141 CuptiTracerEventType MemcpyKind(const CudaMemcpy *p) {
142   if (p->srcMemoryType == CU_MEMORYTYPE_HOST &&
143       p->dstMemoryType == CU_MEMORYTYPE_DEVICE) {
144     return CuptiTracerEventType::MemcpyH2D;
145   }
146   if (p->srcMemoryType == CU_MEMORYTYPE_DEVICE &&
147       p->dstMemoryType == CU_MEMORYTYPE_HOST) {
148     return CuptiTracerEventType::MemcpyD2H;
149   }
150   if (p->srcMemoryType == CU_MEMORYTYPE_DEVICE &&
151       p->dstMemoryType == CU_MEMORYTYPE_DEVICE) {
152     return CuptiTracerEventType::MemcpyD2D;
153   }
154   return CuptiTracerEventType::Unsupported;
155 }
156 
157 std::tuple<size_t /*bytes*/, CuptiTracerEventType, bool /*async*/>
DecodeDriverMemcpy(CUpti_CallbackId cbid,const void * params)158 DecodeDriverMemcpy(CUpti_CallbackId cbid, const void *params) {
159   switch (cbid) {
160     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2: {
161       const auto *p = reinterpret_cast<const cuMemcpyHtoD_v2_params *>(params);
162       return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyH2D,
163                              false);
164     }
165     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2: {
166       const auto *p =
167           reinterpret_cast<const cuMemcpyHtoDAsync_v2_params *>(params);
168       return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyH2D,
169                              true);
170     }
171     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2: {
172       const auto *p = reinterpret_cast<const cuMemcpyDtoH_v2_params *>(params);
173       return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyD2H,
174                              false);
175     }
176     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2: {
177       const auto *p =
178           reinterpret_cast<const cuMemcpyDtoHAsync_v2_params *>(params);
179       return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyD2H,
180                              true);
181     }
182     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2: {
183       const auto *p = reinterpret_cast<const cuMemcpyDtoD_v2_params *>(params);
184       return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyD2D,
185                              false);
186     }
187     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2: {
188       const auto *p =
189           reinterpret_cast<const cuMemcpyDtoDAsync_v2_params *>(params);
190       return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyD2D,
191                              true);
192     }
193     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy: {
194       const auto *p = reinterpret_cast<const cuMemcpy_params *>(params);
195       return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyOther,
196                              false);
197     }
198     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync: {
199       const auto *p = reinterpret_cast<const cuMemcpyAsync_params *>(params);
200       return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyOther,
201                              true);
202     }
203     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2: {
204       const auto *p = reinterpret_cast<const cuMemcpy2D_v2_params *>(params);
205       return std::make_tuple(Bytes2D(p->pCopy), MemcpyKind(p->pCopy), false);
206     }
207     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2: {
208       const auto *p =
209           reinterpret_cast<const cuMemcpy2DAsync_v2_params *>(params);
210       return std::make_tuple(Bytes2D(p->pCopy), MemcpyKind(p->pCopy), true);
211     }
212     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2: {
213       const auto *p = reinterpret_cast<const cuMemcpy3D_v2_params *>(params);
214       return std::make_tuple(Bytes3D(p->pCopy), MemcpyKind(p->pCopy), true);
215     }
216     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2: {
217       const auto *p =
218           reinterpret_cast<const cuMemcpy3DAsync_v2_params *>(params);
219       return std::make_tuple(Bytes3D(p->pCopy), MemcpyKind(p->pCopy), true);
220     }
221     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer: {
222       const auto *p2p_params =
223           reinterpret_cast<const cuMemcpyPeer_params *>(params);
224       return std::make_tuple(p2p_params->ByteCount,
225                              CuptiTracerEventType::MemcpyP2P, false);
226     }
227     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync: {
228       const auto *p2p_params =
229           reinterpret_cast<const cuMemcpyPeerAsync_params *>(params);
230       return std::make_tuple(p2p_params->ByteCount,
231                              CuptiTracerEventType::MemcpyP2P, true);
232     }
233     default: {
234       LOG(ERROR) << "Unsupported memcpy activity observed: " << cbid;
235       return std::make_tuple(0, CuptiTracerEventType::Unsupported, false);
236     }
237   }
238 }
239 
240 std::tuple<size_t /*bytes*/, CuptiTracerEventType, bool /*async*/>
DecodeDriverMemset(CUpti_CallbackId cbid,const void * params)241 DecodeDriverMemset(CUpti_CallbackId cbid, const void *params) {
242   switch (cbid) {
243     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD8_v2: {
244       const auto *p = reinterpret_cast<const cuMemsetD8_v2_params *>(params);
245       return std::make_tuple(p->N, CuptiTracerEventType::Memset, false);
246     }
247     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD16_v2: {
248       const auto *p = reinterpret_cast<const cuMemsetD16_v2_params *>(params);
249       return std::make_tuple(p->N, CuptiTracerEventType::Memset, false);
250     }
251     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD32_v2: {
252       const auto *p = reinterpret_cast<const cuMemsetD32_v2_params *>(params);
253       return std::make_tuple(p->N, CuptiTracerEventType::Memset, false);
254     }
255     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8_v2: {
256       const auto *p = reinterpret_cast<const cuMemsetD2D8_v2_params *>(params);
257       return std::make_tuple(p->dstPitch * p->Height,
258                              CuptiTracerEventType::Memset, false);
259     }
260     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16_v2: {
261       const auto *p = reinterpret_cast<const cuMemsetD2D16_v2_params *>(params);
262       return std::make_tuple(p->dstPitch * p->Height,
263                              CuptiTracerEventType::Memset, false);
264     }
265     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32_v2: {
266       const auto *p = reinterpret_cast<const cuMemsetD2D32_v2_params *>(params);
267       return std::make_tuple(p->dstPitch * p->Height,
268                              CuptiTracerEventType::Memset, false);
269     }
270     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD8Async: {
271       const auto *p = reinterpret_cast<const cuMemsetD8Async_params *>(params);
272       return std::make_tuple(p->N, CuptiTracerEventType::Memset, true);
273     }
274     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD16Async: {
275       const auto *p = reinterpret_cast<const cuMemsetD16Async_params *>(params);
276       return std::make_tuple(p->N, CuptiTracerEventType::Memset, true);
277     }
278     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD32Async: {
279       const auto *p = reinterpret_cast<const cuMemsetD32Async_params *>(params);
280       return std::make_tuple(p->N, CuptiTracerEventType::Memset, true);
281     }
282     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8Async: {
283       const auto *p =
284           reinterpret_cast<const cuMemsetD2D8Async_params *>(params);
285       return std::make_tuple(p->dstPitch * p->Height,
286                              CuptiTracerEventType::Memset, true);
287     }
288     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16Async: {
289       const auto *p =
290           reinterpret_cast<const cuMemsetD2D16Async_params *>(params);
291       return std::make_tuple(p->dstPitch * p->Height,
292                              CuptiTracerEventType::Memset, true);
293     }
294     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32Async: {
295       const auto *p =
296           reinterpret_cast<const cuMemsetD2D32Async_params *>(params);
297       return std::make_tuple(p->dstPitch * p->Height,
298                              CuptiTracerEventType::Memset, true);
299     }
300     default: {
301       LOG(ERROR) << "Unsupported memset activity observed: " << cbid;
302       return std::make_tuple(0, CuptiTracerEventType::Unsupported, false);
303     }
304   }
305 }
306 
307 // Cupti callback corresponding to a driver or runtime API. This global function
308 // is invoked twice for each API: at entry and at exit. The cbdata
309 // parameter is guaranteed by Cupti to be thread-safe. Most invocations are
310 // dropped to the floor and entry/exit is tracked for the APIs we deem
311 // performance-relevant.
ApiCallback(void * user_data,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)312 void CUPTIAPI ApiCallback(void *user_data, CUpti_CallbackDomain domain,
313                           CUpti_CallbackId cbid,
314                           const CUpti_CallbackData *cbdata) {
315   CuptiTracer *tracer = reinterpret_cast<CuptiTracer *>(user_data);
316   tracer->HandleCallback(domain, cbid, cbdata).IgnoreError();
317 }
318 
319 // Callback which is invoked when an empty buffer is requested by CUPTI.
320 // Allocates an empty aligned-memory buffer. The buffer is used by CUPTI as a
321 // ring buffer where device maintains activity profiles that have been
322 // collected.
RequestCuptiActivityBuffer(uint8_t ** buffer,size_t * size,size_t * maxNumRecords)323 void CUPTIAPI RequestCuptiActivityBuffer(uint8_t **buffer, size_t *size,
324                                          size_t *maxNumRecords) {
325   CuptiTracer::GetCuptiTracerSingleton()->RequestActivityBuffer(buffer, size);
326   VLOG(3) << "Requested CUPTI Buffer, buffer=" << std::hex
327           << reinterpret_cast<uintptr_t>(*buffer) << std::dec
328           << " size=" << *size;
329   // Request CUPTI to fill as many records as possible in the buffer.
330   *maxNumRecords = 0;
331 }
332 
333 // Callback which is invoked when a buffer containing activity records is
334 // available from CUPTI. Processes the buffer after reading activity records
335 // from it.
ProcessCuptiActivityBuffer(CUcontext context,uint32_t stream_id,uint8_t * buffer,size_t size,size_t valid_size)336 void CUPTIAPI ProcessCuptiActivityBuffer(CUcontext context, uint32_t stream_id,
337                                          uint8_t *buffer, size_t size,
338                                          size_t valid_size) {
339   VLOG(3) << "Processing CUPTI Buffer, buffer:" << std::hex
340           << reinterpret_cast<uintptr_t>(buffer) << std::dec
341           << " size: " << size << " valid_size: " << valid_size;
342   VLOG(3) << "Activity profile for stream " << stream_id;
343 
344   Status status = CuptiTracer::GetCuptiTracerSingleton()->ProcessActivityBuffer(
345       context, stream_id, buffer, valid_size);
346   if (!status.ok()) {
347     LOG(ERROR) << status;
348   }
349 }
350 
AddKernelEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)351 void AddKernelEventUponApiExit(CuptiTraceCollector *collector, uint32 device_id,
352                                const CUpti_CallbackData *cbdata,
353                                uint64 start_time, uint64 end_time) {
354   CuptiTracerEvent event{};
355   event.type = CuptiTracerEventType::Kernel;
356   event.source = CuptiTracerEventSource::DriverCallback;
357   event.name = cbdata->symbolName ? cbdata->symbolName : cbdata->functionName;
358   event.start_time_ns = start_time;
359   event.end_time_ns = end_time;
360   event.thread_id = Env::Default()->GetCurrentThreadId();
361   event.device_id = device_id;
362   event.context_id = cbdata->contextUid;
363   event.correlation_id = cbdata->correlationId;
364   VLOG(3) << "Cuda Kernel launch API exit. name=" << event.name;
365   collector->AddEvent(std::move(event));
366 }
367 
368 // Performs the actual callback for both normal and P2P memcpy operations.
PopulateMemcpyCallbackEvent(CuptiTracerEventType type,const CUpti_CallbackData * cbdata,size_t num_bytes,uint32 src_device,uint32 dst_device,bool async,uint64 start_time,uint64 end_time)369 CuptiTracerEvent PopulateMemcpyCallbackEvent(
370     CuptiTracerEventType type, const CUpti_CallbackData *cbdata,
371     size_t num_bytes, uint32 src_device, uint32 dst_device, bool async,
372     uint64 start_time, uint64 end_time) {
373   CuptiTracerEvent event{};
374   event.type = type;
375   event.source = CuptiTracerEventSource::DriverCallback;
376   event.start_time_ns = start_time;
377   event.end_time_ns = end_time;
378   event.thread_id = Env::Default()->GetCurrentThreadId();
379   event.device_id = src_device;
380   event.context_id = cbdata->contextUid;
381   event.correlation_id = cbdata->correlationId;
382   event.memcpy_info.num_bytes = num_bytes;
383   event.memcpy_info.destination = dst_device;
384   event.memcpy_info.async = async;
385   // These are not populated during callback for API activities.
386   event.memcpy_info.copy_kind = CUPTI_ACTIVITY_MEMCPY_KIND_UNKNOWN;
387   event.memcpy_info.dst_mem_kind = CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN;
388   event.memcpy_info.src_mem_kind = CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN;
389   return event;
390 }
391 
AddNormalMemcpyEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)392 void AddNormalMemcpyEventUponApiExit(CuptiTraceCollector *collector,
393                                      uint32 device_id, CUpti_CallbackId cbid,
394                                      const CUpti_CallbackData *cbdata,
395                                      uint64 start_time, uint64 end_time) {
396   size_t num_bytes;
397   CuptiTracerEventType type;
398   bool async;
399   std::tie(num_bytes, type, async) =
400       DecodeDriverMemcpy(cbid, cbdata->functionParams);
401 
402   VLOG(3) << "Cuda Memcpy API exit. sz=" << num_bytes;
403   CuptiTracerEvent event =
404       PopulateMemcpyCallbackEvent(type, cbdata, num_bytes, device_id, device_id,
405                                   async, start_time, end_time);
406   collector->AddEvent(std::move(event));
407 }
408 
AddCuMemsetEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)409 void AddCuMemsetEventUponApiExit(CuptiTraceCollector *collector,
410                                  uint32 device_id, CUpti_CallbackId cbid,
411                                  const CUpti_CallbackData *cbdata,
412                                  uint64 start_time, uint64 end_time) {
413   // We are casting all variants of cuMemset to cuMemsetD8 for accessing the
414   // first member attribute, a CUdeviceptr.
415   const auto *params =
416       static_cast<const cuMemsetD8_v2_params *>(cbdata->functionParams);
417   size_t num_bytes;
418   bool async;
419   CuptiTracerEventType type;
420   std::tie(num_bytes, type, async) =
421       DecodeDriverMemset(cbid, cbdata->functionParams);
422 
423   CuptiTracerEvent event{};
424   event.type = type;
425   event.source = CuptiTracerEventSource::DriverCallback;
426   event.start_time_ns = start_time;
427   event.end_time_ns = end_time;
428   event.thread_id = Env::Default()->GetCurrentThreadId();
429   event.device_id = device_id;
430   event.context_id = cbdata->contextUid;
431   event.correlation_id = cbdata->correlationId;
432   event.memset_info.num_bytes = num_bytes;
433   // memset_info.kind cannot be determined from API.
434   event.memset_info.async = async;
435   VLOG(3) << "Cuda Memset API exit."
436           << " dptr=" << reinterpret_cast<void *>(params->dstDevice)
437           << " sz=" << num_bytes;
438   collector->AddEvent(std::move(event));
439 }
440 
AddP2PMemcpyEventUponApiExit(CuptiTraceCollector * collector,CuptiInterface * cupti_interface,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)441 void AddP2PMemcpyEventUponApiExit(CuptiTraceCollector *collector,
442                                   CuptiInterface *cupti_interface,
443                                   uint32 device_id, CUpti_CallbackId cbid,
444                                   const CUpti_CallbackData *cbdata,
445                                   uint64 start_time, uint64 end_time) {
446   size_t num_bytes;
447   CuptiTracerEventType type;
448   bool async;
449   std::tie(num_bytes, type, async) =
450       DecodeDriverMemcpy(cbid, cbdata->functionParams);
451 
452   uint32 dst_device = -1, src_device = -1;
453   const auto *p2p_params =
454       static_cast<const cuMemcpyPeer_params *>(cbdata->functionParams);
455   cupti_interface->GetDeviceId(p2p_params->srcContext, &src_device);
456   cupti_interface->GetDeviceId(p2p_params->dstContext, &dst_device);
457   VLOG(3) << "Cuda P2P Memcpy API exit, src: " << src_device
458           << " dst: " << dst_device << " size:" << num_bytes;
459   CuptiTracerEvent event =
460       PopulateMemcpyCallbackEvent(type, cbdata, num_bytes, src_device,
461                                   dst_device, async, start_time, end_time);
462   collector->AddEvent(std::move(event));
463 }
464 
AddCuMemAllocEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)465 void AddCuMemAllocEventUponApiExit(CuptiTraceCollector *collector,
466                                    uint32 device_id, CUpti_CallbackId cbid,
467                                    const CUpti_CallbackData *cbdata,
468                                    uint64 start_time, uint64 end_time) {
469   const auto *params =
470       static_cast<const cuMemAlloc_v2_params *>(cbdata->functionParams);
471   CuptiTracerEvent event{};
472   event.type = CuptiTracerEventType::MemoryAlloc;
473   event.source = CuptiTracerEventSource::DriverCallback;
474   event.name = cbdata->functionName;
475   event.start_time_ns = start_time;
476   event.end_time_ns = end_time;
477   event.thread_id = Env::Default()->GetCurrentThreadId();
478   event.device_id = device_id;
479   event.context_id = cbdata->contextUid;
480   event.correlation_id = cbdata->correlationId;
481   event.memalloc_info.num_bytes = params->bytesize;
482   VLOG(3) << "Cuda MemAlloc API exit."
483           << " dptr=" << reinterpret_cast<void *>(*params->dptr)
484           << " sz=" << params->bytesize;
485   collector->AddEvent(std::move(event));
486 }
487 
AddCuMemAllocPitchEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)488 void AddCuMemAllocPitchEventUponApiExit(CuptiTraceCollector *collector,
489                                         uint32 device_id, CUpti_CallbackId cbid,
490                                         const CUpti_CallbackData *cbdata,
491                                         uint64 start_time, uint64 end_time) {
492   const auto *params =
493       static_cast<const cuMemAllocPitch_v2_params *>(cbdata->functionParams);
494   CuptiTracerEvent event{};
495   event.type = CuptiTracerEventType::MemoryAlloc;
496   event.source = CuptiTracerEventSource::DriverCallback;
497   event.name = cbdata->functionName;
498   event.start_time_ns = start_time;
499   event.end_time_ns = end_time;
500   event.thread_id = Env::Default()->GetCurrentThreadId();
501   event.device_id = device_id;
502   event.context_id = cbdata->contextUid;
503   event.correlation_id = cbdata->correlationId;
504   const size_t size_in_bytes = *params->pPitch * params->Height;
505   event.memalloc_info.num_bytes = size_in_bytes;
506   VLOG(3) << "Cuda MemAllocPitch API exit."
507           << " dptr=" << reinterpret_cast<void *>(*params->dptr)
508           << " sz=" << size_in_bytes;
509   collector->AddEvent(std::move(event));
510 }
511 
AddCuMemFreeEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)512 void AddCuMemFreeEventUponApiExit(CuptiTraceCollector *collector,
513                                   uint32 device_id, CUpti_CallbackId cbid,
514                                   const CUpti_CallbackData *cbdata,
515                                   uint64 start_time, uint64 end_time) {
516   const auto *params =
517       static_cast<const cuMemFree_v2_params *>(cbdata->functionParams);
518   CuptiTracerEvent event{};
519   event.type = CuptiTracerEventType::MemoryFree;
520   event.source = CuptiTracerEventSource::DriverCallback;
521   event.name = cbdata->functionName;
522   event.start_time_ns = start_time;
523   event.end_time_ns = end_time;
524   event.thread_id = Env::Default()->GetCurrentThreadId();
525   event.device_id = device_id;
526   event.context_id = cbdata->contextUid;
527   event.correlation_id = cbdata->correlationId;
528   VLOG(3) << "Cuda MemFree API exit."
529           << " dptr=" << reinterpret_cast<void *>(params->dptr);
530   collector->AddEvent(std::move(event));
531 }
532 
AddGenericEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)533 void AddGenericEventUponApiExit(CuptiTraceCollector *collector,
534                                 uint32 device_id, CUpti_CallbackId cbid,
535                                 const CUpti_CallbackData *cbdata,
536                                 uint64 start_time, uint64 end_time) {
537   CuptiTracerEvent event{};
538   event.type = CuptiTracerEventType::Generic;
539   event.source = CuptiTracerEventSource::DriverCallback;
540   event.name = cbdata->functionName;
541   event.start_time_ns = start_time;
542   event.end_time_ns = end_time;
543   event.thread_id = Env::Default()->GetCurrentThreadId();
544   event.device_id = device_id;
545   event.context_id = cbdata->contextUid;
546   event.correlation_id = cbdata->correlationId;
547   VLOG(3) << "Observed generic API exit."
548           << " name=" << cbdata->functionName;
549   collector->AddEvent(std::move(event));
550 }
551 
AddKernelActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityKernel4 * kernel)552 void AddKernelActivityEvent(CuptiTraceCollector *collector,
553                             const CUpti_ActivityKernel4 *kernel) {
554   CuptiTracerEvent event{};
555   event.type = CuptiTracerEventType::Kernel;
556   event.source = CuptiTracerEventSource::Activity;
557   event.name = kernel->name;
558   event.start_time_ns = kernel->start;
559   event.end_time_ns = kernel->end;
560   event.device_id = kernel->deviceId;
561   event.context_id = kernel->contextId;
562   event.stream_id = kernel->streamId;
563   event.correlation_id = kernel->correlationId;
564   AnnotationMap::AnnotationInfo info = collector->annotation_map()->LookUp(
565       event.device_id, event.correlation_id);
566   event.annotation = info.annotation;
567   event.nvtx_range = info.nvtx_range;
568   event.kernel_info.registers_per_thread = kernel->registersPerThread;
569   event.kernel_info.static_shared_memory_usage = kernel->staticSharedMemory;
570   event.kernel_info.dynamic_shared_memory_usage = kernel->dynamicSharedMemory;
571   event.kernel_info.block_x = kernel->blockX;
572   event.kernel_info.block_y = kernel->blockY;
573   event.kernel_info.block_z = kernel->blockZ;
574   event.kernel_info.grid_x = kernel->gridX;
575   event.kernel_info.grid_y = kernel->gridY;
576   event.kernel_info.grid_z = kernel->gridZ;
577   collector->AddEvent(std::move(event));
578 }
579 
AddMemcpyActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityMemcpy * memcpy)580 void AddMemcpyActivityEvent(CuptiTraceCollector *collector,
581                             const CUpti_ActivityMemcpy *memcpy) {
582   CuptiTracerEvent event{};
583   switch (memcpy->copyKind) {
584     case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
585       event.type = CuptiTracerEventType::MemcpyH2D;
586       event.name = "MemcpyH2D";
587       break;
588     case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
589       event.type = CuptiTracerEventType::MemcpyD2H;
590       event.name = "MemcpyD2H";
591       break;
592     case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
593       event.type = CuptiTracerEventType::MemcpyD2D;
594       event.name = "MemcpyD2D";
595       break;
596     case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP:
597       event.type = CuptiTracerEventType::MemcpyP2P;
598       event.name = "MemcpyP2P";
599       break;
600     default:
601       event.type = CuptiTracerEventType::MemcpyOther;
602       event.name = "MemcpyOther";
603       break;
604   }
605 
606   event.source = CuptiTracerEventSource::Activity;
607   event.start_time_ns = memcpy->start;
608   event.end_time_ns = memcpy->end;
609   event.device_id = memcpy->deviceId;
610   event.context_id = memcpy->contextId;
611   event.stream_id = memcpy->streamId;
612   event.correlation_id = memcpy->correlationId;
613   AnnotationMap::AnnotationInfo info = collector->annotation_map()->LookUp(
614       event.device_id, event.correlation_id);
615   event.annotation = info.annotation;
616   event.memcpy_info.copy_kind = memcpy->copyKind;
617   event.memcpy_info.num_bytes = memcpy->bytes;
618   event.memcpy_info.destination = memcpy->deviceId;
619   event.memcpy_info.async = memcpy->flags & CUPTI_ACTIVITY_FLAG_MEMCPY_ASYNC;
620   event.memcpy_info.src_mem_kind = memcpy->srcKind;
621   event.memcpy_info.dst_mem_kind = memcpy->dstKind;
622   collector->AddEvent(std::move(event));
623 }
624 
625 // Invokes callback upon peer-2-peer memcpy between different GPU devices.
AddMemcpy2ActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityMemcpy2 * memcpy2)626 void AddMemcpy2ActivityEvent(CuptiTraceCollector *collector,
627                              const CUpti_ActivityMemcpy2 *memcpy2) {
628   CuptiTracerEvent event{};
629   event.type = CuptiTracerEventType::MemcpyP2P;
630   event.name = "MemcpyP2P";
631   event.source = CuptiTracerEventSource::Activity;
632   event.start_time_ns = memcpy2->start;
633   event.end_time_ns = memcpy2->end;
634   event.device_id = memcpy2->srcDeviceId;
635   event.context_id = memcpy2->contextId;
636   event.stream_id = memcpy2->streamId;
637   event.correlation_id = memcpy2->correlationId;
638   AnnotationMap::AnnotationInfo info = collector->annotation_map()->LookUp(
639       event.device_id, event.correlation_id);
640   event.annotation = info.annotation;
641   event.memcpy_info.copy_kind = CUPTI_ACTIVITY_MEMCPY_KIND_PTOP;
642   event.memcpy_info.num_bytes = memcpy2->bytes;
643   event.memcpy_info.destination = memcpy2->dstDeviceId;
644   event.memcpy_info.async = memcpy2->flags & CUPTI_ACTIVITY_FLAG_MEMCPY_ASYNC;
645   event.memcpy_info.src_mem_kind = memcpy2->srcKind;
646   event.memcpy_info.dst_mem_kind = memcpy2->dstKind;
647   collector->AddEvent(std::move(event));
648 }
649 
AddCuptiOverheadActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityOverhead * overhead)650 void AddCuptiOverheadActivityEvent(CuptiTraceCollector *collector,
651                                    const CUpti_ActivityOverhead *overhead) {
652   CuptiTracerEvent event{};
653   event.type = CuptiTracerEventType::Overhead;
654   event.name = getActivityOverheadKindString(overhead->overheadKind);
655   event.source = CuptiTracerEventSource::Activity;
656   event.start_time_ns = overhead->start;
657   event.end_time_ns = overhead->end;
658   // If the overhead is not related to a device, we assign it to device 0.
659   event.device_id = 0;
660   // NOTE: no correlation id.
661   switch (overhead->objectKind) {
662     case CUPTI_ACTIVITY_OBJECT_UNKNOWN:
663       // Don't know how to deal with such activities because of we need either
664       // attribute it to a GPU stream or a CPU thread.
665       return;
666 
667     case CUPTI_ACTIVITY_OBJECT_THREAD:
668     case CUPTI_ACTIVITY_OBJECT_PROCESS:
669       event.thread_id = overhead->objectId.pt.threadId;
670       break;
671     case CUPTI_ACTIVITY_OBJECT_STREAM:
672       event.stream_id = overhead->objectId.dcs.streamId;
673       TF_FALLTHROUGH_INTENDED;
674     case CUPTI_ACTIVITY_OBJECT_DEVICE:
675     case CUPTI_ACTIVITY_OBJECT_CONTEXT:
676       event.device_id = overhead->objectId.dcs.deviceId;
677       break;
678     default:
679       LOG(ERROR) << "Unexpected object kind: " << overhead->objectKind;
680       return;
681   }
682   collector->AddEvent(std::move(event));
683 }
684 
AddUnifiedMemoryActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityUnifiedMemoryCounter2 * record)685 void AddUnifiedMemoryActivityEvent(
686     CuptiTraceCollector *collector,
687     const CUpti_ActivityUnifiedMemoryCounter2 *record) {
688   VLOG(3) << "Cuda Unified Memory Activity, kind: " << record->counterKind
689           << " src: " << record->srcId << " dst: " << record->dstId;
690   CuptiTracerEvent event{};
691   event.type = CuptiTracerEventType::UnifiedMemory;
692   event.name = getActivityUnifiedMemoryKindString(record->counterKind);
693   event.source = CuptiTracerEventSource::Activity;
694   event.start_time_ns = record->start;
695   if (record->counterKind ==
696           CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT ||
697       record->counterKind ==
698           CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING ||
699       record->counterKind ==
700           CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP ||
701       record->end <= record->start) {
702     // If the end time is not valid, trim it so that it can be shown on the UI.
703     event.end_time_ns = record->start + 1;
704   } else {
705     event.end_time_ns = record->end;
706   }
707   event.device_id = record->srcId;
708   // NOTE: not context id and correlation id.
709 
710   // For visualization purpose, we assign a pseudo stream id for each
711   // record->counterKind of unified memory related events.
712   constexpr int kPseudoStreamId = 0x10000000;
713   event.stream_id = kPseudoStreamId + record->counterKind;
714   event.memcpy_info.copy_kind = CUPTI_ACTIVITY_MEMCPY_KIND_UNKNOWN;
715   // Check whether the activity is byte transfer.
716   if (record->counterKind ==
717           CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD ||
718       record->counterKind ==
719           CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH ||
720       record->counterKind ==
721           CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOD) {
722     event.memcpy_info.num_bytes = record->value;
723   } else {
724     event.memcpy_info.num_bytes = 0;
725   }
726   event.memcpy_info.destination = record->dstId;
727   event.memcpy_info.async = false;
728   collector->AddEvent(std::move(event));
729 }
730 
AddMemoryActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityMemory * memory)731 void AddMemoryActivityEvent(CuptiTraceCollector *collector,
732                             const CUpti_ActivityMemory *memory) {
733   CuptiTracerEvent event{};
734   event.name = absl::StrCat("Memory ", GetMemoryKindName(memory->memoryKind));
735   event.type = CuptiTracerEventType::MemoryResidency;
736   event.source = CuptiTracerEventSource::Activity;
737   event.start_time_ns = memory->start;
738   event.end_time_ns = std::max(memory->end, memory->start + 1);
739   event.device_id = memory->deviceId;
740   event.context_id = memory->contextId;
741   // Assign to default stream (0) so that event is included during Flush().
742   event.stream_id = 0;
743   event.memory_residency_info.num_bytes = memory->bytes;
744   event.memory_residency_info.mem_kind = memory->memoryKind;
745   event.memory_residency_info.address = memory->address;
746   VLOG(5) << "Cuda activity " << event.name
747           << " addr: " << reinterpret_cast<void *>(memory->address)
748           << " bytes: " << memory->bytes;
749   collector->AddEvent(std::move(event));
750 }
751 
AddMemsetActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityMemset * memset)752 void AddMemsetActivityEvent(CuptiTraceCollector *collector,
753                             const CUpti_ActivityMemset *memset) {
754   auto mem_kind = memset->memoryKind;
755   CuptiTracerEvent event{};
756   event.type = CuptiTracerEventType::Memset;
757   event.source = CuptiTracerEventSource::Activity;
758   event.name = absl::StrCat("Memset ", mem_kind);
759   event.start_time_ns = memset->start;
760   event.end_time_ns = std::max(memset->end, memset->start + 1);
761   event.device_id = memset->deviceId;
762   event.correlation_id = memset->correlationId;
763   event.context_id = memset->contextId;
764   event.stream_id = memset->streamId;
765   event.memset_info.num_bytes = memset->bytes;
766   event.memset_info.mem_kind = mem_kind;
767   event.memset_info.async = (memset->flags & CUPTI_ACTIVITY_FLAG_MEMSET_ASYNC);
768   VLOG(5) << "Cuda activity " << event.name << " bytes: " << memset->bytes
769           << " async: " << event.memset_info.async;
770   collector->AddEvent(std::move(event));
771 }
772 
AddSynchronizationActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivitySynchronization * sync)773 void AddSynchronizationActivityEvent(
774     CuptiTraceCollector *collector, const CUpti_ActivitySynchronization *sync) {
775   CuptiTracerEvent event{};
776   event.type = CuptiTracerEventType::Generic;
777   event.source = CuptiTracerEventSource::Activity;
778   switch (sync->type) {
779     case CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_EVENT_SYNCHRONIZE:
780       event.name = "cuEventSynchronize";
781       break;
782     case CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_WAIT_EVENT:
783       event.name = "cuStreamWaitEvent";
784       break;
785     case CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_SYNCHRONIZE:
786       event.name = "cuStreamSynchronize";
787       break;
788     case CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_CONTEXT_SYNCHRONIZE:
789       event.name = "cuCtxSynchronize";
790       break;
791     default:
792       event.name = "unknown synchronization event";
793       break;
794   }
795   event.start_time_ns = sync->start;
796   event.end_time_ns = std::max(sync->end, sync->start + 1);
797   event.correlation_id = sync->correlationId;
798   event.context_id = sync->contextId;
799   VLOG(5) << "Cuda activity " << event.name;
800   collector->AddEvent(std::move(event));
801 }
802 
803 // This hook uses cupti activity api to measure device side activities.
804 class CuptiDriverApiHookWithActivityApi : public CuptiDriverApiHook {
805  public:
CuptiDriverApiHookWithActivityApi(const CuptiTracerOptions & option,CuptiInterface * cupti_interface,CuptiTraceCollector * collector)806   CuptiDriverApiHookWithActivityApi(const CuptiTracerOptions &option,
807                                     CuptiInterface *cupti_interface,
808                                     CuptiTraceCollector *collector)
809       : option_(option),
810         cupti_interface_(cupti_interface),
811         collector_(collector) {}
812 
OnDriverApiEnter(int device_id,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)813   Status OnDriverApiEnter(int device_id, CUpti_CallbackDomain domain,
814                           CUpti_CallbackId cbid,
815                           const CUpti_CallbackData *cbdata) override {
816     // Stash away the current Cupti timestamp into cbdata.
817     *cbdata->correlationData =
818         option_.required_callback_api_events ? CuptiTracer::GetTimestamp() : 0;
819     return Status::OK();
820   }
OnDriverApiExit(int device_id,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)821   Status OnDriverApiExit(int device_id, CUpti_CallbackDomain domain,
822                          CUpti_CallbackId cbid,
823                          const CUpti_CallbackData *cbdata) override {
824     // If we are not collecting CPU events from Callback API, we can return now.
825     if (!option_.required_callback_api_events) {
826       return Status::OK();
827     }
828 
829     // Grab timestamp for API exit. API entry timestamp saved in cbdata.
830     uint64 end_tsc = CuptiTracer::GetTimestamp();
831     uint64 start_tsc = *cbdata->correlationData;
832     TrackContext(cbid, cbdata->context);
833     return AddDriverApiCallbackEvent(collector_, cupti_interface_, device_id,
834                                      start_tsc, end_tsc, domain, cbid, cbdata);
835   }
SyncAndFlush()836   Status SyncAndFlush() override {
837     if (option_.sync_devices_before_stop) {
838       CuptiApiTracingDisabler disabler;
839       absl::MutexLock lock(&mutex_);
840       for (auto &ctx : contexts_) {
841         cuCtxPushCurrent(ctx);
842         cuCtxSynchronize();  // Ignore error here for best effort.
843         CUcontext current;
844         cuCtxPopCurrent(&current);
845       }
846     }
847     return Status::OK();
848   }
849 
850  private:
TrackContext(CUpti_CallbackId cbid,CUcontext ctx)851   void TrackContext(CUpti_CallbackId cbid, CUcontext ctx) {
852     if (!option_.sync_devices_before_stop) return;
853     if (ctx == NULL) return;
854     absl::MutexLock lock(&mutex_);
855     if (cbid == CUPTI_DRIVER_TRACE_CBID_cuCtxDestroy_v2 ||
856         cbid == CUPTI_DRIVER_TRACE_CBID_cuCtxDestroy) {
857       contexts_.erase(ctx);
858     } else {
859       contexts_.emplace(ctx);
860     }
861   }
862 
863   const CuptiTracerOptions option_;
864   CuptiInterface *cupti_interface_;
865   CuptiTraceCollector *collector_;
866   absl::Mutex mutex_;
867   absl::flat_hash_set<CUcontext> contexts_ TF_GUARDED_BY(mutex_);
868 
869   TF_DISALLOW_COPY_AND_ASSIGN(CuptiDriverApiHookWithActivityApi);
870 };
871 
872 struct KernelRecord {
873   const char *kernel_name;
874   // TODO(csigg): cuStreamGetCtx introduced in CUDA 9.2 would allow us to only
875   // record the stream and infer the context during collection.
876   CUcontext context;
877   CUstream stream;
878   uint32 correlation_id;
879   CUevent start_event;
880   CUevent stop_event;
881   KernelDetails details;
882   uint64 start_timestamp;
883 };
884 
885 struct MemcpyRecord {
886   CuptiTracerEventType type;
887   size_t size_bytes;
888   CUcontext context;
889   CUstream stream;
890   uint32 correlation_id;
891   bool async;
892   CUevent start_event;
893   CUevent stop_event;
894   uint64 start_timestamp;
895 };
896 
CreateAndRecordEvent(CUevent * event,CUstream stream)897 Status CreateAndRecordEvent(CUevent *event, CUstream stream) {
898   CuptiApiTracingDisabler disabler;
899   TF_RETURN_IF_ERROR(ToStatus(cuEventCreate(event, CU_EVENT_DEFAULT)));
900   return ToStatus(cuEventRecord(*event, stream));
901 }
902 
#if CUDA_VERSION >= 10000
// Maintain and restore current thread's CUDA context.
// Note: cuStreamGetCtx only available after CUDA 9.2.
//
// On construction, looks up the context that owns `stream` and pushes it as
// the calling thread's current context; the destructor pops it again. If any
// lookup step fails, nothing is pushed and the corresponding accessor returns
// absl::nullopt.
class ScopedCudaContext {
 public:
  explicit ScopedCudaContext(CUstream stream) {
    CuptiApiTracingDisabler disabler;  // don't trace cuda call in this func.
    CUcontext context;
    if (cuStreamGetCtx(stream, &context) != CUDA_SUCCESS) return;
    context_ = context;
    uint32 device_ordinal;
    if (cuptiGetDeviceId(context, &device_ordinal) != CUPTI_SUCCESS) return;
    device_ordinal_ = device_ordinal;
    context_pushed_ = cuCtxPushCurrent(context) == CUDA_SUCCESS;
  }
  ~ScopedCudaContext() {
    if (!context_pushed_) return;
    CuptiApiTracingDisabler disabler;  // don't trace cuda call in this func.
    // Pop into a scratch variable so the stored context is left untouched.
    CUcontext popped;
    cuCtxPopCurrent(&popped);
  }

  // A copy would pop the context a second time, so copying is not allowed.
  ScopedCudaContext(const ScopedCudaContext &) = delete;
  ScopedCudaContext &operator=(const ScopedCudaContext &) = delete;

  // If successful, return the device ordinal of the relevant cuda stream.
  // Otherwise absl::nullopt;
  absl::optional<uint32> GetDeviceOrdinal() const { return device_ordinal_; }

  // If successful, return the cuda context of the relevant cuda stream.
  // Otherwise absl::nullopt;
  absl::optional<CUcontext> GetContext() const { return context_; }

 private:
  absl::optional<CUcontext> context_;
  absl::optional<uint32> device_ordinal_;
  // True only if the constructor's cuCtxPushCurrent succeeded.
  bool context_pushed_ = false;
};
#endif
939 
940 // Stores a series of kernel and memcpy records.
941 class CudaEventRecorder {
942  public:
CudaEventRecorder(CuptiInterface * cupti_interface,CuptiTraceCollector * collector,int ordinal)943   CudaEventRecorder(CuptiInterface *cupti_interface,
944                     CuptiTraceCollector *collector, int ordinal)
945       : cupti_interface_(cupti_interface),
946         collector_(collector),
947         ordinal_(ordinal) {
948     device_name_ = absl::StrCat("gpu ", ordinal);  // default.
949     CUdevice device;
950     if (cuDeviceGet(&device, ordinal) == CUDA_SUCCESS) {
951       char name[100];
952       if (cuDeviceGetName(name, sizeof(name), device) == CUDA_SUCCESS) {
953         device_name_ = name;
954       }
955     }
956   }
957 
958   // Registers the start of a kernel launch. The returned index should be passed
959   // to StopKernel() after the kernel launch has completed.
960   template <typename T>
StartKernel(const char * kernel_name,CUcontext context,uint32 correlation_id,const T * params)961   size_t StartKernel(const char *kernel_name, CUcontext context,
962                      uint32 correlation_id, const T *params) {
963     CUstream stream = params->hStream;
964     KernelRecord record = {kernel_name, context, stream, correlation_id};
965     record.details.registers_per_thread = 0;  // unknown.
966     record.details.static_shared_memory_usage = params->sharedMemBytes;
967     record.details.dynamic_shared_memory_usage = 0;  // unknown
968     record.details.block_x = params->blockDimX;
969     record.details.block_y = params->blockDimY;
970     record.details.block_z = params->blockDimZ;
971     record.details.grid_x = params->gridDimX;
972     record.details.grid_y = params->gridDimY;
973     record.details.grid_z = params->gridDimZ;
974     record.start_timestamp = CuptiTracer::GetTimestamp();
975     LogIfError(CreateAndRecordEvent(&record.start_event, stream));
976     absl::MutexLock lock(&mutex_);
977     if (stopped_) return -1;
978     kernel_records_.push_back(record);
979     return kernel_records_.size() - 1;
980   }
StopKernel(size_t index)981   uint64 StopKernel(size_t index) {
982     absl::MutexLock lock(&mutex_);
983     if (index >= kernel_records_.size()) return 0;
984     auto &record = kernel_records_[index];
985     LogIfError(CreateAndRecordEvent(&record.stop_event, record.stream));
986     return record.start_timestamp;
987   }
988 
989   // Registers the start of a copy operation. The returned index should be
990   // passed to StopMemcpy() after the memcpy has completed.
StartMemcpy(CuptiTracerEventType type,size_t size_bytes,CUcontext context,CUstream stream,uint32 correlation_id,bool async)991   size_t StartMemcpy(CuptiTracerEventType type, size_t size_bytes,
992                      CUcontext context, CUstream stream, uint32 correlation_id,
993                      bool async) {
994     MemcpyRecord record = {type,   size_bytes,     context,
995                            stream, correlation_id, async};
996     record.start_timestamp = CuptiTracer::GetTimestamp();
997     LogIfError(CreateAndRecordEvent(&record.start_event, stream));
998     absl::MutexLock lock(&mutex_);
999     if (stopped_) return -1;
1000     memcpy_records_.push_back(record);
1001     return memcpy_records_.size() - 1;
1002   }
StopMemcpy(size_t index)1003   uint64 StopMemcpy(size_t index) {
1004     absl::MutexLock lock(&mutex_);
1005     if (index >= memcpy_records_.size()) return 0;
1006     auto &record = memcpy_records_[index];
1007     LogIfError(CreateAndRecordEvent(&record.stop_event, record.stream));
1008     return record.start_timestamp;
1009   }
1010 
Stop()1011   Status Stop() {
1012     {
1013       absl::MutexLock lock(&mutex_);
1014       stopped_ = true;
1015       LOG(INFO) << "Collecting " << kernel_records_.size()
1016                 << " kernel records, " << memcpy_records_.size()
1017                 << " memcpy records.";
1018 
1019       // Gather all profiled streams and contexts.
1020       for (const auto &record : kernel_records_) {
1021         TF_RETURN_IF_ERROR(
1022             AddStreamInfo(record.context, record.stream, "Kernel"));
1023       }
1024       for (const auto &record : memcpy_records_) {
1025         TF_RETURN_IF_ERROR(AddStreamInfo(record.context, record.stream,
1026                                          GetTraceEventTypeName(record.type)));
1027       }
1028     }
1029 
1030     // Synchronize all contexts, record end events, synchronize again.
1031     // This scheme is an unreliable measure to associate a event with the wall
1032     // time. There are chances that other threads might enque kernels which
1033     // delay the second synchronization.
1034     TF_RETURN_IF_ERROR(Synchronize());
1035     for (auto &pair : context_infos_) {
1036       TF_RETURN_IF_ERROR(ToStatus(cuCtxSetCurrent(pair.first)));
1037       TF_RETURN_IF_ERROR(CreateAndRecordEvent(&pair.second.end_event, nullptr));
1038     }
1039 
1040     TF_RETURN_IF_ERROR(Synchronize());
1041     end_walltime_us_ = Env::Default()->NowMicros();
1042     return Status::OK();
1043   }
1044 
Flush(AnnotationMap * annotation_map)1045   Status Flush(AnnotationMap *annotation_map) {
1046     auto kernel_records = ConsumeKernelRecords();
1047     auto memcpy_records = ConsumeMemcpyRecords();
1048     for (const auto &record : kernel_records) {
1049       TF_RETURN_IF_ERROR(SaveRecord(record, annotation_map));
1050     }
1051     for (const auto &record : memcpy_records) {
1052       TF_RETURN_IF_ERROR(SaveRecord(record, annotation_map));
1053     }
1054     return Status::OK();
1055   }
1056 
ConsumeKernelRecords()1057   std::vector<KernelRecord> ConsumeKernelRecords() {
1058     absl::MutexLock lock(&mutex_);
1059     return std::move(kernel_records_);
1060   }
ConsumeMemcpyRecords()1061   std::vector<MemcpyRecord> ConsumeMemcpyRecords() {
1062     absl::MutexLock lock(&mutex_);
1063     return std::move(memcpy_records_);
1064   }
1065 
1066  private:
1067   struct ContextInfo {
1068     uint32 context_id = 0;
1069     int num_streams = 0;
1070     CUevent end_event;
1071   };
1072 
1073   struct StreamInfo {
1074     uint32 stream_id = 0;
1075     std::string name;
1076     int index;  // 0 is reserved for null stream.
1077     const ContextInfo *ctx_info;
1078   };
1079 
1080   // Synchronizes all contexts.
Synchronize() const1081   Status Synchronize() const {
1082     CuptiApiTracingDisabler disabler;
1083     for (const auto &pair : context_infos_) {
1084       TF_RETURN_IF_ERROR(ToStatus(cuCtxSetCurrent(pair.first)));
1085       TF_RETURN_IF_ERROR(ToStatus(cuCtxSynchronize()));
1086     }
1087     return Status::OK();
1088   }
1089 
1090   // Returns element from context_infos_, adding it if not yet present.
GetContextInfo(CUcontext context,ContextInfo ** ctx_info_ptr)1091   Status GetContextInfo(CUcontext context, ContextInfo **ctx_info_ptr) {
1092     auto it = context_infos_.find(context);
1093 
1094     if (it == context_infos_.end()) {
1095       uint32 context_id = 0;
1096       RETURN_IF_CUPTI_ERROR(
1097           cupti_interface_->GetContextId(context, &context_id));
1098       ContextInfo ctx_info = {context_id};
1099       it = context_infos_.emplace(context, ctx_info).first;
1100     }
1101 
1102     *ctx_info_ptr = &it->second;
1103     return Status::OK();
1104   }
1105 
1106   // Adds element to stream_infos_ if not yet present. If present, clear name
1107   // if it doesn't match parameter.
AddStreamInfo(CUcontext context,CUstream stream,absl::string_view name)1108   Status AddStreamInfo(CUcontext context, CUstream stream,
1109                        absl::string_view name) {
1110     StreamKey key(context, stream);
1111     auto it = stream_infos_.find(key);
1112     if (it != stream_infos_.end()) {
1113       if (it->second.name != name) {
1114         it->second.name.clear();  // Stream with inconsistent names, clear it.
1115       }
1116       return Status::OK();
1117     }
1118 
1119     ContextInfo *ctx_info;
1120     TF_RETURN_IF_ERROR(GetContextInfo(context, &ctx_info));
1121     int index = stream ? ++ctx_info->num_streams : 0;
1122     uint32 stream_id = 0;
1123 #if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
1124     RETURN_IF_CUPTI_ERROR(
1125         cupti_interface_->GetStreamIdEx(context, stream, 1, &stream_id));
1126 #else
1127     RETURN_IF_CUPTI_ERROR(
1128         cupti_interface_->GetStreamIdEx(context, stream, 0, &stream_id));
1129 #endif
1130 
1131     StreamInfo stream_info = {stream_id, static_cast<std::string>(name), index,
1132                               ctx_info};
1133     stream_infos_.emplace(key, stream_info);
1134     return Status::OK();
1135   }
1136 
1137   // Returns time in microseconds between events recorded on the GPU.
GetElapsedTimeUs(CUevent start,CUevent stop)1138   static uint64_t GetElapsedTimeUs(CUevent start, CUevent stop) {
1139     CuptiApiTracingDisabler disabler;
1140     float elapsed_ms = 0.0f;
1141     LogIfError(ToStatus(cuEventElapsedTime(&elapsed_ms, start, stop)));
1142     return static_cast<uint64>(
1143         std::llroundf(1000 * std::max(elapsed_ms, 0.0f)));
1144   }
1145 
SaveRecord(const KernelRecord & record,AnnotationMap * annotation_map) const1146   Status SaveRecord(const KernelRecord &record,
1147                     AnnotationMap *annotation_map) const {
1148     if (!record.start_event || !record.stop_event) {
1149       return Status::OK();
1150     }
1151     const auto &stream_info =
1152         stream_infos_.at(StreamKey(record.context, record.stream));
1153     auto start_us =
1154         GetElapsedTimeUs(record.start_event, stream_info.ctx_info->end_event);
1155     auto elapsed_us = GetElapsedTimeUs(record.start_event, record.stop_event);
1156 
1157     std::string annotation;
1158 
1159     CuptiTracerEvent event{};
1160     event.type = CuptiTracerEventType::Kernel;
1161     event.source = CuptiTracerEventSource::Activity;  // on gpu device.
1162     event.name = record.kernel_name;
1163     event.start_time_ns = (end_walltime_us_ - start_us) * 1000;
1164     event.end_time_ns = event.start_time_ns + elapsed_us * 1000;
1165     event.device_id = ordinal_;
1166     event.context_id = stream_info.ctx_info->context_id;
1167     event.stream_id = stream_info.stream_id;
1168     event.correlation_id = record.correlation_id;
1169     AnnotationMap::AnnotationInfo info = collector_->annotation_map()->LookUp(
1170         event.device_id, event.correlation_id);
1171     event.annotation = info.annotation;
1172     event.kernel_info = record.details;
1173     collector_->AddEvent(std::move(event));
1174     return Status::OK();
1175   }
1176 
SaveRecord(const MemcpyRecord & record,AnnotationMap * annotation_map) const1177   Status SaveRecord(const MemcpyRecord &record,
1178                     AnnotationMap *annotation_map) const {
1179     if (!record.start_event || !record.stop_event) {
1180       return Status::OK();
1181     }
1182     const auto &stream_info =
1183         stream_infos_.at(StreamKey(record.context, record.stream));
1184     auto start_us =
1185         GetElapsedTimeUs(record.start_event, stream_info.ctx_info->end_event);
1186     auto elapsed_us = GetElapsedTimeUs(record.start_event, record.stop_event);
1187 
1188     CuptiTracerEvent event{};
1189     event.type = record.type;
1190     event.name = GetTraceEventTypeName(event.type);
1191     event.source = CuptiTracerEventSource::Activity;
1192     event.start_time_ns = (end_walltime_us_ - start_us) * 1000;
1193     event.end_time_ns = event.start_time_ns + elapsed_us * 1000;
1194     event.device_id = ordinal_;
1195     event.context_id = stream_info.ctx_info->context_id;
1196     event.stream_id = stream_info.stream_id;
1197     event.correlation_id = record.correlation_id;
1198     AnnotationMap::AnnotationInfo info = collector_->annotation_map()->LookUp(
1199         event.device_id, event.correlation_id);
1200     event.annotation = info.annotation;
1201     event.memcpy_info.num_bytes = record.size_bytes;
1202     // TODO: support MemcpyD2D where destination != source;
1203     event.memcpy_info.destination = ordinal_;
1204     event.memcpy_info.async = record.async;
1205     // TODO: set src_mem_kind and dst_mem_kind.
1206     collector_->AddEvent(std::move(event));
1207     return Status::OK();
1208   }
1209 
1210   absl::Mutex mutex_;
1211   bool stopped_ TF_GUARDED_BY(mutex_) = false;
1212   std::vector<KernelRecord> kernel_records_ TF_GUARDED_BY(mutex_);
1213   std::vector<MemcpyRecord> memcpy_records_ TF_GUARDED_BY(mutex_);
1214 
1215   CuptiInterface *cupti_interface_;
1216   CuptiTraceCollector *collector_;
1217   const int ordinal_;
1218   std::string device_name_;
1219   uint64 end_walltime_us_;
1220   // Include context in key to distinguish null streams.
1221   using StreamKey = std::pair<CUcontext, CUstream>;
1222 
1223   absl::node_hash_map<CUcontext, ContextInfo> context_infos_;
1224   absl::flat_hash_map<StreamKey, StreamInfo> stream_infos_;
1225 };
1226 
1227 // This hook uses cuda events to measure device side activities.
1228 class CuptiDriverApiHookWithCudaEvent : public CuptiDriverApiHook {
1229  public:
CuptiDriverApiHookWithCudaEvent(const CuptiTracerOptions & option,CuptiInterface * cupti_interface,CuptiTraceCollector * collector)1230   CuptiDriverApiHookWithCudaEvent(const CuptiTracerOptions &option,
1231                                   CuptiInterface *cupti_interface,
1232                                   CuptiTraceCollector *collector)
1233       : option_(option),
1234         cupti_interface_(cupti_interface),
1235         collector_(collector) {
1236     int num_gpus = CuptiTracer::NumGpus();
1237     cuda_event_recorders_.reserve(num_gpus);
1238     for (int i = 0; i < num_gpus; ++i) {
1239       cuda_event_recorders_.emplace_back(
1240           absl::make_unique<CudaEventRecorder>(cupti_interface, collector, i));
1241     }
1242   }
~CuptiDriverApiHookWithCudaEvent()1243   ~CuptiDriverApiHookWithCudaEvent() {
1244     for (auto *callback_context : callback_contexts_) delete callback_context;
1245   }
1246 
OnDriverApiEnter(int device_id,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)1247   Status OnDriverApiEnter(int device_id, CUpti_CallbackDomain domain,
1248                           CUpti_CallbackId cbid,
1249                           const CUpti_CallbackData *cbdata) override {
1250     auto *recorder = cuda_event_recorders_[device_id].get();
1251     switch (cbid) {
1252       case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel: {
1253         DCHECK_NE(cbdata->symbolName, nullptr);
1254         const auto *params =
1255             static_cast<const cuLaunchKernel_params *>(cbdata->functionParams);
1256         *cbdata->correlationData = recorder->StartKernel<cuLaunchKernel_params>(
1257             cbdata->symbolName, cbdata->context, cbdata->correlationId, params);
1258         break;
1259       }
1260       case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel: {
1261         DCHECK_NE(cbdata->symbolName, nullptr);
1262         const auto *params =
1263             static_cast<const cuLaunchCooperativeKernel_params *>(
1264                 cbdata->functionParams);
1265         *cbdata->correlationData =
1266             recorder->StartKernel<cuLaunchCooperativeKernel_params>(
1267                 cbdata->symbolName, cbdata->context, cbdata->correlationId,
1268                 params);
1269         break;
1270       }
1271       case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice: {
1272 #if CUDA_VERSION >= 10000
1273         const auto *params =
1274             static_cast<const cuLaunchCooperativeKernelMultiDevice_params *>(
1275                 cbdata->functionParams);
1276         std::vector<uint32> record_indices;
1277         record_indices.reserve(params->numDevices);
1278         *cbdata->correlationData = -1;  // Invalid value.
1279         const auto &annotation = AnnotationStack::Get();
1280         for (int i = 0; i < params->numDevices; ++i) {
1281           CUstream stream = params->launchParamsList[i].hStream;
1282           ScopedCudaContext scoped_cuda_context(stream);
1283           auto dev_id = scoped_cuda_context.GetDeviceOrdinal();
1284           auto context = scoped_cuda_context.GetContext();
1285           if (!dev_id) return errors::Internal("Invalid CUDA stream");
1286           // Because annotation are per device, therefore we need to populate
1287           // annotation for each device involved.
1288           collector_->annotation_map()->Add(*dev_id, cbdata->correlationId,
1289                                             annotation, "");
1290           record_indices.push_back(
1291               cuda_event_recorders_[*dev_id]->StartKernel<CUDA_LAUNCH_PARAMS>(
1292                   "CooperativeKernelMultiDevice", *context,
1293                   cbdata->correlationId, &(params->launchParamsList[i])));
1294         }
1295         auto *callback_context =
1296             new CuptiApiCallbackContext(std::move(record_indices));
1297         callback_contexts_.insert(callback_context);
1298         *cbdata->correlationData = reinterpret_cast<uint64>(callback_context);
1299 #else
1300         VLOG(1) << "Unhandled cuLaunchCooperativeKernelMultiDevice.";
1301 #endif
1302       } break;
1303       case CUPTI_DRIVER_TRACE_CBID_cuMemcpy: {
1304         const auto *params =
1305             static_cast<const cuMemcpy_params *>(cbdata->functionParams);
1306         StartMemcpy<cuMemcpy_params>(GetMemcpyType(params->src, params->dst),
1307                                      cbdata, recorder);
1308         break;
1309       }
1310       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync: {
1311         const auto *params =
1312             static_cast<const cuMemcpyAsync_params *>(cbdata->functionParams);
1313         StartMemcpyAsync<cuMemcpyAsync_params>(
1314             GetMemcpyType(params->src, params->dst), cbdata, recorder);
1315         break;
1316       }
1317       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2:
1318         StartMemcpy<cuMemcpyHtoD_v2_params>(CuptiTracerEventType::MemcpyH2D,
1319                                             cbdata, recorder);
1320         break;
1321       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2:
1322         StartMemcpyAsync<cuMemcpyHtoDAsync_v2_params>(
1323             CuptiTracerEventType::MemcpyH2D, cbdata, recorder);
1324         break;
1325       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2:
1326         StartMemcpy<cuMemcpyDtoH_v2_params>(CuptiTracerEventType::MemcpyD2H,
1327                                             cbdata, recorder);
1328         break;
1329       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2:
1330         StartMemcpyAsync<cuMemcpyDtoHAsync_v2_params>(
1331             CuptiTracerEventType::MemcpyD2H, cbdata, recorder);
1332         break;
1333       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2:
1334         StartMemcpy<cuMemcpyDtoD_v2_params>(CuptiTracerEventType::MemcpyD2D,
1335                                             cbdata, recorder);
1336         break;
1337       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2:
1338         StartMemcpyAsync<cuMemcpyDtoDAsync_v2_params>(
1339             CuptiTracerEventType::MemcpyD2D, cbdata, recorder);
1340         break;
1341       default:
1342         VLOG(1) << "Unexpected callback id: " << cbid;
1343         break;
1344     }
1345     return Status::OK();
1346   }
1347 
OnDriverApiExit(int device_id,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)1348   Status OnDriverApiExit(int device_id, CUpti_CallbackDomain domain,
1349                          CUpti_CallbackId cbid,
1350                          const CUpti_CallbackData *cbdata) override {
1351     auto *recorder = cuda_event_recorders_[device_id].get();
1352     if (*cbdata->correlationData == static_cast<size_t>(-1))
1353       return Status::OK();
1354     uint64 start_tsc = 0;
1355     switch (cbid) {
1356       case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel:
1357       case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel:
1358         start_tsc = recorder->StopKernel(*cbdata->correlationData);
1359         break;
1360       case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice: {
1361 #if CUDA_VERSION >= 10000
1362         auto *callback_context = reinterpret_cast<CuptiApiCallbackContext *>(
1363             *cbdata->correlationData);
1364         callback_contexts_.erase(callback_context);
1365         auto record_indices = std::move(callback_context->record_indices);
1366         delete callback_context;
1367         const auto *params =
1368             static_cast<const cuLaunchCooperativeKernelMultiDevice_params *>(
1369                 cbdata->functionParams);
1370         if (record_indices.size() != params->numDevices)
1371           return errors::Internal("Invalid correlation data");
1372         for (int i = 0; i < params->numDevices; ++i) {
1373           CUstream stream = params->launchParamsList[i].hStream;
1374           ScopedCudaContext scoped_cuda_context(stream);
1375           auto dev_id = scoped_cuda_context.GetDeviceOrdinal();
1376           if (!dev_id) return errors::Internal("Invalid CUDA stream");
1377           start_tsc =
1378               cuda_event_recorders_[*dev_id]->StopKernel(record_indices[i]);
1379         }
1380 #endif
1381       } break;
1382       case CUPTI_DRIVER_TRACE_CBID_cuMemcpy:
1383       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync:
1384       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2:
1385       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2:
1386       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2:
1387       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2:
1388       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2:
1389       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2:
1390         start_tsc = recorder->StopMemcpy(*cbdata->correlationData);
1391         break;
1392       default:
1393         VLOG(1) << "Unexpected callback id: " << cbid;
1394         // TODO: figure out how to get start timestamp in this case.
1395         return Status::OK();
1396     }
1397     // If we are not collecting CPU events from Callback API, we can return now.
1398     if (!option_.required_callback_api_events) {
1399       return Status::OK();
1400     }
1401 
1402     // Grab timestamp for API exit. API entry timestamp saved in cbdata.
1403     uint64 end_tsc = CuptiTracer::GetTimestamp();
1404     return AddDriverApiCallbackEvent(collector_, cupti_interface_, device_id,
1405                                      start_tsc, end_tsc, domain, cbid, cbdata);
1406   }
SyncAndFlush()1407   Status SyncAndFlush() override {
1408     for (auto &recorder : cuda_event_recorders_) {
1409       TF_RETURN_IF_ERROR(recorder->Stop());
1410     }
1411     for (auto &recorder : cuda_event_recorders_) {
1412       TF_RETURN_IF_ERROR(recorder->Flush(collector_->annotation_map()));
1413     }
1414     return Status::OK();
1415   }
1416 
1417  private:
1418   template <typename T>
StartMemcpy(CuptiTracerEventType type,const CUpti_CallbackData * cbdata,CudaEventRecorder * recorder)1419   static void StartMemcpy(CuptiTracerEventType type,
1420                           const CUpti_CallbackData *cbdata,
1421                           CudaEventRecorder *recorder) {
1422     const auto *params = static_cast<const T *>(cbdata->functionParams);
1423     *cbdata->correlationData =
1424         recorder->StartMemcpy(type, params->ByteCount, cbdata->context, nullptr,
1425                               cbdata->correlationId, /*async*/ false);
1426   }
1427 
1428   template <typename T>
StartMemcpyAsync(CuptiTracerEventType type,const CUpti_CallbackData * cbdata,CudaEventRecorder * recorder)1429   static void StartMemcpyAsync(CuptiTracerEventType type,
1430                                const CUpti_CallbackData *cbdata,
1431                                CudaEventRecorder *recorder) {
1432     const auto *params = static_cast<const T *>(cbdata->functionParams);
1433     *cbdata->correlationData = recorder->StartMemcpy(
1434         type, params->ByteCount, cbdata->context, params->hStream,
1435         cbdata->correlationId, /*async*/ true);
1436   }
1437 
GetMemoryType(CUdeviceptr ptr)1438   static CUmemorytype GetMemoryType(CUdeviceptr ptr) {
1439     CuptiApiTracingDisabler disabler;
1440     CUmemorytype mem_type = CU_MEMORYTYPE_HOST;
1441     auto status =
1442         cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptr);
1443     if (status == CUDA_ERROR_INVALID_VALUE) {
1444       // Pointer not registered with CUDA, must be host memory.
1445       return CU_MEMORYTYPE_HOST;
1446     }
1447     LogIfError(ToStatus(status));
1448     return mem_type;
1449   }
1450 
GetMemcpyType(CUdeviceptr src,CUdeviceptr dst)1451   static CuptiTracerEventType GetMemcpyType(CUdeviceptr src, CUdeviceptr dst) {
1452     CUmemorytype src_type = GetMemoryType(src);
1453     CUmemorytype dst_type = GetMemoryType(dst);
1454     // TODO: handle CU_MEMORYTYPE_ARRAY case
1455     if (src_type == CU_MEMORYTYPE_HOST && dst_type == CU_MEMORYTYPE_DEVICE) {
1456       return CuptiTracerEventType::MemcpyH2D;
1457     } else if (src_type == CU_MEMORYTYPE_DEVICE &&
1458                dst_type == CU_MEMORYTYPE_HOST) {
1459       return CuptiTracerEventType::MemcpyD2H;
1460     } else if (src_type == CU_MEMORYTYPE_DEVICE &&
1461                dst_type == CU_MEMORYTYPE_DEVICE) {
1462       return CuptiTracerEventType::MemcpyD2D;
1463     }
1464     return CuptiTracerEventType::MemcpyOther;
1465   }
1466 
1467   // Each cuLaunchCooperativeKernelMultiDevice will need to add an entry in
1468   // each corresponding device, therefore we need to keep records of all
1469   // the record indices in each device's record array.
1470   // We allocate such data structure during API entry and free during API exit.
1471   // However there is no guarantee that we receive such callbacks in pairs, we
1472   // maintain a on-going API calls to make sure no memory leaks.
1473   struct CuptiApiCallbackContext {
CuptiApiCallbackContexttensorflow::profiler::__anon4531e42b0111::CuptiDriverApiHookWithCudaEvent::CuptiApiCallbackContext1474     explicit CuptiApiCallbackContext(std::vector<uint32> &&r)
1475         : record_indices(std::move(r)) {}
1476     std::vector<uint32> record_indices;
1477   };
1478 
1479   const CuptiTracerOptions option_;
1480   CuptiInterface *cupti_interface_;
1481   CuptiTraceCollector *collector_;
1482   absl::node_hash_set<CuptiApiCallbackContext *> callback_contexts_;
1483   std::vector<std::unique_ptr<CudaEventRecorder>> cuda_event_recorders_;
1484   TF_DISALLOW_COPY_AND_ASSIGN(CuptiDriverApiHookWithCudaEvent);
1485 };
1486 
ErrorWithHostname(absl::string_view error_message)1487 /*static*/ std::string ErrorWithHostname(absl::string_view error_message) {
1488   return absl::StrCat(port::Hostname(), ": ", error_message);
1489 }
1490 
1491 }  // namespace
1492 
AddDriverApiCallbackEvent(CuptiTraceCollector * collector,CuptiInterface * cupti_interface,int device_id,uint64 start_tsc,uint64 end_tsc,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)1493 /*static*/ Status CuptiDriverApiHook::AddDriverApiCallbackEvent(
1494     CuptiTraceCollector *collector, CuptiInterface *cupti_interface,
1495     int device_id, uint64 start_tsc, uint64 end_tsc,
1496     CUpti_CallbackDomain domain, CUpti_CallbackId cbid,
1497     const CUpti_CallbackData *cbdata) {
1498   switch (cbid) {
1499     case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel:
1500     case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel:
1501     case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice:
1502       AddKernelEventUponApiExit(collector, device_id, cbdata, start_tsc,
1503                                 end_tsc);
1504       break;
1505     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy:
1506     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync:
1507     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2:
1508     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2:
1509     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2:
1510     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2:
1511     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2:
1512     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2:
1513     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2:
1514     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2:
1515     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2:
1516     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2:
1517     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2:
1518     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2:
1519     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2:
1520     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2:
1521     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2:
1522     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2:
1523     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2:
1524     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2:
1525       // This would be the place to populate the memcpy API activity's src and
1526       // dst memory kind by casting cbdata->functionParams. However, we are not
1527       // doing that because that will incur significant overhead to get the
1528       // memory aperture of each argument.
1529       AddNormalMemcpyEventUponApiExit(collector, device_id, cbid, cbdata,
1530                                       start_tsc, end_tsc);
1531       break;
1532     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer:
1533     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync:
1534       AddP2PMemcpyEventUponApiExit(collector, cupti_interface, device_id, cbid,
1535                                    cbdata, start_tsc, end_tsc);
1536       break;
1537     case CUPTI_DRIVER_TRACE_CBID_cuMemAlloc_v2:
1538       AddCuMemAllocEventUponApiExit(collector, device_id, cbid, cbdata,
1539                                     start_tsc, end_tsc);
1540       break;
1541     case CUPTI_DRIVER_TRACE_CBID_cuMemAllocPitch_v2:
1542       AddCuMemAllocPitchEventUponApiExit(collector, device_id, cbid, cbdata,
1543                                          start_tsc, end_tsc);
1544       break;
1545     case CUPTI_DRIVER_TRACE_CBID_cuMemFree_v2:
1546       AddCuMemFreeEventUponApiExit(collector, device_id, cbid, cbdata,
1547                                    start_tsc, end_tsc);
1548       break;
1549     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD8_v2:
1550     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD16_v2:
1551     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD32_v2:
1552     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8_v2:
1553     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16_v2:
1554     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32_v2:
1555     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD8Async:
1556     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD16Async:
1557     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD32Async:
1558     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8Async:
1559     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16Async:
1560     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32Async:
1561       AddCuMemsetEventUponApiExit(collector, device_id, cbid, cbdata, start_tsc,
1562                                   end_tsc);
1563       break;
1564     default:
1565       AddGenericEventUponApiExit(collector, device_id, cbid, cbdata, start_tsc,
1566                                  end_tsc);
1567       break;
1568   }
1569   return Status::OK();
1570 }
1571 
GetTraceEventTypeName(const CuptiTracerEventType & type)1572 const char *GetTraceEventTypeName(const CuptiTracerEventType &type) {
1573   // Do not use a default so that this gives a build error when
1574   // CuptiTracerEventType is extended but this is not.
1575   switch (type) {
1576     case CuptiTracerEventType::MemcpyH2D:
1577       return "MemcpyH2D";
1578     case CuptiTracerEventType::MemcpyD2H:
1579       return "MemcpyD2H";
1580     case CuptiTracerEventType::MemcpyD2D:
1581       return "MemcpyD2D";
1582     case CuptiTracerEventType::MemcpyP2P:
1583       return "MemcpyP2P";
1584     case CuptiTracerEventType::MemcpyOther:
1585       return "MemcpyOther";
1586     case CuptiTracerEventType::Kernel:
1587       return "Compute";
1588     case CuptiTracerEventType::MemoryAlloc:
1589       return "MemoryAlloc";
1590     case CuptiTracerEventType::MemoryFree:
1591       return "MemoryFree";
1592     case CuptiTracerEventType::Memset:
1593       return "Memset";
1594     case CuptiTracerEventType::Overhead:
1595       return "Overhead";
1596     case CuptiTracerEventType::UnifiedMemory:
1597       return "UnifiedMemory";
1598     case CuptiTracerEventType::Generic:
1599       return "Generic";
1600     case CuptiTracerEventType::MemoryResidency:
1601       return "MemoryResidency";
1602     case CuptiTracerEventType::Unsupported:
1603       return "";
1604   }
1605 }
1606 
CuptiTracer(CuptiInterface * cupti_interface)1607 CuptiTracer::CuptiTracer(CuptiInterface *cupti_interface)
1608     : num_gpus_(NumGpus()),
1609       cupti_interface_(cupti_interface),
1610       buffer_pool_(kBufferSizeInBytes) {}
1611 
GetCuptiTracerSingleton()1612 /* static */ CuptiTracer *CuptiTracer::GetCuptiTracerSingleton() {
1613   static auto *singleton = new CuptiTracer(GetCuptiInterface());
1614   return singleton;
1615 }
1616 
IsAvailable() const1617 bool CuptiTracer::IsAvailable() const {
1618   return NumGpus() && !activity_tracing_enabled_ && !api_tracing_enabled_;
1619 }
1620 
NumGpus()1621 int CuptiTracer::NumGpus() {
1622   static int num_gpus = []() -> int {
1623     if (cuInit(0) != CUDA_SUCCESS) {
1624       return 0;
1625     }
1626     int gpu_count;
1627     if (cuDeviceGetCount(&gpu_count) != CUDA_SUCCESS) {
1628       return 0;
1629     }
1630     LOG(INFO) << "Profiler found " << gpu_count << " GPUs";
1631     return gpu_count;
1632   }();
1633   return num_gpus;
1634 }
1635 
Enable(const CuptiTracerOptions & option,CuptiTraceCollector * collector)1636 void CuptiTracer::Enable(const CuptiTracerOptions &option,
1637                          CuptiTraceCollector *collector) {
1638   option_ = option;
1639   collector_ = collector;
1640   if (option_->enable_event_based_activity) {
1641     option_->enable_activity_api = false;
1642     cupti_driver_api_hook_.reset(new CuptiDriverApiHookWithCudaEvent(
1643         option, cupti_interface_, collector));
1644   } else {
1645     cupti_driver_api_hook_.reset(new CuptiDriverApiHookWithActivityApi(
1646         option, cupti_interface_, collector));
1647   }
1648 
1649   Status status = EnableApiTracing();
1650   need_root_access_ |= status.code() == error::PERMISSION_DENIED;
1651   if (!status.ok()) return;
1652 
1653   if (option_->enable_activity_api) {
1654     EnableActivityTracing().IgnoreError();
1655   }
1656   tensorflow::profiler::AnnotationStack::Enable(true);
1657 }
1658 
Disable()1659 void CuptiTracer::Disable() {
1660   DisableApiTracing().IgnoreError();
1661   if (option_->enable_activity_api) {
1662     DisableActivityTracing().IgnoreError();
1663   }
1664   cupti_interface_->CleanUp();
1665   Finalize().IgnoreError();
1666   cupti_driver_api_hook_->SyncAndFlush().IgnoreError();
1667   collector_->Flush();
1668   collector_ = nullptr;
1669   option_.reset();
1670   cupti_driver_api_hook_.reset();
1671   tensorflow::profiler::AnnotationStack::Enable(false);
1672 }
1673 
EnableApiTracing()1674 Status CuptiTracer::EnableApiTracing() {
1675   if (api_tracing_enabled_) return Status::OK();
1676 
1677   VLOG(1) << "Enable subscriber";
1678   // Subscribe can return CUPTI_ERROR_MAX_LIMIT_REACHED.
1679   // The application which calls CUPTI APIs cannot be used with Nvidia tools
1680   // like nvprof, Nvidia Visual Profiler, Nsight Compute, Nsight Systems.
1681   RETURN_IF_CUPTI_ERROR(cupti_interface_->Subscribe(
1682       &subscriber_, (CUpti_CallbackFunc)ApiCallback, this));
1683   api_tracing_enabled_ = true;
1684 
1685   if (!option_->cbids_selected.empty()) {
1686     for (auto cbid : option_->cbids_selected) {
1687       RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableCallback(
1688           1 /* ENABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API, cbid));
1689     }
1690   } else {  // select all callback ids.
1691     RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
1692         1 /* ENABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API));
1693   }
1694 
1695   if (option_->enable_nvtx_tracking) {
1696     RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
1697         1 /* ENABLE */, subscriber_, CUPTI_CB_DOMAIN_NVTX));
1698   }
1699   return Status::OK();
1700 }
1701 
DisableApiTracing()1702 Status CuptiTracer::DisableApiTracing() {
1703   if (!api_tracing_enabled_) return Status::OK();
1704 
1705   api_tracing_enabled_ = false;
1706 
1707   if (!option_->cbids_selected.empty()) {
1708     for (auto cbid : option_->cbids_selected) {
1709       RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableCallback(
1710           0 /* DISABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API, cbid));
1711     }
1712   } else {
1713     RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
1714         0 /* DISABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API));
1715   }
1716 
1717   if (option_->enable_nvtx_tracking) {
1718     RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
1719         0 /* DISABLE */, subscriber_, CUPTI_CB_DOMAIN_NVTX));
1720   }
1721 
1722   VLOG(1) << "Disable subscriber";
1723   RETURN_IF_CUPTI_ERROR(cupti_interface_->Unsubscribe(subscriber_));
1724   return Status::OK();
1725 }
1726 
EnableActivityTracing()1727 Status CuptiTracer::EnableActivityTracing() {
1728   if (!option_->activities_selected.empty()) {
1729     // Initialize callback functions for Cupti Activity API.
1730     VLOG(1) << "Registering CUPTI activity callbacks";
1731     RETURN_IF_CUPTI_ERROR(cupti_interface_->ActivityRegisterCallbacks(
1732         RequestCuptiActivityBuffer, ProcessCuptiActivityBuffer));
1733 
1734     VLOG(1) << "Enabling activity tracing for "
1735             << option_->activities_selected.size() << " activities";
1736     for (auto activity : option_->activities_selected) {
1737       VLOG(1) << "Enabling activity tracing for: " << activity;
1738       if (activity == CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER) {
1739         ConfigureActivityUnifiedMemoryCounter(true);
1740       }
1741       RETURN_IF_CUPTI_ERROR(cupti_interface_->ActivityEnable(activity));
1742     }
1743   }
1744   activity_tracing_enabled_ = true;
1745   return Status::OK();
1746 }
1747 
DisableActivityTracing()1748 Status CuptiTracer::DisableActivityTracing() {
1749   if (activity_tracing_enabled_) {
1750     VLOG(1) << "Disabling activity tracing for "
1751             << option_->activities_selected.size() << " activities";
1752     for (auto activity : option_->activities_selected) {
1753       VLOG(1) << "Disabling activity tracing for: " << activity;
1754       if (activity == CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER) {
1755         ConfigureActivityUnifiedMemoryCounter(false);
1756       }
1757       RETURN_IF_CUPTI_ERROR(cupti_interface_->ActivityDisable(activity));
1758     }
1759     option_->activities_selected.clear();
1760 
1761     VLOG(1) << "Flushing CUPTI activity buffer";
1762     RETURN_IF_CUPTI_ERROR(
1763         cupti_interface_->ActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED));
1764     LOG(INFO) << "CUPTI activity buffer flushed";
1765   }
1766   activity_tracing_enabled_ = false;
1767   return Status::OK();
1768 }
1769 
Finalize()1770 Status CuptiTracer::Finalize() {
1771   if (option_->cupti_finalize) {
1772     VLOG(1) << "CuptiFinalize";
1773     RETURN_IF_CUPTI_ERROR(cupti_interface_->Finalize());
1774   }
1775   return Status::OK();
1776 }
1777 
GetTimestamp()1778 /*static*/ uint64 CuptiTracer::GetTimestamp() {
1779   uint64_t tsc;
1780   CuptiInterface *cupti_interface = GetCuptiInterface();
1781   if (cupti_interface && cupti_interface->GetTimestamp(&tsc) == CUPTI_SUCCESS) {
1782     return tsc;
1783   }
1784   // Return 0 on error. If an activity timestamp is 0, the activity will be
1785   // dropped during time normalization.
1786   return 0;
1787 }
1788 
HandleNVTXCallback(CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)1789 Status CuptiTracer::HandleNVTXCallback(CUpti_CallbackId cbid,
1790                                        const CUpti_CallbackData *cbdata) {
1791   const CUpti_NvtxData *pdata =
1792       reinterpret_cast<const CUpti_NvtxData *>(cbdata);
1793   if (cbid == CUPTI_CBID_NVTX_nvtxDomainRangePushEx) {
1794     const nvtxDomainRangePushEx_params *params =
1795         reinterpret_cast<const nvtxDomainRangePushEx_params *>(
1796             pdata->functionParams);
1797     // TODO(profiler): The messageType is actually NVTX_MESSAGE_TYPE_REGISTERED
1798     // (which is 3), However it seems to me that we can not get the registered
1799     // string from nvtxDomainRegisterStringA_params. If we reinterpret the
1800     // payload as ascii, it happen to work.
1801     NVTXRangeTracker::EnterRange(params->core.eventAttrib->message.ascii);
1802   } else if (cbid == CUPTI_CBID_NVTX_nvtxDomainRangePop) {
1803     NVTXRangeTracker::ExitRange();
1804   }
1805   return Status::OK();
1806 }
1807 
HandleCallback(CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)1808 Status CuptiTracer::HandleCallback(CUpti_CallbackDomain domain,
1809                                    CUpti_CallbackId cbid,
1810                                    const CUpti_CallbackData *cbdata) {
1811   if (!api_tracing_enabled_) return Status::OK();    // already unsubscribed.
1812   if (!cupti_driver_api_hook_) return Status::OK();  // already unsubscribed.
1813   if (domain == CUPTI_CB_DOMAIN_NVTX) return HandleNVTXCallback(cbid, cbdata);
1814   if (domain != CUPTI_CB_DOMAIN_DRIVER_API) return Status::OK();
1815   if (internalCuCall) return Status::OK();
1816 
1817   if (cbdata->context == nullptr) {
1818     // API callback is called before any CUDA context is created.
1819     // This is expected to be rare, and we ignore this case.
1820     VLOG(3) << "API callback received before creation of CUDA context\n";
1821     return errors::Internal("cutpi callback without context");
1822   }
1823 
1824   // Grab a correct device ID.
1825   uint32 device_id = -1;
1826   RETURN_IF_CUPTI_ERROR(
1827       cupti_interface_->GetDeviceId(cbdata->context, &device_id));
1828   if (device_id >= num_gpus_) {
1829     return errors::Internal("Invalid device id:", device_id);
1830   }
1831 
1832   if (cbdata->callbackSite == CUPTI_API_ENTER) {
1833     TF_RETURN_IF_ERROR(cupti_driver_api_hook_->OnDriverApiEnter(
1834         device_id, domain, cbid, cbdata));
1835   } else if (cbdata->callbackSite == CUPTI_API_EXIT) {
1836     // Set up the map from correlation id to annotation string.
1837     const auto &annotation = AnnotationStack::Get();
1838     if (!annotation.empty()) {
1839       if (cbid ==
1840           CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice) {
1841         // Kernels are launched on different devices by this API call, therefore
1842         // we need to populate per device annotation map respectively.
1843         for (int i = 0; i < num_gpus_; ++i) {
1844           collector_->annotation_map()->Add(i, cbdata->correlationId,
1845                                             annotation, "");
1846         }
1847       } else {
1848         absl::string_view nvtx_range = NVTXRangeTracker::CurrentRange();
1849         collector_->annotation_map()->Add(device_id, cbdata->correlationId,
1850                                           annotation, nvtx_range);
1851       }
1852     }
1853 
1854     TF_RETURN_IF_ERROR(cupti_driver_api_hook_->OnDriverApiExit(
1855         device_id, domain, cbid, cbdata));
1856   }
1857   return Status::OK();
1858 }
1859 
ConfigureActivityUnifiedMemoryCounter(bool enable)1860 void CuptiTracer::ConfigureActivityUnifiedMemoryCounter(bool enable) {
1861   CUpti_ActivityUnifiedMemoryCounterConfig config[2];
1862   // By experiments, currently only measurements from these two activities are
1863   // trustworthy. Others like GPU page fault may be problematic.
1864   config[0].kind =
1865       CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD;
1866   config[1].kind =
1867       CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH;
1868 
1869   for (size_t i = 0; i < 2; i++) {
1870     config[i].enable = enable;
1871   }
1872 
1873   CUptiResult res;
1874 
1875   res = cupti_interface_->ActivityConfigureUnifiedMemoryCounter(config, 2);
1876   if (res == CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED) {
1877     LOG(ERROR) << "Unified memory is not supported on the "
1878                   "underlying platform.\n";
1879   } else if (res == CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_DEVICE) {
1880     LOG(ERROR) << "Unified memory is not supported on the device.\n";
1881   } else if (res == CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_NON_P2P_DEVICES) {
1882     LOG(ERROR) << "Unified memory is not supported on the "
1883                   "non-P2P multi-gpu setup.\n";
1884   } else if (res != CUPTI_SUCCESS) {
1885     const char *errstr = "";
1886     cuptiGetResultString(res, &errstr);
1887     LOG(ERROR) << "Error while enabling unified memory profiling: " << errstr;
1888   } else {
1889     VLOG(1) << "Configuring Unified memory profiling: " << res;
1890   }
1891 }
1892 
RequestActivityBuffer(uint8_t ** buffer,size_t * size)1893 void CuptiTracer::RequestActivityBuffer(uint8_t **buffer, size_t *size) {
1894   *buffer = buffer_pool_.GetOrCreateBuffer();
1895   if (*buffer == nullptr) {
1896     LOG(WARNING)
1897         << "CUPTI Buffer not allocated, activity records will be dropped";
1898     *size = 0;
1899     return;
1900   }
1901   *size = buffer_pool_.GetBufferSizeInBytes();
1902 }
1903 
ProcessActivityBuffer(CUcontext context,uint32_t stream_id,uint8_t * buffer,size_t size)1904 Status CuptiTracer::ProcessActivityBuffer(CUcontext context, uint32_t stream_id,
1905                                           uint8_t *buffer, size_t size) {
1906   auto buffer_cleanup =
1907       gtl::MakeCleanup([&]() { buffer_pool_.ReclaimBuffer(buffer); });
1908   if (size == 0) {
1909     return Status::OK();
1910   }
1911   if (!activity_tracing_enabled_) {
1912     LOG(WARNING) << "CUPTI activity buffer is reclaimed after flush.";
1913     return Status::OK();
1914   }
1915   if (cupti_interface_->Disabled()) return errors::Internal("Disabled.");
1916 
1917   CUpti_Activity *record = nullptr;
1918   while (true) {
1919     CUptiResult status =
1920         cupti_interface_->ActivityGetNextRecord(buffer, size, &record);
1921     if (status == CUPTI_SUCCESS) {
1922       switch (record->kind) {
1923         case CUPTI_ACTIVITY_KIND_KERNEL:  // sequential
1924         case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL:
1925           AddKernelActivityEvent(
1926               collector_, reinterpret_cast<CUpti_ActivityKernel4 *>(record));
1927           break;
1928         case CUPTI_ACTIVITY_KIND_MEMCPY:
1929           AddMemcpyActivityEvent(
1930               collector_, reinterpret_cast<CUpti_ActivityMemcpy *>(record));
1931           break;
1932         case CUPTI_ACTIVITY_KIND_MEMCPY2:
1933           AddMemcpy2ActivityEvent(
1934               collector_, reinterpret_cast<CUpti_ActivityMemcpy2 *>(record));
1935           break;
1936         case CUPTI_ACTIVITY_KIND_OVERHEAD:
1937           AddCuptiOverheadActivityEvent(
1938               collector_, reinterpret_cast<CUpti_ActivityOverhead *>(record));
1939           break;
1940         case CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER:
1941           AddUnifiedMemoryActivityEvent(
1942               collector_,
1943               reinterpret_cast<CUpti_ActivityUnifiedMemoryCounter2 *>(record));
1944           break;
1945         case CUPTI_ACTIVITY_KIND_MEMORY: {
1946           AddMemoryActivityEvent(
1947               collector_, reinterpret_cast<CUpti_ActivityMemory *>(record));
1948         } break;
1949         case CUPTI_ACTIVITY_KIND_MEMSET:
1950           AddMemsetActivityEvent(
1951               collector_, reinterpret_cast<CUpti_ActivityMemset *>(record));
1952           break;
1953         case CUPTI_ACTIVITY_KIND_SYNCHRONIZATION:
1954           AddSynchronizationActivityEvent(
1955               collector_,
1956               reinterpret_cast<CUpti_ActivitySynchronization *>(record));
1957           break;
1958         default:
1959           VLOG(3) << "Activity type " << record->kind << " is not supported.";
1960           break;
1961       }
1962     } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
1963       break;
1964     } else {
1965       return errors::Internal("Parse cupti activity buffer error.");
1966     }
1967   }
1968 
1969   // Report dropped records.
1970   size_t dropped;
1971   RETURN_IF_CUPTI_ERROR(cupti_interface_->ActivityGetNumDroppedRecords(
1972       context, stream_id, &dropped));
1973   if (dropped != 0) {
1974     uint32 device_id = -1;
1975     RETURN_IF_CUPTI_ERROR(cupti_interface_->GetDeviceId(context, &device_id));
1976     collector_->OnEventsDropped("cupti activity buffer full", dropped);
1977   }
1978   return Status::OK();
1979 }
1980 
ErrorIfAny()1981 /*static*/ std::string CuptiTracer::ErrorIfAny() {
1982   if (CuptiTracer::NumGpus() == 0) {
1983     return ErrorWithHostname("No GPU detected.");
1984   } else if (CuptiTracer::GetCuptiTracerSingleton()->NeedRootAccess()) {
1985     return ErrorWithHostname(
1986         "Insufficient privilege to run libcupti (you need root permission).");
1987   } else if (CuptiTracer::GetTimestamp() == 0) {
1988     return ErrorWithHostname(
1989         "Failed to load libcupti (is it installed and accessible?)");
1990   }
1991   return "";
1992 }
1993 
1994 }  // namespace profiler
1995 }  // namespace tensorflow
1996