• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include "tensorflow/core/profiler/internal/gpu/cupti_tracer.h"
17 
18 #include "absl/container/flat_hash_map.h"
19 #include "absl/container/flat_hash_set.h"
20 #include "absl/container/node_hash_map.h"
21 #include "absl/container/node_hash_set.h"
22 #include "third_party/gpus/cuda/extras/CUPTI/include/cupti_activity.h"
23 #include "third_party/gpus/cuda/extras/CUPTI/include/generated_nvtx_meta.h"
24 #include "tensorflow/core/platform/env.h"
25 #include "tensorflow/core/platform/errors.h"
26 #include "tensorflow/core/platform/host_info.h"
27 #include "tensorflow/core/platform/logging.h"
28 #include "tensorflow/core/platform/macros.h"
29 #include "tensorflow/core/platform/mem.h"
30 #include "tensorflow/core/profiler/internal/cpu/annotation_stack.h"
31 #include "tensorflow/core/profiler/internal/gpu/cupti_collector.h"
32 #include "tensorflow/core/profiler/internal/gpu/nvtx_utils.h"
33 
34 namespace tensorflow {
35 namespace profiler {
36 
37 namespace {
38 
// Per-thread nesting depth of live CuptiApiTracingDisabler objects. It is
// incremented/decremented only by that class below.
// NOTE(review): presumably the API callback handler treats a non-zero value as
// "this call was issued by the tracer itself" -- confirm against the reader.
static thread_local int internalCuCall = 0;

// Temporarily disables CUPTI API tracing for the current thread during the
// lifetime of an instance. Used around the API calls that are initiated by us.
//
// The guard is deliberately non-copyable (and therefore non-movable): a copy
// would run an extra destructor, decrementing the counter without a matching
// increment and leaving it unbalanced.
class CuptiApiTracingDisabler {
 public:
  CuptiApiTracingDisabler() { internalCuCall++; }
  ~CuptiApiTracingDisabler() { internalCuCall--; }

  CuptiApiTracingDisabler(const CuptiApiTracingDisabler &) = delete;
  CuptiApiTracingDisabler &operator=(const CuptiApiTracingDisabler &) = delete;
};
48 
ToStatus(CUptiResult result)49 Status ToStatus(CUptiResult result) {
50   if (result == CUPTI_SUCCESS) {
51     return Status::OK();
52   }
53   const char *str = nullptr;
54   cuptiGetResultString(result, &str);
55   return errors::Unavailable("CUPTI error: ", str ? str : "<unknown>");
56 }
57 
ToStatus(CUresult result)58 Status ToStatus(CUresult result) {
59   if (result == CUDA_SUCCESS) {
60     return Status::OK();
61   }
62   const char *str = nullptr;
63   cuGetErrorName(result, &str);
64   return errors::Unavailable("CUDA error: ", str ? str : "<unknown>");
65 }
66 
LogIfError(const Status & status)67 inline void LogIfError(const Status &status) {
68   if (status.ok()) return;
69   LOG(ERROR) << status.error_message();
70 }
71 
72 // Maps an OverheadKind enum to a const string.
getActivityOverheadKindString(CUpti_ActivityOverheadKind kind)73 const char *getActivityOverheadKindString(CUpti_ActivityOverheadKind kind) {
74   switch (kind) {
75     case CUPTI_ACTIVITY_OVERHEAD_DRIVER_COMPILER:
76       return "COMPILER";
77     case CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH:
78       return "BUFFER_FLUSH";
79     case CUPTI_ACTIVITY_OVERHEAD_CUPTI_INSTRUMENTATION:
80       return "INSTRUMENTATION";
81     case CUPTI_ACTIVITY_OVERHEAD_CUPTI_RESOURCE:
82       return "RESOURCE";
83     default:
84       break;
85   }
86   return "<UNKNOWN>";
87 }
88 
getActivityUnifiedMemoryKindString(CUpti_ActivityUnifiedMemoryCounterKind kind)89 const char *getActivityUnifiedMemoryKindString(
90     CUpti_ActivityUnifiedMemoryCounterKind kind) {
91   switch (kind) {
92     case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD:
93       return "UM_BYTES_TRANSFER_HTOD";
94     case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH:
95       return "UM_BYTES_TRANSFER_DTOH";
96     case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT:
97       return "UM_CPU_PAGE_FAULT";
98     case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT:
99       return "UM_GPU_PAGE_FAULT";
100     case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING:
101       return "UM_THRASHING";
102     case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING:
103       return "UM_THROTTLING";
104     case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP:
105       return "UM_REMOTE_MAP";
106     case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOD:
107       return "UM_BYTES_TRANSFER_DTOD";
108     default:
109       break;
110   }
111   return "<UNKNOWN>";
112 }
113 
// CUPTI_ERROR_INSUFFICIENT_PRIVILEGES is introduced at CUDA 10.1. For older
// toolkits, define it here so that the macro below still compiles.
#if CUDA_VERSION <= 10000
#define CUPTI_ERROR_INSUFFICIENT_PRIVILEGES 35
#endif

// Evaluates `expr` (a CUptiResult) exactly once. On any result other than
// CUPTI_SUCCESS it logs the failing expression together with the CUPTI error
// text and returns from the *enclosing* function:
//   - errors::PermissionDenied for CUPTI_ERROR_INSUFFICIENT_PRIVILEGES,
//   - errors::Internal for every other failure.
// Requires a `cupti_interface_` in scope at the expansion site, and an
// enclosing function whose return type accepts those error values.
#define RETURN_IF_CUPTI_ERROR(expr)                                          \
  do {                                                                       \
    CUptiResult status = (expr);                                             \
    if (ABSL_PREDICT_FALSE(status != CUPTI_SUCCESS)) {                       \
      const char *errstr = "";                                               \
      cupti_interface_->GetResultString(status, &errstr);                    \
      /* Never stream a null C string. */                                    \
      if (errstr == nullptr) errstr = "<unknown>";                           \
      LOG(ERROR) << "function " << #expr << " failed with error " << errstr; \
      if (status == CUPTI_ERROR_INSUFFICIENT_PRIVILEGES) {                   \
        return errors::PermissionDenied("CUPTI need root access!");          \
      } else {                                                               \
        return errors::Internal("CUPTI call error: ", errstr);               \
      }                                                                      \
    }                                                                        \
  } while (false)
133 
Bytes2D(const CUDA_MEMCPY2D * p)134 size_t Bytes2D(const CUDA_MEMCPY2D *p) { return p->Height * p->WidthInBytes; }
135 
Bytes3D(const CUDA_MEMCPY3D * p)136 size_t Bytes3D(const CUDA_MEMCPY3D *p) {
137   return p->Depth * p->Height * p->WidthInBytes;
138 }
139 
140 template <typename CudaMemcpy>
MemcpyKind(const CudaMemcpy * p)141 CuptiTracerEventType MemcpyKind(const CudaMemcpy *p) {
142   if (p->srcMemoryType == CU_MEMORYTYPE_HOST &&
143       p->dstMemoryType == CU_MEMORYTYPE_DEVICE) {
144     return CuptiTracerEventType::MemcpyH2D;
145   }
146   if (p->srcMemoryType == CU_MEMORYTYPE_DEVICE &&
147       p->dstMemoryType == CU_MEMORYTYPE_HOST) {
148     return CuptiTracerEventType::MemcpyD2H;
149   }
150   if (p->srcMemoryType == CU_MEMORYTYPE_DEVICE &&
151       p->dstMemoryType == CU_MEMORYTYPE_DEVICE) {
152     return CuptiTracerEventType::MemcpyD2D;
153   }
154   return CuptiTracerEventType::Unsupported;
155 }
156 
157 std::tuple<size_t /*bytes*/, CuptiTracerEventType, bool /*async*/>
DecodeDriverMemcpy(CUpti_CallbackId cbid,const void * params)158 DecodeDriverMemcpy(CUpti_CallbackId cbid, const void *params) {
159   switch (cbid) {
160     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2: {
161       const auto *p = reinterpret_cast<const cuMemcpyHtoD_v2_params *>(params);
162       return {p->ByteCount, CuptiTracerEventType::MemcpyH2D, false};
163     }
164     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2: {
165       const auto *p =
166           reinterpret_cast<const cuMemcpyHtoDAsync_v2_params *>(params);
167       return {p->ByteCount, CuptiTracerEventType::MemcpyH2D, true};
168     }
169     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2: {
170       const auto *p = reinterpret_cast<const cuMemcpyDtoH_v2_params *>(params);
171       return {p->ByteCount, CuptiTracerEventType::MemcpyD2H, false};
172     }
173     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2: {
174       const auto *p =
175           reinterpret_cast<const cuMemcpyDtoHAsync_v2_params *>(params);
176       return {p->ByteCount, CuptiTracerEventType::MemcpyD2H, true};
177     }
178     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2: {
179       const auto *p = reinterpret_cast<const cuMemcpyDtoD_v2_params *>(params);
180       return {p->ByteCount, CuptiTracerEventType::MemcpyD2D, false};
181     }
182     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2: {
183       const auto *p =
184           reinterpret_cast<const cuMemcpyDtoDAsync_v2_params *>(params);
185       return {p->ByteCount, CuptiTracerEventType::MemcpyD2D, true};
186     }
187     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy: {
188       const auto *p = reinterpret_cast<const cuMemcpy_params *>(params);
189       return {p->ByteCount, CuptiTracerEventType::MemcpyOther, false};
190     }
191     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync: {
192       const auto *p = reinterpret_cast<const cuMemcpyAsync_params *>(params);
193       return {p->ByteCount, CuptiTracerEventType::MemcpyOther, true};
194     }
195     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2: {
196       const auto *p = reinterpret_cast<const cuMemcpy2D_v2_params *>(params);
197       return {Bytes2D(p->pCopy), MemcpyKind(p->pCopy), false};
198     }
199     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2: {
200       const auto *p =
201           reinterpret_cast<const cuMemcpy2DAsync_v2_params *>(params);
202       return {Bytes2D(p->pCopy), MemcpyKind(p->pCopy), true};
203     }
204     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2: {
205       const auto *p = reinterpret_cast<const cuMemcpy3D_v2_params *>(params);
206       return {Bytes3D(p->pCopy), MemcpyKind(p->pCopy), true};
207     }
208     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2: {
209       const auto *p =
210           reinterpret_cast<const cuMemcpy3DAsync_v2_params *>(params);
211       return {Bytes3D(p->pCopy), MemcpyKind(p->pCopy), true};
212     }
213     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer: {
214       const auto *p2p_params =
215           reinterpret_cast<const cuMemcpyPeer_params *>(params);
216       return {p2p_params->ByteCount, CuptiTracerEventType::MemcpyP2P, false};
217     }
218     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync: {
219       const auto *p2p_params =
220           reinterpret_cast<const cuMemcpyPeerAsync_params *>(params);
221       return {p2p_params->ByteCount, CuptiTracerEventType::MemcpyP2P, true};
222     }
223     default: {
224       LOG(ERROR) << "Unsupported memcpy activity observed: " << cbid;
225       return {0, CuptiTracerEventType::Unsupported, false};
226     }
227   }
228 }
229 
230 std::tuple<size_t /*bytes*/, CuptiTracerEventType, bool /*async*/>
DecodeDriverMemset(CUpti_CallbackId cbid,const void * params)231 DecodeDriverMemset(CUpti_CallbackId cbid, const void *params) {
232   switch (cbid) {
233     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD8_v2: {
234       const auto *p = reinterpret_cast<const cuMemsetD8_v2_params *>(params);
235       return {p->N, CuptiTracerEventType::Memset, false};
236     }
237     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD16_v2: {
238       const auto *p = reinterpret_cast<const cuMemsetD16_v2_params *>(params);
239       return {p->N, CuptiTracerEventType::Memset, false};
240     }
241     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD32_v2: {
242       const auto *p = reinterpret_cast<const cuMemsetD32_v2_params *>(params);
243       return {p->N, CuptiTracerEventType::Memset, false};
244     }
245     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8_v2: {
246       const auto *p = reinterpret_cast<const cuMemsetD2D8_v2_params *>(params);
247       return {p->dstPitch * p->Height, CuptiTracerEventType::Memset, false};
248     }
249     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16_v2: {
250       const auto *p = reinterpret_cast<const cuMemsetD2D16_v2_params *>(params);
251       return {p->dstPitch * p->Height, CuptiTracerEventType::Memset, false};
252     }
253     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32_v2: {
254       const auto *p = reinterpret_cast<const cuMemsetD2D32_v2_params *>(params);
255       return {p->dstPitch * p->Height, CuptiTracerEventType::Memset, false};
256     }
257     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD8Async: {
258       const auto *p = reinterpret_cast<const cuMemsetD8Async_params *>(params);
259       return {p->N, CuptiTracerEventType::Memset, true};
260     }
261     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD16Async: {
262       const auto *p = reinterpret_cast<const cuMemsetD16Async_params *>(params);
263       return {p->N, CuptiTracerEventType::Memset, true};
264     }
265     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD32Async: {
266       const auto *p = reinterpret_cast<const cuMemsetD32Async_params *>(params);
267       return {p->N, CuptiTracerEventType::Memset, true};
268     }
269     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8Async: {
270       const auto *p =
271           reinterpret_cast<const cuMemsetD2D8Async_params *>(params);
272       return {p->dstPitch * p->Height, CuptiTracerEventType::Memset, true};
273     }
274     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16Async: {
275       const auto *p =
276           reinterpret_cast<const cuMemsetD2D16Async_params *>(params);
277       return {p->dstPitch * p->Height, CuptiTracerEventType::Memset, true};
278     }
279     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32Async: {
280       const auto *p =
281           reinterpret_cast<const cuMemsetD2D32Async_params *>(params);
282       return {p->dstPitch * p->Height, CuptiTracerEventType::Memset, true};
283     }
284     default: {
285       LOG(ERROR) << "Unsupported memset activity observed: " << cbid;
286       return {0, CuptiTracerEventType::Unsupported, false};
287     }
288   }
289 }
290 
291 // Cupti callback corresponding to a driver or runtime API. This global function
292 // is invoked twice for each API: at entry and at exit. The cbdata
293 // parameter is guaranteed by Cupti to be thread-safe. Most invocations are
294 // dropped to the floor and entry/exit is tracked for the APIs we deem
295 // performance-relevant.
ApiCallback(void * user_data,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)296 void CUPTIAPI ApiCallback(void *user_data, CUpti_CallbackDomain domain,
297                           CUpti_CallbackId cbid,
298                           const CUpti_CallbackData *cbdata) {
299   CuptiTracer *tracer = reinterpret_cast<CuptiTracer *>(user_data);
300   tracer->HandleCallback(domain, cbid, cbdata).IgnoreError();
301 }
302 
303 // Callback which is invoked when an empty buffer is requested by CUPTI.
304 // Allocates an empty aligned-memory buffer. The buffer is used by CUPTI as a
305 // ring buffer where device maintains activity profiles that have been
306 // collected.
AllocCuptiActivityBuffer(uint8_t ** buffer,size_t * size,size_t * maxNumRecords)307 void CUPTIAPI AllocCuptiActivityBuffer(uint8_t **buffer, size_t *size,
308                                        size_t *maxNumRecords) {
309   // Buffer size and alignment, 32K and 8 as in CUPTI samples.
310   constexpr size_t kBufferSize = 32 * 1024;
311   constexpr int kBufferAlignSize = 8;
312   *buffer = reinterpret_cast<uint8_t *>(
313       port::AlignedMalloc(kBufferSize, kBufferAlignSize));
314   if (*buffer == nullptr) {
315     LOG(WARNING)
316         << "Cupti Buffer not allocated, activity records will be dropped";
317     return;
318   }
319   *size = kBufferSize;
320   *maxNumRecords = 0;  // Cupti to fill as many records as fit in the buffer.
321   VLOG(3) << "Allocated Cupti Buffer, buffer=" << std::hex
322           << reinterpret_cast<uintptr_t>(*buffer) << std::dec
323           << " size=" << *size;
324 }
325 
326 // Callback which is invoked when a buffer containing activity records is
327 // available from CUPTI. Frees the buffer after reading activity records from
328 // it.
FreeCuptiActivityBuffer(CUcontext context,uint32_t stream_id,uint8_t * buffer,size_t size,size_t valid_size)329 void CUPTIAPI FreeCuptiActivityBuffer(CUcontext context, uint32_t stream_id,
330                                       uint8_t *buffer, size_t size,
331                                       size_t valid_size) {
332   VLOG(3) << "Freeing Cupti Buffer, buffer:" << std::hex
333           << reinterpret_cast<uintptr_t>(buffer) << std::dec
334           << " size: " << size << " valid_size: " << valid_size;
335 
336   if (valid_size > 0) {
337     VLOG(3) << "Activity profile for stream " << stream_id;
338 
339     CuptiTracer *cupti_tracer = CuptiTracer::GetCuptiTracerSingleton();
340     cupti_tracer->ProcessActivityBuffer(context, stream_id, buffer, valid_size)
341         .IgnoreError();
342   }
343   port::AlignedFree(buffer);
344 }
345 
AddKernelEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)346 void AddKernelEventUponApiExit(CuptiTraceCollector *collector, uint32 device_id,
347                                const CUpti_CallbackData *cbdata,
348                                uint64 start_time, uint64 end_time) {
349   CuptiTracerEvent event{};
350   event.type = CuptiTracerEventType::Kernel;
351   event.source = CuptiTracerEventSource::DriverCallback;
352   event.name = cbdata->symbolName ? cbdata->symbolName : cbdata->functionName;
353   event.start_time_ns = start_time;
354   event.end_time_ns = end_time;
355   event.thread_id = Env::Default()->GetCurrentThreadId();
356   event.device_id = device_id;
357   event.context_id = cbdata->contextUid;
358   event.correlation_id = cbdata->correlationId;
359   VLOG(3) << "Cuda Kernel launch API exit. name=" << event.name;
360   collector->AddEvent(std::move(event));
361 }
362 
363 // Performs the actual callback for both normal and P2P memcpy operations.
PopulateMemcpyCallbackEvent(CuptiTracerEventType type,const CUpti_CallbackData * cbdata,size_t num_bytes,uint32 src_device,uint32 dst_device,bool async,uint64 start_time,uint64 end_time)364 CuptiTracerEvent PopulateMemcpyCallbackEvent(
365     CuptiTracerEventType type, const CUpti_CallbackData *cbdata,
366     size_t num_bytes, uint32 src_device, uint32 dst_device, bool async,
367     uint64 start_time, uint64 end_time) {
368   CuptiTracerEvent event{};
369   event.type = type;
370   event.source = CuptiTracerEventSource::DriverCallback;
371   event.start_time_ns = start_time;
372   event.end_time_ns = end_time;
373   event.thread_id = Env::Default()->GetCurrentThreadId();
374   event.device_id = src_device;
375   event.context_id = cbdata->contextUid;
376   event.correlation_id = cbdata->correlationId;
377   event.memcpy_info.kind = CUPTI_ACTIVITY_MEMCPY_KIND_UNKNOWN;
378   event.memcpy_info.num_bytes = num_bytes;
379   event.memcpy_info.destination = dst_device;
380   event.memcpy_info.async = async;
381   return event;
382 }
383 
AddNormalMemcpyEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)384 void AddNormalMemcpyEventUponApiExit(CuptiTraceCollector *collector,
385                                      uint32 device_id, CUpti_CallbackId cbid,
386                                      const CUpti_CallbackData *cbdata,
387                                      uint64 start_time, uint64 end_time) {
388   size_t num_bytes;
389   CuptiTracerEventType type;
390   bool async;
391   std::tie(num_bytes, type, async) =
392       DecodeDriverMemcpy(cbid, cbdata->functionParams);
393 
394   VLOG(3) << "Cuda Memcpy API exit. sz=" << num_bytes;
395   CuptiTracerEvent event =
396       PopulateMemcpyCallbackEvent(type, cbdata, num_bytes, device_id, device_id,
397                                   async, start_time, end_time);
398   collector->AddEvent(std::move(event));
399 }
400 
AddCuMemsetEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)401 void AddCuMemsetEventUponApiExit(CuptiTraceCollector *collector,
402                                  uint32 device_id, CUpti_CallbackId cbid,
403                                  const CUpti_CallbackData *cbdata,
404                                  uint64 start_time, uint64 end_time) {
405   // We are casting all variants of cuMemset to cuMemsetD8 for accessing the
406   // first member attribute, a CUdeviceptr.
407   const auto *params =
408       static_cast<const cuMemsetD8_v2_params *>(cbdata->functionParams);
409   size_t num_bytes;
410   bool async;
411   CuptiTracerEventType type;
412   std::tie(num_bytes, type, async) =
413       DecodeDriverMemset(cbid, cbdata->functionParams);
414 
415   CuptiTracerEvent event{};
416   event.type = type;
417   event.source = CuptiTracerEventSource::DriverCallback;
418   event.start_time_ns = start_time;
419   event.end_time_ns = end_time;
420   event.thread_id = Env::Default()->GetCurrentThreadId();
421   event.device_id = device_id;
422   event.context_id = cbdata->contextUid;
423   event.correlation_id = cbdata->correlationId;
424   event.memset_info.num_bytes = num_bytes;
425   // memset_info.kind cannot be determined from API.
426   event.memset_info.async = async;
427   VLOG(3) << "Cuda Memset API exit."
428           << " dptr=" << reinterpret_cast<void *>(params->dstDevice)
429           << " sz=" << num_bytes;
430   collector->AddEvent(std::move(event));
431 }
432 
AddP2PMemcpyEventUponApiExit(CuptiTraceCollector * collector,CuptiInterface * cupti_interface,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)433 void AddP2PMemcpyEventUponApiExit(CuptiTraceCollector *collector,
434                                   CuptiInterface *cupti_interface,
435                                   uint32 device_id, CUpti_CallbackId cbid,
436                                   const CUpti_CallbackData *cbdata,
437                                   uint64 start_time, uint64 end_time) {
438   size_t num_bytes;
439   CuptiTracerEventType type;
440   bool async;
441   std::tie(num_bytes, type, async) =
442       DecodeDriverMemcpy(cbid, cbdata->functionParams);
443 
444   uint32 dst_device = -1, src_device = -1;
445   const auto *p2p_params =
446       static_cast<const cuMemcpyPeer_params *>(cbdata->functionParams);
447   cupti_interface->GetDeviceId(p2p_params->srcContext, &src_device);
448   cupti_interface->GetDeviceId(p2p_params->dstContext, &dst_device);
449   VLOG(3) << "Cuda P2P Memcpy API exit, src: " << src_device
450           << " dst: " << dst_device << " size:" << num_bytes;
451   CuptiTracerEvent event =
452       PopulateMemcpyCallbackEvent(type, cbdata, num_bytes, src_device,
453                                   dst_device, async, start_time, end_time);
454   collector->AddEvent(std::move(event));
455 }
456 
AddCuMemAllocEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)457 void AddCuMemAllocEventUponApiExit(CuptiTraceCollector *collector,
458                                    uint32 device_id, CUpti_CallbackId cbid,
459                                    const CUpti_CallbackData *cbdata,
460                                    uint64 start_time, uint64 end_time) {
461   const auto *params =
462       static_cast<const cuMemAlloc_v2_params *>(cbdata->functionParams);
463   CuptiTracerEvent event{};
464   event.type = CuptiTracerEventType::MemoryAlloc;
465   event.source = CuptiTracerEventSource::DriverCallback;
466   event.name = cbdata->functionName;
467   event.start_time_ns = start_time;
468   event.end_time_ns = end_time;
469   event.thread_id = Env::Default()->GetCurrentThreadId();
470   event.device_id = device_id;
471   event.context_id = cbdata->contextUid;
472   event.correlation_id = cbdata->correlationId;
473   event.memalloc_info.num_bytes = params->bytesize;
474   VLOG(3) << "Cuda MemAlloc API exit."
475           << " dptr=" << reinterpret_cast<void *>(*params->dptr)
476           << " sz=" << params->bytesize;
477   collector->AddEvent(std::move(event));
478 }
479 
AddCuMemAllocPitchEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)480 void AddCuMemAllocPitchEventUponApiExit(CuptiTraceCollector *collector,
481                                         uint32 device_id, CUpti_CallbackId cbid,
482                                         const CUpti_CallbackData *cbdata,
483                                         uint64 start_time, uint64 end_time) {
484   const auto *params =
485       static_cast<const cuMemAllocPitch_v2_params *>(cbdata->functionParams);
486   CuptiTracerEvent event{};
487   event.type = CuptiTracerEventType::MemoryAlloc;
488   event.source = CuptiTracerEventSource::DriverCallback;
489   event.name = cbdata->functionName;
490   event.start_time_ns = start_time;
491   event.end_time_ns = end_time;
492   event.thread_id = Env::Default()->GetCurrentThreadId();
493   event.device_id = device_id;
494   event.context_id = cbdata->contextUid;
495   event.correlation_id = cbdata->correlationId;
496   const size_t size_in_bytes = *params->pPitch * params->Height;
497   event.memalloc_info.num_bytes = size_in_bytes;
498   VLOG(3) << "Cuda MemAllocPitch API exit."
499           << " dptr=" << reinterpret_cast<void *>(*params->dptr)
500           << " sz=" << size_in_bytes;
501   collector->AddEvent(std::move(event));
502 }
503 
AddCuMemFreeEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)504 void AddCuMemFreeEventUponApiExit(CuptiTraceCollector *collector,
505                                   uint32 device_id, CUpti_CallbackId cbid,
506                                   const CUpti_CallbackData *cbdata,
507                                   uint64 start_time, uint64 end_time) {
508   const auto *params =
509       static_cast<const cuMemFree_v2_params *>(cbdata->functionParams);
510   CuptiTracerEvent event{};
511   event.type = CuptiTracerEventType::MemoryFree;
512   event.source = CuptiTracerEventSource::DriverCallback;
513   event.name = cbdata->functionName;
514   event.start_time_ns = start_time;
515   event.end_time_ns = end_time;
516   event.thread_id = Env::Default()->GetCurrentThreadId();
517   event.device_id = device_id;
518   event.context_id = cbdata->contextUid;
519   event.correlation_id = cbdata->correlationId;
520   VLOG(3) << "Cuda MemFree API exit."
521           << " dptr=" << reinterpret_cast<void *>(params->dptr);
522   collector->AddEvent(std::move(event));
523 }
524 
AddGenericEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)525 void AddGenericEventUponApiExit(CuptiTraceCollector *collector,
526                                 uint32 device_id, CUpti_CallbackId cbid,
527                                 const CUpti_CallbackData *cbdata,
528                                 uint64 start_time, uint64 end_time) {
529   CuptiTracerEvent event{};
530   event.type = CuptiTracerEventType::Generic;
531   event.source = CuptiTracerEventSource::DriverCallback;
532   event.name = cbdata->functionName;
533   event.start_time_ns = start_time;
534   event.end_time_ns = end_time;
535   event.thread_id = Env::Default()->GetCurrentThreadId();
536   event.device_id = device_id;
537   event.context_id = cbdata->contextUid;
538   event.correlation_id = cbdata->correlationId;
539   VLOG(3) << "Observed generic API exit."
540           << " name=" << cbdata->functionName;
541   collector->AddEvent(std::move(event));
542 }
543 
AddKernelActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityKernel4 * kernel)544 void AddKernelActivityEvent(CuptiTraceCollector *collector,
545                             const CUpti_ActivityKernel4 *kernel) {
546   CuptiTracerEvent event{};
547   event.type = CuptiTracerEventType::Kernel;
548   event.source = CuptiTracerEventSource::Activity;
549   event.name = kernel->name;
550   event.start_time_ns = kernel->start;
551   event.end_time_ns = kernel->end;
552   event.device_id = kernel->deviceId;
553   event.context_id = kernel->contextId;
554   event.stream_id = kernel->streamId;
555   event.correlation_id = kernel->correlationId;
556   AnnotationMap::AnnotationInfo info = collector->annotation_map()->LookUp(
557       event.device_id, event.correlation_id);
558   event.annotation = info.annotation;
559   event.nvtx_range = info.nvtx_range;
560   event.kernel_info.registers_per_thread = kernel->registersPerThread;
561   event.kernel_info.static_shared_memory_usage = kernel->staticSharedMemory;
562   event.kernel_info.dynamic_shared_memory_usage = kernel->dynamicSharedMemory;
563   event.kernel_info.block_x = kernel->blockX;
564   event.kernel_info.block_y = kernel->blockY;
565   event.kernel_info.block_z = kernel->blockZ;
566   event.kernel_info.grid_x = kernel->gridX;
567   event.kernel_info.grid_y = kernel->gridY;
568   event.kernel_info.grid_z = kernel->gridZ;
569   collector->AddEvent(std::move(event));
570 }
571 
AddMemcpyActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityMemcpy * memcpy)572 void AddMemcpyActivityEvent(CuptiTraceCollector *collector,
573                             const CUpti_ActivityMemcpy *memcpy) {
574   CuptiTracerEvent event{};
575   switch (memcpy->copyKind) {
576     case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
577       event.type = CuptiTracerEventType::MemcpyH2D;
578       event.name = "MemcpyH2D";
579       break;
580     case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
581       event.type = CuptiTracerEventType::MemcpyD2H;
582       event.name = "MemcpyD2H";
583       break;
584     case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
585       event.type = CuptiTracerEventType::MemcpyD2D;
586       event.name = "MemcpyD2D";
587       break;
588     case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP:
589       event.type = CuptiTracerEventType::MemcpyP2P;
590       event.name = "MemcpyP2P";
591       break;
592     default:
593       event.type = CuptiTracerEventType::MemcpyOther;
594       event.name = "MemcpyOther";
595       break;
596   }
597   event.source = CuptiTracerEventSource::Activity;
598   event.start_time_ns = memcpy->start;
599   event.end_time_ns = memcpy->end;
600   event.device_id = memcpy->deviceId;
601   event.context_id = memcpy->contextId;
602   event.stream_id = memcpy->streamId;
603   event.correlation_id = memcpy->correlationId;
604   AnnotationMap::AnnotationInfo info = collector->annotation_map()->LookUp(
605       event.device_id, event.correlation_id);
606   event.annotation = info.annotation;
607   event.memcpy_info.kind = memcpy->copyKind;
608   event.memcpy_info.num_bytes = memcpy->bytes;
609   event.memcpy_info.destination = memcpy->deviceId;
610   event.memcpy_info.async = memcpy->flags & CUPTI_ACTIVITY_FLAG_MEMCPY_ASYNC;
611   event.memcpy_info.src_mem_kind = memcpy->srcKind;
612   event.memcpy_info.dst_mem_kind = memcpy->dstKind;
613   collector->AddEvent(std::move(event));
614 }
615 
616 // Invokes callback upon peer-2-peer memcpy between different GPU devices.
AddMemcpy2ActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityMemcpy2 * memcpy2)617 void AddMemcpy2ActivityEvent(CuptiTraceCollector *collector,
618                              const CUpti_ActivityMemcpy2 *memcpy2) {
619   CuptiTracerEvent event{};
620   event.type = CuptiTracerEventType::MemcpyP2P;
621   event.name = "MemcpyP2P";
622   event.source = CuptiTracerEventSource::Activity;
623   event.start_time_ns = memcpy2->start;
624   event.end_time_ns = memcpy2->end;
625   event.device_id = memcpy2->srcDeviceId;
626   event.context_id = memcpy2->contextId;
627   event.stream_id = memcpy2->streamId;
628   event.correlation_id = memcpy2->correlationId;
629   AnnotationMap::AnnotationInfo info = collector->annotation_map()->LookUp(
630       event.device_id, event.correlation_id);
631   event.annotation = info.annotation;
632   event.memcpy_info.kind = CUPTI_ACTIVITY_MEMCPY_KIND_PTOP;
633   event.memcpy_info.num_bytes = memcpy2->bytes;
634   event.memcpy_info.destination = memcpy2->dstDeviceId;
635   event.memcpy_info.async = memcpy2->flags & CUPTI_ACTIVITY_FLAG_MEMCPY_ASYNC;
636   event.memcpy_info.src_mem_kind = memcpy2->srcKind;
637   event.memcpy_info.dst_mem_kind = memcpy2->dstKind;
638   collector->AddEvent(std::move(event));
639 }
640 
AddCuptiOverheadActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityOverhead * overhead)641 void AddCuptiOverheadActivityEvent(CuptiTraceCollector *collector,
642                                    const CUpti_ActivityOverhead *overhead) {
643   CuptiTracerEvent event{};
644   event.type = CuptiTracerEventType::Overhead;
645   event.name = getActivityOverheadKindString(overhead->overheadKind);
646   event.source = CuptiTracerEventSource::Activity;
647   event.start_time_ns = overhead->start;
648   event.end_time_ns = overhead->end;
649   // If the overhead is not related to a device, we assign it to device 0.
650   event.device_id = 0;
651   // NOTE: no correlation id.
652   switch (overhead->objectKind) {
653     case CUPTI_ACTIVITY_OBJECT_UNKNOWN:
654       // Don't know how to deal with such activities because of we need either
655       // attribute it to a GPU stream or a CPU thread.
656       return;
657 
658     case CUPTI_ACTIVITY_OBJECT_THREAD:
659     case CUPTI_ACTIVITY_OBJECT_PROCESS:
660       event.thread_id = overhead->objectId.pt.threadId;
661       break;
662     case CUPTI_ACTIVITY_OBJECT_STREAM:
663       event.stream_id = overhead->objectId.dcs.streamId;
664       TF_FALLTHROUGH_INTENDED;
665     case CUPTI_ACTIVITY_OBJECT_DEVICE:
666     case CUPTI_ACTIVITY_OBJECT_CONTEXT:
667       event.device_id = overhead->objectId.dcs.deviceId;
668       break;
669     default:
670       LOG(ERROR) << "Unexpected object kind: " << overhead->objectKind;
671       return;
672   }
673   collector->AddEvent(std::move(event));
674 }
675 
AddUnifiedMemoryActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityUnifiedMemoryCounter2 * record)676 void AddUnifiedMemoryActivityEvent(
677     CuptiTraceCollector *collector,
678     const CUpti_ActivityUnifiedMemoryCounter2 *record) {
679   VLOG(3) << "Cuda Unified Memory Activity, kind: " << record->counterKind
680           << " src: " << record->srcId << " dst: " << record->dstId;
681   CuptiTracerEvent event{};
682   event.type = CuptiTracerEventType::UnifiedMemory;
683   event.name = getActivityUnifiedMemoryKindString(record->counterKind);
684   event.source = CuptiTracerEventSource::Activity;
685   event.start_time_ns = record->start;
686   if (record->counterKind ==
687           CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT ||
688       record->counterKind ==
689           CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING ||
690       record->counterKind ==
691           CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP ||
692       record->end <= record->start) {
693     // If the end time is not valid, trim it so that it can be shown on the UI.
694     event.end_time_ns = record->start + 1;
695   } else {
696     event.end_time_ns = record->end;
697   }
698   event.device_id = record->srcId;
699   // NOTE: not context id and correlation id.
700 
701   // For visualization purpose, we assign a pseudo stream id for each
702   // record->counterKind of unified memory related events.
703   constexpr int kPseudoStreamId = 0x10000000;
704   event.stream_id = kPseudoStreamId + record->counterKind;
705   event.memcpy_info.kind = CUPTI_ACTIVITY_MEMCPY_KIND_UNKNOWN;
706   // Check whether the activity is byte transfer.
707   if (record->counterKind ==
708           CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD ||
709       record->counterKind ==
710           CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH ||
711       record->counterKind ==
712           CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOD) {
713     event.memcpy_info.num_bytes = record->value;
714   } else {
715     event.memcpy_info.num_bytes = 0;
716   }
717   event.memcpy_info.destination = record->dstId;
718   event.memcpy_info.async = false;
719   collector->AddEvent(std::move(event));
720 }
721 
AddMemoryActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityMemory * memory)722 void AddMemoryActivityEvent(CuptiTraceCollector *collector,
723                             const CUpti_ActivityMemory *memory) {
724   CuptiTracerEvent event{};
725   event.name = absl::StrCat("Memory ", GetMemoryKindName(memory->memoryKind));
726   event.type = CuptiTracerEventType::MemoryResidency;
727   event.source = CuptiTracerEventSource::Activity;
728   event.start_time_ns = memory->start;
729   event.end_time_ns = std::max(memory->end, memory->start + 1);
730   event.device_id = memory->deviceId;
731   event.context_id = memory->contextId;
732   // Assign to default stream (0) so that event is included during Flush().
733   event.stream_id = 0;
734   event.memory_residency_info.num_bytes = memory->bytes;
735   event.memory_residency_info.kind = memory->memoryKind;
736   event.memory_residency_info.address = memory->address;
737   VLOG(5) << "Cuda activity " << event.name
738           << " addr: " << reinterpret_cast<void *>(memory->address)
739           << " bytes: " << memory->bytes;
740   collector->AddEvent(std::move(event));
741 }
742 
AddMemsetActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityMemset * memset)743 void AddMemsetActivityEvent(CuptiTraceCollector *collector,
744                             const CUpti_ActivityMemset *memset) {
745   CuptiTracerEvent event{};
746   event.type = CuptiTracerEventType::Memset;
747   event.source = CuptiTracerEventSource::Activity;
748   event.name = absl::StrCat("Memset ", GetMemoryKindName(memset->memoryKind));
749   event.start_time_ns = memset->start;
750   event.end_time_ns = std::max(memset->end, memset->start + 1);
751   event.device_id = memset->deviceId;
752   event.correlation_id = memset->correlationId;
753   event.context_id = memset->contextId;
754   event.stream_id = memset->streamId;
755   event.memset_info.num_bytes = memset->bytes;
756   event.memset_info.kind = memset->memoryKind;
757   event.memset_info.async = (memset->flags & CUPTI_ACTIVITY_FLAG_MEMSET_ASYNC);
758   VLOG(5) << "Cuda activity " << event.name << " bytes: " << memset->bytes
759           << " async: " << event.memset_info.async;
760   collector->AddEvent(std::move(event));
761 }
762 
AddSynchronizationActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivitySynchronization * sync)763 void AddSynchronizationActivityEvent(
764     CuptiTraceCollector *collector, const CUpti_ActivitySynchronization *sync) {
765   CuptiTracerEvent event{};
766   event.type = CuptiTracerEventType::Generic;
767   event.source = CuptiTracerEventSource::Activity;
768   switch (sync->type) {
769     case CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_EVENT_SYNCHRONIZE:
770       event.name = "cuEventSynchronize";
771       break;
772     case CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_WAIT_EVENT:
773       event.name = "cuStreamWaitEvent";
774       break;
775     case CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_SYNCHRONIZE:
776       event.name = "cuStreamSynchronize";
777       break;
778     case CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_CONTEXT_SYNCHRONIZE:
779       event.name = "cuCtxSynchronize";
780       break;
781     default:
782       event.name = "unknown synchronization event";
783       break;
784   }
785   event.start_time_ns = sync->start;
786   event.end_time_ns = std::max(sync->end, sync->start + 1);
787   event.correlation_id = sync->correlationId;
788   event.context_id = sync->contextId;
789   VLOG(5) << "Cuda activity " << event.name;
790   collector->AddEvent(std::move(event));
791 }
792 
793 // This hook uses cupti activity api to measure device side activities.
794 class CuptiDriverApiHookWithActivityApi : public CuptiDriverApiHook {
795  public:
CuptiDriverApiHookWithActivityApi(const CuptiTracerOptions & option,CuptiInterface * cupti_interface,CuptiTraceCollector * collector)796   CuptiDriverApiHookWithActivityApi(const CuptiTracerOptions &option,
797                                     CuptiInterface *cupti_interface,
798                                     CuptiTraceCollector *collector)
799       : option_(option),
800         cupti_interface_(cupti_interface),
801         collector_(collector) {}
802 
OnDriverApiEnter(int device_id,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)803   Status OnDriverApiEnter(int device_id, CUpti_CallbackDomain domain,
804                           CUpti_CallbackId cbid,
805                           const CUpti_CallbackData *cbdata) override {
806     // Stash away the current Cupti timestamp into cbdata.
807     *cbdata->correlationData =
808         option_.required_callback_api_events ? CuptiTracer::GetTimestamp() : 0;
809     return Status::OK();
810   }
OnDriverApiExit(int device_id,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)811   Status OnDriverApiExit(int device_id, CUpti_CallbackDomain domain,
812                          CUpti_CallbackId cbid,
813                          const CUpti_CallbackData *cbdata) override {
814     // If we are not collecting CPU events from Callback API, we can return now.
815     if (!option_.required_callback_api_events) {
816       return Status::OK();
817     }
818 
819     // Grab timestamp for API exit. API entry timestamp saved in cbdata.
820     uint64 end_tsc = CuptiTracer::GetTimestamp();
821     uint64 start_tsc = *cbdata->correlationData;
822     TrackContext(cbid, cbdata->context);
823     return AddDriverApiCallbackEvent(collector_, cupti_interface_, device_id,
824                                      start_tsc, end_tsc, domain, cbid, cbdata);
825   }
SyncAndFlush()826   Status SyncAndFlush() override {
827     if (option_.sync_devices_before_stop) {
828       CuptiApiTracingDisabler disabler;
829       absl::MutexLock lock(&mutex_);
830       for (auto &ctx : contexts_) {
831         cuCtxPushCurrent(ctx);
832         cuCtxSynchronize();  // Ignore error here for best effort.
833         CUcontext current;
834         cuCtxPopCurrent(&current);
835       }
836     }
837     return Status::OK();
838   }
839 
840  private:
TrackContext(CUpti_CallbackId cbid,CUcontext ctx)841   void TrackContext(CUpti_CallbackId cbid, CUcontext ctx) {
842     if (!option_.sync_devices_before_stop) return;
843     if (ctx == NULL) return;
844     absl::MutexLock lock(&mutex_);
845     if (cbid == CUPTI_DRIVER_TRACE_CBID_cuCtxDestroy_v2 ||
846         cbid == CUPTI_DRIVER_TRACE_CBID_cuCtxDestroy) {
847       contexts_.erase(ctx);
848     } else {
849       contexts_.emplace(ctx);
850     }
851   }
852 
853   const CuptiTracerOptions option_;
854   CuptiInterface *cupti_interface_;
855   CuptiTraceCollector *collector_;
856   absl::Mutex mutex_;
857   absl::flat_hash_set<CUcontext> contexts_ TF_GUARDED_BY(mutex_);
858 
859   TF_DISALLOW_COPY_AND_ASSIGN(CuptiDriverApiHookWithActivityApi);
860 };
861 
862 struct KernelRecord {
863   const char *kernel_name;
864   // TODO(csigg): cuStreamGetCtx introduced in CUDA 9.2 would allow us to only
865   // record the stream and infer the context during collection.
866   CUcontext context;
867   CUstream stream;
868   uint32 correlation_id;
869   CUevent start_event;
870   CUevent stop_event;
871   KernelDetails details;
872   uint64 start_timestamp;
873 };
874 
875 struct MemcpyRecord {
876   CuptiTracerEventType type;
877   size_t size_bytes;
878   CUcontext context;
879   CUstream stream;
880   uint32 correlation_id;
881   bool async;
882   CUevent start_event;
883   CUevent stop_event;
884   uint64 start_timestamp;
885 };
886 
CreateAndRecordEvent(CUevent * event,CUstream stream)887 Status CreateAndRecordEvent(CUevent *event, CUstream stream) {
888   CuptiApiTracingDisabler disabler;
889   TF_RETURN_IF_ERROR(ToStatus(cuEventCreate(event, CU_EVENT_DEFAULT)));
890   return ToStatus(cuEventRecord(*event, stream));
891 }
892 
893 #if CUDA_VERSION >= 10000
894 // Maintain and restore current thread's CUDA context.
895 // Note: cuStreamGetCtx only available after CUDA 9.2.
896 class ScopedCudaContext {
897  public:
ScopedCudaContext(CUstream stream)898   explicit ScopedCudaContext(CUstream stream) : stream_(stream) {
899     CuptiApiTracingDisabler disabler;  // don't trace cuda call in this func.
900     CUcontext context;
901     if (cuStreamGetCtx(stream, &context) != CUDA_SUCCESS) return;
902     context_ = context;
903     uint32 device_ordinal;
904     if (cuptiGetDeviceId(context, &device_ordinal) != CUPTI_SUCCESS) return;
905     device_ordinal_ = device_ordinal;
906     context_pushed_ = cuCtxPushCurrent(context) == CUDA_SUCCESS;
907   }
~ScopedCudaContext()908   ~ScopedCudaContext() {
909     if (!context_pushed_) return;
910     CuptiApiTracingDisabler disabler;  // don't trace cuda call in this func.
911     cuCtxPopCurrent(&*context_);
912   }
913 
914   // If successful, return the device ordinal of the relevant cuda stream.
915   // Otherwise absl::nullopt;
GetDeviceOrdinal()916   absl::optional<uint32> GetDeviceOrdinal() { return device_ordinal_; }
917 
918   // If successful, return the cuda context of the relevant cuda stream.
919   // Otherwise absl::nullopt;
GetContext()920   absl::optional<CUcontext> GetContext() { return context_; }
921 
922  private:
923   CUstream stream_;
924   absl::optional<CUcontext> context_;
925   absl::optional<uint32> device_ordinal_;
926   bool context_pushed_ = false;
927 };
928 #endif
929 
930 // Stores a series of kernel and memcpy records.
931 class CudaEventRecorder {
932  public:
CudaEventRecorder(CuptiInterface * cupti_interface,CuptiTraceCollector * collector,int ordinal)933   CudaEventRecorder(CuptiInterface *cupti_interface,
934                     CuptiTraceCollector *collector, int ordinal)
935       : cupti_interface_(cupti_interface),
936         collector_(collector),
937         ordinal_(ordinal) {
938     device_name_ = absl::StrCat("gpu ", ordinal);  // default.
939     CUdevice device;
940     if (cuDeviceGet(&device, ordinal) == CUDA_SUCCESS) {
941       char name[100];
942       if (cuDeviceGetName(name, sizeof(name), device) == CUDA_SUCCESS) {
943         device_name_ = name;
944       }
945     }
946   }
947 
948   // Registers the start of a kernel launch. The returned index should be passed
949   // to StopKernel() after the kernel launch has completed.
950   template <typename T>
StartKernel(const char * kernel_name,CUcontext context,uint32 correlation_id,const T * params)951   size_t StartKernel(const char *kernel_name, CUcontext context,
952                      uint32 correlation_id, const T *params) {
953     CUstream stream = params->hStream;
954     KernelRecord record = {kernel_name, context, stream, correlation_id};
955     record.details.registers_per_thread = 0;  // unknown.
956     record.details.static_shared_memory_usage = params->sharedMemBytes;
957     record.details.dynamic_shared_memory_usage = 0;  // unknown
958     record.details.block_x = params->blockDimX;
959     record.details.block_y = params->blockDimY;
960     record.details.block_z = params->blockDimZ;
961     record.details.grid_x = params->gridDimX;
962     record.details.grid_y = params->gridDimY;
963     record.details.grid_z = params->gridDimZ;
964     record.start_timestamp = CuptiTracer::GetTimestamp();
965     LogIfError(CreateAndRecordEvent(&record.start_event, stream));
966     absl::MutexLock lock(&mutex_);
967     if (stopped_) return -1;
968     kernel_records_.push_back(record);
969     return kernel_records_.size() - 1;
970   }
StopKernel(size_t index)971   uint64 StopKernel(size_t index) {
972     absl::MutexLock lock(&mutex_);
973     if (index >= kernel_records_.size()) return 0;
974     auto &record = kernel_records_[index];
975     LogIfError(CreateAndRecordEvent(&record.stop_event, record.stream));
976     return record.start_timestamp;
977   }
978 
979   // Registers the start of a copy operation. The returned index should be
980   // passed to StopMemcpy() after the memcpy has completed.
StartMemcpy(CuptiTracerEventType type,size_t size_bytes,CUcontext context,CUstream stream,uint32 correlation_id,bool async)981   size_t StartMemcpy(CuptiTracerEventType type, size_t size_bytes,
982                      CUcontext context, CUstream stream, uint32 correlation_id,
983                      bool async) {
984     MemcpyRecord record = {type,   size_bytes,     context,
985                            stream, correlation_id, async};
986     record.start_timestamp = CuptiTracer::GetTimestamp();
987     LogIfError(CreateAndRecordEvent(&record.start_event, stream));
988     absl::MutexLock lock(&mutex_);
989     if (stopped_) return -1;
990     memcpy_records_.push_back(record);
991     return memcpy_records_.size() - 1;
992   }
StopMemcpy(size_t index)993   uint64 StopMemcpy(size_t index) {
994     absl::MutexLock lock(&mutex_);
995     if (index >= memcpy_records_.size()) return 0;
996     auto &record = memcpy_records_[index];
997     LogIfError(CreateAndRecordEvent(&record.stop_event, record.stream));
998     return record.start_timestamp;
999   }
1000 
Stop()1001   Status Stop() {
1002     {
1003       absl::MutexLock lock(&mutex_);
1004       stopped_ = true;
1005       LOG(INFO) << "Collecting " << kernel_records_.size()
1006                 << " kernel records, " << memcpy_records_.size()
1007                 << " memcpy records.";
1008 
1009       // Gather all profiled streams and contexts.
1010       for (const auto &record : kernel_records_) {
1011         TF_RETURN_IF_ERROR(
1012             AddStreamInfo(record.context, record.stream, "Kernel"));
1013       }
1014       for (const auto &record : memcpy_records_) {
1015         TF_RETURN_IF_ERROR(AddStreamInfo(record.context, record.stream,
1016                                          GetTraceEventTypeName(record.type)));
1017       }
1018     }
1019 
1020     // Synchronize all contexts, record end events, synchronize again.
1021     // This scheme is an unreliable measure to associate a event with the wall
1022     // time. There are chances that other threads might enque kernels which
1023     // delay the second synchronization.
1024     TF_RETURN_IF_ERROR(Synchronize());
1025     for (auto &pair : context_infos_) {
1026       TF_RETURN_IF_ERROR(ToStatus(cuCtxSetCurrent(pair.first)));
1027       TF_RETURN_IF_ERROR(CreateAndRecordEvent(&pair.second.end_event, nullptr));
1028     }
1029 
1030     TF_RETURN_IF_ERROR(Synchronize());
1031     end_walltime_us_ = Env::Default()->NowMicros();
1032     return Status::OK();
1033   }
1034 
Flush(AnnotationMap * annotation_map)1035   Status Flush(AnnotationMap *annotation_map) {
1036     auto kernel_records = ConsumeKernelRecords();
1037     auto memcpy_records = ConsumeMemcpyRecords();
1038     for (const auto &record : kernel_records) {
1039       TF_RETURN_IF_ERROR(SaveRecord(record, annotation_map));
1040     }
1041     for (const auto &record : memcpy_records) {
1042       TF_RETURN_IF_ERROR(SaveRecord(record, annotation_map));
1043     }
1044     return Status::OK();
1045   }
1046 
ConsumeKernelRecords()1047   std::vector<KernelRecord> ConsumeKernelRecords() {
1048     absl::MutexLock lock(&mutex_);
1049     return std::move(kernel_records_);
1050   }
ConsumeMemcpyRecords()1051   std::vector<MemcpyRecord> ConsumeMemcpyRecords() {
1052     absl::MutexLock lock(&mutex_);
1053     return std::move(memcpy_records_);
1054   }
1055 
1056  private:
1057   struct ContextInfo {
1058     uint32 context_id = 0;
1059     int num_streams = 0;
1060     CUevent end_event;
1061   };
1062 
1063   struct StreamInfo {
1064     uint32 stream_id = 0;
1065     std::string name;
1066     int index;  // 0 is reserved for null stream.
1067     const ContextInfo *ctx_info;
1068   };
1069 
1070   // Synchronizes all contexts.
Synchronize() const1071   Status Synchronize() const {
1072     CuptiApiTracingDisabler disabler;
1073     for (const auto &pair : context_infos_) {
1074       TF_RETURN_IF_ERROR(ToStatus(cuCtxSetCurrent(pair.first)));
1075       TF_RETURN_IF_ERROR(ToStatus(cuCtxSynchronize()));
1076     }
1077     return Status::OK();
1078   }
1079 
1080   // Returns element from context_infos_, adding it if not yet present.
GetContextInfo(CUcontext context,ContextInfo ** ctx_info_ptr)1081   Status GetContextInfo(CUcontext context, ContextInfo **ctx_info_ptr) {
1082     auto it = context_infos_.find(context);
1083 
1084     if (it == context_infos_.end()) {
1085       uint32 context_id = 0;
1086       RETURN_IF_CUPTI_ERROR(
1087           cupti_interface_->GetContextId(context, &context_id));
1088       ContextInfo ctx_info = {context_id};
1089       it = context_infos_.emplace(context, ctx_info).first;
1090     }
1091 
1092     *ctx_info_ptr = &it->second;
1093     return Status::OK();
1094   }
1095 
1096   // Adds element to stream_infos_ if not yet present. If present, clear name
1097   // if it doesn't match parameter.
AddStreamInfo(CUcontext context,CUstream stream,absl::string_view name)1098   Status AddStreamInfo(CUcontext context, CUstream stream,
1099                        absl::string_view name) {
1100     StreamKey key(context, stream);
1101     auto it = stream_infos_.find(key);
1102     if (it != stream_infos_.end()) {
1103       if (it->second.name != name) {
1104         it->second.name.clear();  // Stream with inconsistent names, clear it.
1105       }
1106       return Status::OK();
1107     }
1108 
1109     ContextInfo *ctx_info;
1110     TF_RETURN_IF_ERROR(GetContextInfo(context, &ctx_info));
1111     int index = stream ? ++ctx_info->num_streams : 0;
1112     uint32 stream_id = 0;
1113 #if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
1114     RETURN_IF_CUPTI_ERROR(
1115         cupti_interface_->GetStreamIdEx(context, stream, 1, &stream_id));
1116 #else
1117     RETURN_IF_CUPTI_ERROR(
1118         cupti_interface_->GetStreamIdEx(context, stream, 0, &stream_id));
1119 #endif
1120 
1121     StreamInfo stream_info = {stream_id, static_cast<std::string>(name), index,
1122                               ctx_info};
1123     stream_infos_.emplace(key, stream_info);
1124     return Status::OK();
1125   }
1126 
1127   // Returns time in microseconds between events recorded on the GPU.
GetElapsedTimeUs(CUevent start,CUevent stop)1128   static uint64_t GetElapsedTimeUs(CUevent start, CUevent stop) {
1129     CuptiApiTracingDisabler disabler;
1130     float elapsed_ms = 0.0f;
1131     LogIfError(ToStatus(cuEventElapsedTime(&elapsed_ms, start, stop)));
1132     return static_cast<uint64>(
1133         std::llroundf(1000 * std::max(elapsed_ms, 0.0f)));
1134   }
1135 
SaveRecord(const KernelRecord & record,AnnotationMap * annotation_map) const1136   Status SaveRecord(const KernelRecord &record,
1137                     AnnotationMap *annotation_map) const {
1138     if (!record.start_event || !record.stop_event) {
1139       return Status::OK();
1140     }
1141     const auto &stream_info =
1142         stream_infos_.at(StreamKey(record.context, record.stream));
1143     auto start_us =
1144         GetElapsedTimeUs(record.start_event, stream_info.ctx_info->end_event);
1145     auto elapsed_us = GetElapsedTimeUs(record.start_event, record.stop_event);
1146 
1147     std::string annotation;
1148 
1149     CuptiTracerEvent event{};
1150     event.type = CuptiTracerEventType::Kernel;
1151     event.source = CuptiTracerEventSource::Activity;  // on gpu device.
1152     event.name = record.kernel_name;
1153     event.start_time_ns = (end_walltime_us_ - start_us) * 1000;
1154     event.end_time_ns = event.start_time_ns + elapsed_us * 1000;
1155     event.device_id = ordinal_;
1156     event.context_id = stream_info.ctx_info->context_id;
1157     event.stream_id = stream_info.stream_id;
1158     event.correlation_id = record.correlation_id;
1159     AnnotationMap::AnnotationInfo info = collector_->annotation_map()->LookUp(
1160         event.device_id, event.correlation_id);
1161     event.annotation = info.annotation;
1162     event.kernel_info = record.details;
1163     collector_->AddEvent(std::move(event));
1164     return Status::OK();
1165   }
1166 
SaveRecord(const MemcpyRecord & record,AnnotationMap * annotation_map) const1167   Status SaveRecord(const MemcpyRecord &record,
1168                     AnnotationMap *annotation_map) const {
1169     if (!record.start_event || !record.stop_event) {
1170       return Status::OK();
1171     }
1172     const auto &stream_info =
1173         stream_infos_.at(StreamKey(record.context, record.stream));
1174     auto start_us =
1175         GetElapsedTimeUs(record.start_event, stream_info.ctx_info->end_event);
1176     auto elapsed_us = GetElapsedTimeUs(record.start_event, record.stop_event);
1177 
1178     CuptiTracerEvent event{};
1179     event.type = record.type;
1180     event.name = GetTraceEventTypeName(event.type);
1181     event.source = CuptiTracerEventSource::Activity;
1182     event.start_time_ns = (end_walltime_us_ - start_us) * 1000;
1183     event.end_time_ns = event.start_time_ns + elapsed_us * 1000;
1184     event.device_id = ordinal_;
1185     event.context_id = stream_info.ctx_info->context_id;
1186     event.stream_id = stream_info.stream_id;
1187     event.correlation_id = record.correlation_id;
1188     AnnotationMap::AnnotationInfo info = collector_->annotation_map()->LookUp(
1189         event.device_id, event.correlation_id);
1190     event.annotation = info.annotation;
1191     event.memcpy_info.num_bytes = record.size_bytes;
1192     // TODO: support MemcpyD2D where destination != source;
1193     event.memcpy_info.destination = ordinal_;
1194     event.memcpy_info.async = record.async;
1195     // TODO: set src_mem_kind and dst_mem_kind.
1196     collector_->AddEvent(std::move(event));
1197     return Status::OK();
1198   }
1199 
1200   absl::Mutex mutex_;
1201   bool stopped_ TF_GUARDED_BY(mutex_) = false;
1202   std::vector<KernelRecord> kernel_records_ TF_GUARDED_BY(mutex_);
1203   std::vector<MemcpyRecord> memcpy_records_ TF_GUARDED_BY(mutex_);
1204 
1205   CuptiInterface *cupti_interface_;
1206   CuptiTraceCollector *collector_;
1207   const int ordinal_;
1208   std::string device_name_;
1209   uint64 end_walltime_us_;
1210   // Include context in key to distinguish null streams.
1211   using StreamKey = std::pair<CUcontext, CUstream>;
1212 
1213   absl::node_hash_map<CUcontext, ContextInfo> context_infos_;
1214   absl::flat_hash_map<StreamKey, StreamInfo> stream_infos_;
1215 };
1216 
1217 // This hook uses cuda events to measure device side activities.
1218 class CuptiDriverApiHookWithCudaEvent : public CuptiDriverApiHook {
1219  public:
CuptiDriverApiHookWithCudaEvent(const CuptiTracerOptions & option,CuptiInterface * cupti_interface,CuptiTraceCollector * collector)1220   CuptiDriverApiHookWithCudaEvent(const CuptiTracerOptions &option,
1221                                   CuptiInterface *cupti_interface,
1222                                   CuptiTraceCollector *collector)
1223       : option_(option),
1224         cupti_interface_(cupti_interface),
1225         collector_(collector) {
1226     int num_gpus = CuptiTracer::NumGpus();
1227     cuda_event_recorders_.reserve(num_gpus);
1228     for (int i = 0; i < num_gpus; ++i) {
1229       cuda_event_recorders_.emplace_back(
1230           absl::make_unique<CudaEventRecorder>(cupti_interface, collector, i));
1231     }
1232   }
~CuptiDriverApiHookWithCudaEvent()1233   ~CuptiDriverApiHookWithCudaEvent() {
1234     for (auto *callback_context : callback_contexts_) delete callback_context;
1235   }
1236 
OnDriverApiEnter(int device_id,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)1237   Status OnDriverApiEnter(int device_id, CUpti_CallbackDomain domain,
1238                           CUpti_CallbackId cbid,
1239                           const CUpti_CallbackData *cbdata) override {
1240     auto *recorder = cuda_event_recorders_[device_id].get();
1241     switch (cbid) {
1242       case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel: {
1243         DCHECK_NE(cbdata->symbolName, nullptr);
1244         const auto *params =
1245             static_cast<const cuLaunchKernel_params *>(cbdata->functionParams);
1246         *cbdata->correlationData = recorder->StartKernel<cuLaunchKernel_params>(
1247             cbdata->symbolName, cbdata->context, cbdata->correlationId, params);
1248         break;
1249       }
1250       case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel: {
1251         DCHECK_NE(cbdata->symbolName, nullptr);
1252         const auto *params =
1253             static_cast<const cuLaunchCooperativeKernel_params *>(
1254                 cbdata->functionParams);
1255         *cbdata->correlationData =
1256             recorder->StartKernel<cuLaunchCooperativeKernel_params>(
1257                 cbdata->symbolName, cbdata->context, cbdata->correlationId,
1258                 params);
1259         break;
1260       }
1261       case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice: {
1262 #if CUDA_VERSION >= 10000
1263         const auto *params =
1264             static_cast<const cuLaunchCooperativeKernelMultiDevice_params *>(
1265                 cbdata->functionParams);
1266         std::vector<uint32> record_indices;
1267         record_indices.reserve(params->numDevices);
1268         *cbdata->correlationData = -1;  // Invalid value.
1269         const auto &annotation = AnnotationStack::Get();
1270         for (int i = 0; i < params->numDevices; ++i) {
1271           CUstream stream = params->launchParamsList[i].hStream;
1272           ScopedCudaContext scoped_cuda_context(stream);
1273           auto dev_id = scoped_cuda_context.GetDeviceOrdinal();
1274           auto context = scoped_cuda_context.GetContext();
1275           if (!dev_id) return errors::Internal("Invalid CUDA stream");
1276           // Because annotation are per device, therefore we need to populate
1277           // annotation for each device involved.
1278           collector_->annotation_map()->Add(*dev_id, cbdata->correlationId,
1279                                             annotation, "");
1280           record_indices.push_back(
1281               cuda_event_recorders_[*dev_id]->StartKernel<CUDA_LAUNCH_PARAMS>(
1282                   "CooperativeKernelMultiDevice", *context,
1283                   cbdata->correlationId, &(params->launchParamsList[i])));
1284         }
1285         auto *callback_context =
1286             new CuptiApiCallbackContext(std::move(record_indices));
1287         callback_contexts_.insert(callback_context);
1288         *cbdata->correlationData = reinterpret_cast<uint64>(callback_context);
1289 #else
1290         VLOG(1) << "Unhandled cuLaunchCooperativeKernelMultiDevice.";
1291 #endif
1292       } break;
1293       case CUPTI_DRIVER_TRACE_CBID_cuMemcpy: {
1294         const auto *params =
1295             static_cast<const cuMemcpy_params *>(cbdata->functionParams);
1296         StartMemcpy<cuMemcpy_params>(GetMemcpyType(params->src, params->dst),
1297                                      cbdata, recorder);
1298         break;
1299       }
1300       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync: {
1301         const auto *params =
1302             static_cast<const cuMemcpyAsync_params *>(cbdata->functionParams);
1303         StartMemcpyAsync<cuMemcpyAsync_params>(
1304             GetMemcpyType(params->src, params->dst), cbdata, recorder);
1305         break;
1306       }
1307       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2:
1308         StartMemcpy<cuMemcpyHtoD_v2_params>(CuptiTracerEventType::MemcpyH2D,
1309                                             cbdata, recorder);
1310         break;
1311       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2:
1312         StartMemcpyAsync<cuMemcpyHtoDAsync_v2_params>(
1313             CuptiTracerEventType::MemcpyH2D, cbdata, recorder);
1314         break;
1315       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2:
1316         StartMemcpy<cuMemcpyDtoH_v2_params>(CuptiTracerEventType::MemcpyD2H,
1317                                             cbdata, recorder);
1318         break;
1319       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2:
1320         StartMemcpyAsync<cuMemcpyDtoHAsync_v2_params>(
1321             CuptiTracerEventType::MemcpyD2H, cbdata, recorder);
1322         break;
1323       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2:
1324         StartMemcpy<cuMemcpyDtoD_v2_params>(CuptiTracerEventType::MemcpyD2D,
1325                                             cbdata, recorder);
1326         break;
1327       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2:
1328         StartMemcpyAsync<cuMemcpyDtoDAsync_v2_params>(
1329             CuptiTracerEventType::MemcpyD2D, cbdata, recorder);
1330         break;
1331       default:
1332         VLOG(1) << "Unexpected callback id: " << cbid;
1333         break;
1334     }
1335     return Status::OK();
1336   }
1337 
OnDriverApiExit(int device_id,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)1338   Status OnDriverApiExit(int device_id, CUpti_CallbackDomain domain,
1339                          CUpti_CallbackId cbid,
1340                          const CUpti_CallbackData *cbdata) override {
1341     auto *recorder = cuda_event_recorders_[device_id].get();
1342     if (*cbdata->correlationData == static_cast<size_t>(-1))
1343       return Status::OK();
1344     uint64 start_tsc = 0;
1345     switch (cbid) {
1346       case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel:
1347       case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel:
1348         start_tsc = recorder->StopKernel(*cbdata->correlationData);
1349         break;
1350       case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice: {
1351 #if CUDA_VERSION >= 10000
1352         auto *callback_context = reinterpret_cast<CuptiApiCallbackContext *>(
1353             *cbdata->correlationData);
1354         callback_contexts_.erase(callback_context);
1355         auto record_indices = std::move(callback_context->record_indices);
1356         delete callback_context;
1357         const auto *params =
1358             static_cast<const cuLaunchCooperativeKernelMultiDevice_params *>(
1359                 cbdata->functionParams);
1360         if (record_indices.size() != params->numDevices)
1361           return errors::Internal("Invalid correlation data");
1362         for (int i = 0; i < params->numDevices; ++i) {
1363           CUstream stream = params->launchParamsList[i].hStream;
1364           ScopedCudaContext scoped_cuda_context(stream);
1365           auto dev_id = scoped_cuda_context.GetDeviceOrdinal();
1366           if (!dev_id) return errors::Internal("Invalid CUDA stream");
1367           start_tsc =
1368               cuda_event_recorders_[*dev_id]->StopKernel(record_indices[i]);
1369         }
1370 #endif
1371       } break;
1372       case CUPTI_DRIVER_TRACE_CBID_cuMemcpy:
1373       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync:
1374       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2:
1375       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2:
1376       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2:
1377       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2:
1378       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2:
1379       case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2:
1380         start_tsc = recorder->StopMemcpy(*cbdata->correlationData);
1381         break;
1382       default:
1383         VLOG(1) << "Unexpected callback id: " << cbid;
1384         // TODO: figure out how to get start timestamp in this case.
1385         return Status::OK();
1386     }
1387     // If we are not collecting CPU events from Callback API, we can return now.
1388     if (!option_.required_callback_api_events) {
1389       return Status::OK();
1390     }
1391 
1392     // Grab timestamp for API exit. API entry timestamp saved in cbdata.
1393     uint64 end_tsc = CuptiTracer::GetTimestamp();
1394     return AddDriverApiCallbackEvent(collector_, cupti_interface_, device_id,
1395                                      start_tsc, end_tsc, domain, cbid, cbdata);
1396   }
SyncAndFlush()1397   Status SyncAndFlush() override {
1398     for (auto &recorder : cuda_event_recorders_) {
1399       TF_RETURN_IF_ERROR(recorder->Stop());
1400     }
1401     for (auto &recorder : cuda_event_recorders_) {
1402       TF_RETURN_IF_ERROR(recorder->Flush(collector_->annotation_map()));
1403     }
1404     return Status::OK();
1405   }
1406 
1407  private:
1408   template <typename T>
StartMemcpy(CuptiTracerEventType type,const CUpti_CallbackData * cbdata,CudaEventRecorder * recorder)1409   static void StartMemcpy(CuptiTracerEventType type,
1410                           const CUpti_CallbackData *cbdata,
1411                           CudaEventRecorder *recorder) {
1412     const auto *params = static_cast<const T *>(cbdata->functionParams);
1413     *cbdata->correlationData =
1414         recorder->StartMemcpy(type, params->ByteCount, cbdata->context, nullptr,
1415                               cbdata->correlationId, /*async*/ false);
1416   }
1417 
1418   template <typename T>
StartMemcpyAsync(CuptiTracerEventType type,const CUpti_CallbackData * cbdata,CudaEventRecorder * recorder)1419   static void StartMemcpyAsync(CuptiTracerEventType type,
1420                                const CUpti_CallbackData *cbdata,
1421                                CudaEventRecorder *recorder) {
1422     const auto *params = static_cast<const T *>(cbdata->functionParams);
1423     *cbdata->correlationData = recorder->StartMemcpy(
1424         type, params->ByteCount, cbdata->context, params->hStream,
1425         cbdata->correlationId, /*async*/ true);
1426   }
1427 
GetMemoryType(CUdeviceptr ptr)1428   static CUmemorytype GetMemoryType(CUdeviceptr ptr) {
1429     CuptiApiTracingDisabler disabler;
1430     CUmemorytype mem_type = CU_MEMORYTYPE_HOST;
1431     auto status =
1432         cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptr);
1433     if (status == CUDA_ERROR_INVALID_VALUE) {
1434       // Pointer not registered with CUDA, must be host memory.
1435       return CU_MEMORYTYPE_HOST;
1436     }
1437     LogIfError(ToStatus(status));
1438     return mem_type;
1439   }
1440 
GetMemcpyType(CUdeviceptr src,CUdeviceptr dst)1441   static CuptiTracerEventType GetMemcpyType(CUdeviceptr src, CUdeviceptr dst) {
1442     CUmemorytype src_type = GetMemoryType(src);
1443     CUmemorytype dst_type = GetMemoryType(dst);
1444     // TODO: handle CU_MEMORYTYPE_ARRAY case
1445     if (src_type == CU_MEMORYTYPE_HOST && dst_type == CU_MEMORYTYPE_DEVICE) {
1446       return CuptiTracerEventType::MemcpyH2D;
1447     } else if (src_type == CU_MEMORYTYPE_DEVICE &&
1448                dst_type == CU_MEMORYTYPE_HOST) {
1449       return CuptiTracerEventType::MemcpyD2H;
1450     } else if (src_type == CU_MEMORYTYPE_DEVICE &&
1451                dst_type == CU_MEMORYTYPE_DEVICE) {
1452       return CuptiTracerEventType::MemcpyD2D;
1453     }
1454     return CuptiTracerEventType::MemcpyOther;
1455   }
1456 
1457   // Each cuLaunchCooperativeKernelMultiDevice will need to add an entry in
1458   // each corresponding device, therefore we need to keep records of all
1459   // the record indices in each device's record array.
1460   // We allocate such data structure during API entry and free during API exit.
1461   // However there is no guarantee that we receive such callbacks in pairs, we
1462   // maintain a on-going API calls to make sure no memory leaks.
1463   struct CuptiApiCallbackContext {
CuptiApiCallbackContexttensorflow::profiler::__anond046b8460111::CuptiDriverApiHookWithCudaEvent::CuptiApiCallbackContext1464     explicit CuptiApiCallbackContext(std::vector<uint32> &&r)
1465         : record_indices(std::move(r)) {}
1466     std::vector<uint32> record_indices;
1467   };
1468 
1469   const CuptiTracerOptions option_;
1470   CuptiInterface *cupti_interface_;
1471   CuptiTraceCollector *collector_;
1472   absl::node_hash_set<CuptiApiCallbackContext *> callback_contexts_;
1473   std::vector<std::unique_ptr<CudaEventRecorder>> cuda_event_recorders_;
1474   TF_DISALLOW_COPY_AND_ASSIGN(CuptiDriverApiHookWithCudaEvent);
1475 };
1476 
ErrorWithHostname(absl::string_view error_message)1477 /*static*/ std::string ErrorWithHostname(absl::string_view error_message) {
1478   return absl::StrCat(port::Hostname(), ": ", error_message);
1479 }
1480 
1481 }  // namespace
1482 
AddDriverApiCallbackEvent(CuptiTraceCollector * collector,CuptiInterface * cupti_interface,int device_id,uint64 start_tsc,uint64 end_tsc,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)1483 /*static*/ Status CuptiDriverApiHook::AddDriverApiCallbackEvent(
1484     CuptiTraceCollector *collector, CuptiInterface *cupti_interface,
1485     int device_id, uint64 start_tsc, uint64 end_tsc,
1486     CUpti_CallbackDomain domain, CUpti_CallbackId cbid,
1487     const CUpti_CallbackData *cbdata) {
1488   switch (cbid) {
1489     case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel:
1490     case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel:
1491     case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice:
1492       AddKernelEventUponApiExit(collector, device_id, cbdata, start_tsc,
1493                                 end_tsc);
1494       break;
1495     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy:
1496     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync:
1497     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2:
1498     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2:
1499     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2:
1500     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2:
1501     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2:
1502     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2:
1503     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2:
1504     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2:
1505     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2:
1506     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2:
1507     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2:
1508     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2:
1509     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2:
1510     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2:
1511     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2:
1512     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2:
1513     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2:
1514     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2:
1515       AddNormalMemcpyEventUponApiExit(collector, device_id, cbid, cbdata,
1516                                       start_tsc, end_tsc);
1517       break;
1518     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer:
1519     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync:
1520       AddP2PMemcpyEventUponApiExit(collector, cupti_interface, device_id, cbid,
1521                                    cbdata, start_tsc, end_tsc);
1522       break;
1523     case CUPTI_DRIVER_TRACE_CBID_cuMemAlloc_v2:
1524       AddCuMemAllocEventUponApiExit(collector, device_id, cbid, cbdata,
1525                                     start_tsc, end_tsc);
1526       break;
1527     case CUPTI_DRIVER_TRACE_CBID_cuMemAllocPitch_v2:
1528       AddCuMemAllocPitchEventUponApiExit(collector, device_id, cbid, cbdata,
1529                                          start_tsc, end_tsc);
1530       break;
1531     case CUPTI_DRIVER_TRACE_CBID_cuMemFree_v2:
1532       AddCuMemFreeEventUponApiExit(collector, device_id, cbid, cbdata,
1533                                    start_tsc, end_tsc);
1534       break;
1535     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD8_v2:
1536     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD16_v2:
1537     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD32_v2:
1538     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8_v2:
1539     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16_v2:
1540     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32_v2:
1541     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD8Async:
1542     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD16Async:
1543     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD32Async:
1544     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8Async:
1545     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16Async:
1546     case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32Async:
1547       AddCuMemsetEventUponApiExit(collector, device_id, cbid, cbdata, start_tsc,
1548                                   end_tsc);
1549       break;
1550     default:
1551       AddGenericEventUponApiExit(collector, device_id, cbid, cbdata, start_tsc,
1552                                  end_tsc);
1553       break;
1554   }
1555   return Status::OK();
1556 }
1557 
GetTraceEventTypeName(const CuptiTracerEventType & type)1558 const char *GetTraceEventTypeName(const CuptiTracerEventType &type) {
1559   // Do not use a default so that this gives a build error when
1560   // CuptiTracerEventType is extended but this is not.
1561   switch (type) {
1562     case CuptiTracerEventType::MemcpyH2D:
1563       return "MemcpyH2D";
1564     case CuptiTracerEventType::MemcpyD2H:
1565       return "MemcpyD2H";
1566     case CuptiTracerEventType::MemcpyD2D:
1567       return "MemcpyD2D";
1568     case CuptiTracerEventType::MemcpyP2P:
1569       return "MemcpyP2P";
1570     case CuptiTracerEventType::MemcpyOther:
1571       return "MemcpyOther";
1572     case CuptiTracerEventType::Kernel:
1573       return "Compute";
1574     case CuptiTracerEventType::MemoryAlloc:
1575       return "MemoryAlloc";
1576     case CuptiTracerEventType::MemoryFree:
1577       return "MemoryFree";
1578     case CuptiTracerEventType::Memset:
1579       return "Memset";
1580     case CuptiTracerEventType::Overhead:
1581       return "Overhead";
1582     case CuptiTracerEventType::UnifiedMemory:
1583       return "UnifiedMemory";
1584     case CuptiTracerEventType::Generic:
1585       return "Generic";
1586     case CuptiTracerEventType::MemoryResidency:
1587       return "MemoryResidency";
1588     case CuptiTracerEventType::Unsupported:
1589       return "";
1590   }
1591 }
1592 
GetCuptiTracerSingleton()1593 /* static */ CuptiTracer *CuptiTracer::GetCuptiTracerSingleton() {
1594   static auto *singleton = new CuptiTracer(GetCuptiInterface());
1595   return singleton;
1596 }
1597 
IsAvailable() const1598 bool CuptiTracer::IsAvailable() const {
1599   return NumGpus() && !activity_tracing_enabled_ && !api_tracing_enabled_;
1600 }
1601 
NumGpus()1602 int CuptiTracer::NumGpus() {
1603   static int num_gpus = []() -> int {
1604     if (cuInit(0) != CUDA_SUCCESS) {
1605       return 0;
1606     }
1607     int gpu_count;
1608     if (cuDeviceGetCount(&gpu_count) != CUDA_SUCCESS) {
1609       return 0;
1610     }
1611     LOG(INFO) << "Profiler found " << gpu_count << " GPUs";
1612     return gpu_count;
1613   }();
1614   return num_gpus;
1615 }
1616 
Enable(const CuptiTracerOptions & option,CuptiTraceCollector * collector)1617 void CuptiTracer::Enable(const CuptiTracerOptions &option,
1618                          CuptiTraceCollector *collector) {
1619   option_ = option;
1620   collector_ = collector;
1621   if (option_->enable_event_based_activity) {
1622     option_->enable_activity_api = false;
1623     cupti_driver_api_hook_.reset(new CuptiDriverApiHookWithCudaEvent(
1624         option, cupti_interface_, collector));
1625   } else {
1626     cupti_driver_api_hook_.reset(new CuptiDriverApiHookWithActivityApi(
1627         option, cupti_interface_, collector));
1628   }
1629 
1630   Status status = EnableApiTracing();
1631   need_root_access_ |= status.code() == error::PERMISSION_DENIED;
1632   if (!status.ok()) return;
1633 
1634   if (option_->enable_activity_api) {
1635     EnableActivityTracing().IgnoreError();
1636   }
1637 }
1638 
Disable()1639 void CuptiTracer::Disable() {
1640   DisableApiTracing().IgnoreError();
1641   if (option_->enable_activity_api) {
1642     DisableActivityTracing().IgnoreError();
1643   }
1644   cupti_interface_->CleanUp();
1645   Finalize().IgnoreError();
1646   cupti_driver_api_hook_->SyncAndFlush().IgnoreError();
1647   collector_->Flush();
1648   collector_ = nullptr;
1649   option_.reset();
1650   cupti_driver_api_hook_.reset();
1651 }
1652 
EnableApiTracing()1653 Status CuptiTracer::EnableApiTracing() {
1654   if (api_tracing_enabled_) return Status::OK();
1655 
1656   VLOG(1) << "Enable subscriber";
1657   // Subscribe can return CUPTI_ERROR_MAX_LIMIT_REACHED.
1658   // The application which calls CUPTI APIs cannot be used with Nvidia tools
1659   // like nvprof, Nvidia Visual Profiler, Nsight Compute, Nsight Systems.
1660   RETURN_IF_CUPTI_ERROR(cupti_interface_->Subscribe(
1661       &subscriber_, (CUpti_CallbackFunc)ApiCallback, this));
1662   api_tracing_enabled_ = true;
1663 
1664   if (!option_->cbids_selected.empty()) {
1665     for (auto cbid : option_->cbids_selected) {
1666       RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableCallback(
1667           1 /* ENABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API, cbid));
1668     }
1669   } else {  // select all callback ids.
1670     RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
1671         1 /* ENABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API));
1672   }
1673 
1674   if (option_->enable_nvtx_tracking) {
1675     RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
1676         1 /* ENABLE */, subscriber_, CUPTI_CB_DOMAIN_NVTX));
1677   }
1678   return Status::OK();
1679 }
1680 
DisableApiTracing()1681 Status CuptiTracer::DisableApiTracing() {
1682   if (!api_tracing_enabled_) return Status::OK();
1683 
1684   api_tracing_enabled_ = false;
1685 
1686   if (!option_->cbids_selected.empty()) {
1687     for (auto cbid : option_->cbids_selected) {
1688       RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableCallback(
1689           0 /* DISABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API, cbid));
1690     }
1691   } else {
1692     RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
1693         0 /* DISABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API));
1694   }
1695 
1696   if (option_->enable_nvtx_tracking) {
1697     RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
1698         0 /* DISABLE */, subscriber_, CUPTI_CB_DOMAIN_NVTX));
1699   }
1700 
1701   VLOG(1) << "Disable subscriber";
1702   RETURN_IF_CUPTI_ERROR(cupti_interface_->Unsubscribe(subscriber_));
1703   return Status::OK();
1704 }
1705 
EnableActivityTracing()1706 Status CuptiTracer::EnableActivityTracing() {
1707   if (!option_->activities_selected.empty()) {
1708     // Initialize callback functions for Cupti Activity API.
1709     VLOG(1) << "Registering CUPTI activity callbacks";
1710     RETURN_IF_CUPTI_ERROR(cupti_interface_->ActivityRegisterCallbacks(
1711         AllocCuptiActivityBuffer, FreeCuptiActivityBuffer));
1712 
1713     VLOG(1) << "Enabling activity tracing for "
1714             << option_->activities_selected.size() << " activities";
1715     for (auto activity : option_->activities_selected) {
1716       VLOG(1) << "Enabling activity tracing for: " << activity;
1717       if (activity == CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER) {
1718         ConfigureActivityUnifiedMemoryCounter(true);
1719       }
1720       RETURN_IF_CUPTI_ERROR(cupti_interface_->ActivityEnable(activity));
1721     }
1722   }
1723   activity_tracing_enabled_ = true;
1724   return Status::OK();
1725 }
1726 
DisableActivityTracing()1727 Status CuptiTracer::DisableActivityTracing() {
1728   if (activity_tracing_enabled_) {
1729     VLOG(1) << "Disabling activity tracing for "
1730             << option_->activities_selected.size() << " activities";
1731     for (auto activity : option_->activities_selected) {
1732       VLOG(1) << "Disabling activity tracing for: " << activity;
1733       if (activity == CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER) {
1734         ConfigureActivityUnifiedMemoryCounter(false);
1735       }
1736       RETURN_IF_CUPTI_ERROR(cupti_interface_->ActivityDisable(activity));
1737     }
1738     option_->activities_selected.clear();
1739 
1740     VLOG(1) << "Flushing CUPTI activity buffer";
1741     RETURN_IF_CUPTI_ERROR(
1742         cupti_interface_->ActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED));
1743     LOG(INFO) << "CUPTI activity buffer flushed";
1744   }
1745   activity_tracing_enabled_ = false;
1746   return Status::OK();
1747 }
1748 
Finalize()1749 Status CuptiTracer::Finalize() {
1750   if (option_->cupti_finalize) {
1751     VLOG(1) << "CuptiFinalize";
1752     RETURN_IF_CUPTI_ERROR(cupti_interface_->Finalize());
1753   }
1754   return Status::OK();
1755 }
1756 
GetTimestamp()1757 /*static*/ uint64 CuptiTracer::GetTimestamp() {
1758   uint64_t tsc;
1759   CuptiInterface *cupti_interface = GetCuptiInterface();
1760   if (cupti_interface && cupti_interface->GetTimestamp(&tsc) == CUPTI_SUCCESS) {
1761     return tsc;
1762   }
1763   // Return 0 on error. If an activity timestamp is 0, the activity will be
1764   // dropped during time normalization.
1765   return 0;
1766 }
1767 
HandleNVTXCallback(CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)1768 Status CuptiTracer::HandleNVTXCallback(CUpti_CallbackId cbid,
1769                                        const CUpti_CallbackData *cbdata) {
1770   const CUpti_NvtxData *pdata =
1771       reinterpret_cast<const CUpti_NvtxData *>(cbdata);
1772   if (cbid == CUPTI_CBID_NVTX_nvtxDomainRangePushEx) {
1773     const nvtxDomainRangePushEx_params *params =
1774         reinterpret_cast<const nvtxDomainRangePushEx_params *>(
1775             pdata->functionParams);
1776     // TODO(profiler): The messageType is actually NVTX_MESSAGE_TYPE_REGISTERED
1777     // (which is 3), However it seems to me that we can not get the registered
1778     // string from nvtxDomainRegisterStringA_params. If we reinterpret the
1779     // payload as ascii, it happen to work.
1780     NVTXRangeTracker::EnterRange(params->core.eventAttrib->message.ascii);
1781   } else if (cbid == CUPTI_CBID_NVTX_nvtxDomainRangePop) {
1782     NVTXRangeTracker::ExitRange();
1783   }
1784   return Status::OK();
1785 }
1786 
HandleCallback(CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)1787 Status CuptiTracer::HandleCallback(CUpti_CallbackDomain domain,
1788                                    CUpti_CallbackId cbid,
1789                                    const CUpti_CallbackData *cbdata) {
1790   if (!api_tracing_enabled_) return Status::OK();  // already unsubscribed.
1791   if (!cupti_driver_api_hook_) return Status::OK();  // already unsubscribed.
1792   if (domain == CUPTI_CB_DOMAIN_NVTX) return HandleNVTXCallback(cbid, cbdata);
1793   if (domain != CUPTI_CB_DOMAIN_DRIVER_API) return Status::OK();
1794   if (internalCuCall) return Status::OK();
1795 
1796   if (cbdata->context == nullptr) {
1797     // API callback is called before any CUDA context is created.
1798     // This is expected to be rare, and we ignore this case.
1799     VLOG(3) << "API callback received before creation of CUDA context\n";
1800     return errors::Internal("cutpi callback without context");
1801   }
1802 
1803   // Grab a correct device ID.
1804   uint32 device_id = -1;
1805   RETURN_IF_CUPTI_ERROR(
1806       cupti_interface_->GetDeviceId(cbdata->context, &device_id));
1807   if (device_id >= num_gpus_) {
1808     return errors::Internal("Invalid device id:", device_id);
1809   }
1810 
1811   if (cbdata->callbackSite == CUPTI_API_ENTER) {
1812     TF_RETURN_IF_ERROR(cupti_driver_api_hook_->OnDriverApiEnter(
1813         device_id, domain, cbid, cbdata));
1814   } else if (cbdata->callbackSite == CUPTI_API_EXIT) {
1815     // Set up the map from correlation id to annotation string.
1816     const auto &annotation = AnnotationStack::Get();
1817     if (!annotation.empty()) {
1818       if (cbid ==
1819           CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice) {
1820         // Kernels are launched on different devices by this API call, therefore
1821         // we need to populate per device annotation map respectively.
1822         for (int i = 0; i < num_gpus_; ++i) {
1823           collector_->annotation_map()->Add(i, cbdata->correlationId,
1824                                             annotation, "");
1825         }
1826       } else {
1827         absl::string_view nvtx_range = NVTXRangeTracker::CurrentRange();
1828         collector_->annotation_map()->Add(device_id, cbdata->correlationId,
1829                                           annotation, nvtx_range);
1830       }
1831     }
1832 
1833     TF_RETURN_IF_ERROR(cupti_driver_api_hook_->OnDriverApiExit(
1834         device_id, domain, cbid, cbdata));
1835   }
1836   return Status::OK();
1837 }
1838 
ConfigureActivityUnifiedMemoryCounter(bool enable)1839 void CuptiTracer::ConfigureActivityUnifiedMemoryCounter(bool enable) {
1840   CUpti_ActivityUnifiedMemoryCounterConfig config[2];
1841   // By experiments, currently only measurements from these two activities are
1842   // trustworthy. Others like GPU page fault may be problematic.
1843   config[0].kind =
1844       CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD;
1845   config[1].kind =
1846       CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH;
1847 
1848   for (size_t i = 0; i < 2; i++) {
1849     config[i].enable = enable;
1850   }
1851 
1852   CUptiResult res;
1853 
1854   res = cupti_interface_->ActivityConfigureUnifiedMemoryCounter(config, 2);
1855   if (res == CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED) {
1856     LOG(ERROR) << "Unified memory is not supported on the "
1857                   "underlying platform.\n";
1858   } else if (res == CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_DEVICE) {
1859     LOG(ERROR) << "Unified memory is not supported on the device.\n";
1860   } else if (res == CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_NON_P2P_DEVICES) {
1861     LOG(ERROR) << "Unified memory is not supported on the "
1862                   "non-P2P multi-gpu setup.\n";
1863   } else if (res != CUPTI_SUCCESS) {
1864     const char *errstr = "";
1865     cuptiGetResultString(res, &errstr);
1866     LOG(ERROR) << "Error while enabling unified memory profiling: " << errstr;
1867   } else {
1868     VLOG(1) << "Configuring Unified memory profiling: " << res;
1869   }
1870 }
1871 
ProcessActivityBuffer(CUcontext context,uint32_t stream_id,uint8_t * buffer,size_t size)1872 Status CuptiTracer::ProcessActivityBuffer(CUcontext context, uint32_t stream_id,
1873                                           uint8_t *buffer, size_t size) {
1874   if (!activity_tracing_enabled_) {
1875     LOG(WARNING) << "CUPTI activity buffer is freed after flush.";
1876     return Status::OK();
1877   }
1878   if (cupti_interface_->Disabled()) return errors::Internal("Disabled.");
1879 
1880   CUpti_Activity *record = nullptr;
1881   while (true) {
1882     CUptiResult status =
1883         cupti_interface_->ActivityGetNextRecord(buffer, size, &record);
1884     if (status == CUPTI_SUCCESS) {
1885       switch (record->kind) {
1886         case CUPTI_ACTIVITY_KIND_KERNEL:  // sequential
1887         case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL:
1888           AddKernelActivityEvent(
1889               collector_, reinterpret_cast<CUpti_ActivityKernel4 *>(record));
1890           break;
1891         case CUPTI_ACTIVITY_KIND_MEMCPY:
1892           AddMemcpyActivityEvent(
1893               collector_, reinterpret_cast<CUpti_ActivityMemcpy *>(record));
1894           break;
1895         case CUPTI_ACTIVITY_KIND_MEMCPY2:
1896           AddMemcpy2ActivityEvent(
1897               collector_, reinterpret_cast<CUpti_ActivityMemcpy2 *>(record));
1898           break;
1899         case CUPTI_ACTIVITY_KIND_OVERHEAD:
1900           AddCuptiOverheadActivityEvent(
1901               collector_, reinterpret_cast<CUpti_ActivityOverhead *>(record));
1902           break;
1903         case CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER:
1904           AddUnifiedMemoryActivityEvent(
1905               collector_,
1906               reinterpret_cast<CUpti_ActivityUnifiedMemoryCounter2 *>(record));
1907           break;
1908         case CUPTI_ACTIVITY_KIND_MEMORY: {
1909           AddMemoryActivityEvent(
1910               collector_, reinterpret_cast<CUpti_ActivityMemory *>(record));
1911         } break;
1912         case CUPTI_ACTIVITY_KIND_MEMSET:
1913           AddMemsetActivityEvent(
1914               collector_, reinterpret_cast<CUpti_ActivityMemset *>(record));
1915           break;
1916         case CUPTI_ACTIVITY_KIND_SYNCHRONIZATION:
1917           AddSynchronizationActivityEvent(
1918               collector_,
1919               reinterpret_cast<CUpti_ActivitySynchronization *>(record));
1920           break;
1921         default:
1922           VLOG(3) << "Activity type " << record->kind << " is not supported.";
1923           break;
1924       }
1925     } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
1926       break;
1927     } else {
1928       return errors::Internal("Parse cupti activity buffer error.");
1929     }
1930   }
1931 
1932   // Report dropped records.
1933   size_t dropped;
1934   RETURN_IF_CUPTI_ERROR(cupti_interface_->ActivityGetNumDroppedRecords(
1935       context, stream_id, &dropped));
1936   if (dropped != 0) {
1937     uint32 device_id = -1;
1938     RETURN_IF_CUPTI_ERROR(cupti_interface_->GetDeviceId(context, &device_id));
1939     collector_->OnEventsDropped("cupti activity buffer full", dropped);
1940   }
1941   return Status::OK();
1942 }
1943 
ErrorIfAny()1944 /*static*/ std::string CuptiTracer::ErrorIfAny() {
1945   if (CuptiTracer::NumGpus() == 0) {
1946     return ErrorWithHostname("No GPU detected.");
1947   } else if (CuptiTracer::GetCuptiTracerSingleton()->NeedRootAccess()) {
1948     return ErrorWithHostname(
1949         "Insufficient privilege to run libcupti (you need root permission).");
1950   } else if (CuptiTracer::GetTimestamp() == 0) {
1951     return ErrorWithHostname(
1952         "Failed to load libcupti (is it installed and accessible?)");
1953   }
1954   return "";
1955 }
1956 
1957 }  // namespace profiler
1958 }  // namespace tensorflow
1959