1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/core/profiler/backends/gpu/cupti_tracer.h"
17
18 #include "absl/container/flat_hash_map.h"
19 #include "absl/container/flat_hash_set.h"
20 #include "absl/container/node_hash_map.h"
21 #include "absl/container/node_hash_set.h"
22 #include "third_party/gpus/cuda/extras/CUPTI/include/cupti_activity.h"
23 #include "third_party/gpus/cuda/extras/CUPTI/include/generated_nvtx_meta.h"
24 #include "tensorflow/core/lib/gtl/cleanup.h"
25 #include "tensorflow/core/platform/env.h"
26 #include "tensorflow/core/platform/errors.h"
27 #include "tensorflow/core/platform/host_info.h"
28 #include "tensorflow/core/platform/logging.h"
29 #include "tensorflow/core/platform/macros.h"
30 #include "tensorflow/core/profiler/backends/cpu/annotation_stack.h"
31 #include "tensorflow/core/profiler/backends/gpu/cupti_collector.h"
32 #include "tensorflow/core/profiler/backends/gpu/nvtx_utils.h"
33
34 namespace tensorflow {
35 namespace profiler {
36
37 namespace {
38
39 // CUPTI from CUDA 11.6 adds information about the hardware channel that ops
40 // run on; this makes its way into the channel_id and channel_type fields in the
41 // structs we export.
42 //
43 // Define some type aliases so we can access the hardware channel id if it's
44 // available.
45 #if CUDA_VERSION >= 11060 // CUDA 11.6
46 #define TF_CUPTI_HAS_CHANNEL_ID 1
47 using CuptiActivityKernelTy = CUpti_ActivityKernel7;
48 using CuptiActivityMemcpyTy = CUpti_ActivityMemcpy5;
49 using CuptiActivityMemcpyP2PTy = CUpti_ActivityMemcpyPtoP4;
50 using CuptiActivityMemsetTy = CUpti_ActivityMemset4;
51 #else
52 using CuptiActivityKernelTy = CUpti_ActivityKernel4;
53 using CuptiActivityMemcpyTy = CUpti_ActivityMemcpy;
54 using CuptiActivityMemcpyP2PTy = CUpti_ActivityMemcpy2;
55 using CuptiActivityMemsetTy = CUpti_ActivityMemset;
56 #endif
57
58 static thread_local int internalCuCall = 0;
59
60 // Temporary disable cupti api tracing for this thread during the life scope of
61 // this class. Used for the API calls that initiated by us.
62 class CuptiApiTracingDisabler {
63 public:
CuptiApiTracingDisabler()64 CuptiApiTracingDisabler() { internalCuCall++; }
~CuptiApiTracingDisabler()65 ~CuptiApiTracingDisabler() { internalCuCall--; }
66 };
67
ToStatus(CUptiResult result)68 Status ToStatus(CUptiResult result) {
69 if (result == CUPTI_SUCCESS) {
70 return OkStatus();
71 }
72 const char *str = nullptr;
73 cuptiGetResultString(result, &str);
74 return errors::Unavailable("CUPTI error: ", str ? str : "<unknown>");
75 }
76
ToStatus(CUresult result)77 Status ToStatus(CUresult result) {
78 if (result == CUDA_SUCCESS) {
79 return OkStatus();
80 }
81 const char *str = nullptr;
82 cuGetErrorName(result, &str);
83 return errors::Unavailable("CUDA error: ", str ? str : "<unknown>");
84 }
85
LogIfError(const Status & status)86 inline void LogIfError(const Status &status) {
87 if (status.ok()) return;
88 LOG(ERROR) << status.error_message();
89 }
90
91 // Maps an OverheadKind enum to a const string.
getActivityOverheadKindString(CUpti_ActivityOverheadKind kind)92 const char *getActivityOverheadKindString(CUpti_ActivityOverheadKind kind) {
93 switch (kind) {
94 case CUPTI_ACTIVITY_OVERHEAD_DRIVER_COMPILER:
95 return "COMPILER";
96 case CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH:
97 return "BUFFER_FLUSH";
98 case CUPTI_ACTIVITY_OVERHEAD_CUPTI_INSTRUMENTATION:
99 return "INSTRUMENTATION";
100 case CUPTI_ACTIVITY_OVERHEAD_CUPTI_RESOURCE:
101 return "RESOURCE";
102 default:
103 break;
104 }
105 return "<UNKNOWN>";
106 }
107
getActivityUnifiedMemoryKindString(CUpti_ActivityUnifiedMemoryCounterKind kind)108 const char *getActivityUnifiedMemoryKindString(
109 CUpti_ActivityUnifiedMemoryCounterKind kind) {
110 switch (kind) {
111 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD:
112 return "UM_BYTES_TRANSFER_HTOD";
113 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH:
114 return "UM_BYTES_TRANSFER_DTOH";
115 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT:
116 return "UM_CPU_PAGE_FAULT";
117 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT:
118 return "UM_GPU_PAGE_FAULT";
119 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING:
120 return "UM_THRASHING";
121 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING:
122 return "UM_THROTTLING";
123 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP:
124 return "UM_REMOTE_MAP";
125 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOD:
126 return "UM_BYTES_TRANSFER_DTOD";
127 default:
128 break;
129 }
130 return "<UNKNOWN>";
131 }
132
133 // CUPTI_ERROR_INSUFFICIENT_PRIVILEGES is introduced at CUDA 10.1.
134 #if CUDA_VERSION <= 10000
135 #define CUPTI_ERROR_INSUFFICIENT_PRIVILEGES 35
136 #endif
137
138 #define RETURN_IF_CUPTI_ERROR(expr) \
139 do { \
140 CUptiResult status = expr; \
141 if (ABSL_PREDICT_FALSE(status != CUPTI_SUCCESS)) { \
142 const char *errstr = ""; \
143 cupti_interface_->GetResultString(status, &errstr); \
144 LOG(ERROR) << "function " << #expr << "failed with error " << errstr; \
145 if (status == CUPTI_ERROR_INSUFFICIENT_PRIVILEGES) { \
146 return errors::PermissionDenied("CUPTI need root access!"); \
147 } else { \
148 return errors::Internal("CUPTI call error", errstr); \
149 } \
150 } \
151 } while (false)
152
Bytes2D(const CUDA_MEMCPY2D * p)153 size_t Bytes2D(const CUDA_MEMCPY2D *p) { return p->Height * p->WidthInBytes; }
154
Bytes3D(const CUDA_MEMCPY3D * p)155 size_t Bytes3D(const CUDA_MEMCPY3D *p) {
156 return p->Depth * p->Height * p->WidthInBytes;
157 }
158
159 template <typename CudaMemcpy>
MemcpyKind(const CudaMemcpy * p)160 CuptiTracerEventType MemcpyKind(const CudaMemcpy *p) {
161 if (p->srcMemoryType == CU_MEMORYTYPE_HOST &&
162 p->dstMemoryType == CU_MEMORYTYPE_DEVICE) {
163 return CuptiTracerEventType::MemcpyH2D;
164 }
165 if (p->srcMemoryType == CU_MEMORYTYPE_DEVICE &&
166 p->dstMemoryType == CU_MEMORYTYPE_HOST) {
167 return CuptiTracerEventType::MemcpyD2H;
168 }
169 if (p->srcMemoryType == CU_MEMORYTYPE_DEVICE &&
170 p->dstMemoryType == CU_MEMORYTYPE_DEVICE) {
171 return CuptiTracerEventType::MemcpyD2D;
172 }
173 return CuptiTracerEventType::Unsupported;
174 }
175
176 std::tuple<size_t /*bytes*/, CuptiTracerEventType, bool /*async*/>
DecodeDriverMemcpy(CUpti_CallbackId cbid,const void * params)177 DecodeDriverMemcpy(CUpti_CallbackId cbid, const void *params) {
178 switch (cbid) {
179 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2: {
180 const auto *p = reinterpret_cast<const cuMemcpyHtoD_v2_params *>(params);
181 return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyH2D,
182 false);
183 }
184 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2: {
185 const auto *p =
186 reinterpret_cast<const cuMemcpyHtoDAsync_v2_params *>(params);
187 return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyH2D,
188 true);
189 }
190 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2: {
191 const auto *p = reinterpret_cast<const cuMemcpyDtoH_v2_params *>(params);
192 return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyD2H,
193 false);
194 }
195 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2: {
196 const auto *p =
197 reinterpret_cast<const cuMemcpyDtoHAsync_v2_params *>(params);
198 return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyD2H,
199 true);
200 }
201 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2: {
202 const auto *p = reinterpret_cast<const cuMemcpyDtoD_v2_params *>(params);
203 return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyD2D,
204 false);
205 }
206 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2: {
207 const auto *p =
208 reinterpret_cast<const cuMemcpyDtoDAsync_v2_params *>(params);
209 return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyD2D,
210 true);
211 }
212 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy: {
213 const auto *p = reinterpret_cast<const cuMemcpy_params *>(params);
214 return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyOther,
215 false);
216 }
217 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync: {
218 const auto *p = reinterpret_cast<const cuMemcpyAsync_params *>(params);
219 return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyOther,
220 true);
221 }
222 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2: {
223 const auto *p = reinterpret_cast<const cuMemcpy2D_v2_params *>(params);
224 return std::make_tuple(Bytes2D(p->pCopy), MemcpyKind(p->pCopy), false);
225 }
226 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2: {
227 const auto *p =
228 reinterpret_cast<const cuMemcpy2DAsync_v2_params *>(params);
229 return std::make_tuple(Bytes2D(p->pCopy), MemcpyKind(p->pCopy), true);
230 }
231 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2: {
232 const auto *p = reinterpret_cast<const cuMemcpy3D_v2_params *>(params);
233 return std::make_tuple(Bytes3D(p->pCopy), MemcpyKind(p->pCopy), true);
234 }
235 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2: {
236 const auto *p =
237 reinterpret_cast<const cuMemcpy3DAsync_v2_params *>(params);
238 return std::make_tuple(Bytes3D(p->pCopy), MemcpyKind(p->pCopy), true);
239 }
240 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer: {
241 const auto *p2p_params =
242 reinterpret_cast<const cuMemcpyPeer_params *>(params);
243 return std::make_tuple(p2p_params->ByteCount,
244 CuptiTracerEventType::MemcpyP2P, false);
245 }
246 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync: {
247 const auto *p2p_params =
248 reinterpret_cast<const cuMemcpyPeerAsync_params *>(params);
249 return std::make_tuple(p2p_params->ByteCount,
250 CuptiTracerEventType::MemcpyP2P, true);
251 }
252 default: {
253 LOG(ERROR) << "Unsupported memcpy activity observed: " << cbid;
254 return std::make_tuple(0, CuptiTracerEventType::Unsupported, false);
255 }
256 }
257 }
258
259 std::tuple<size_t /*bytes*/, CuptiTracerEventType, bool /*async*/>
DecodeDriverMemset(CUpti_CallbackId cbid,const void * params)260 DecodeDriverMemset(CUpti_CallbackId cbid, const void *params) {
261 switch (cbid) {
262 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD8_v2: {
263 const auto *p = reinterpret_cast<const cuMemsetD8_v2_params *>(params);
264 return std::make_tuple(p->N, CuptiTracerEventType::Memset, false);
265 }
266 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD16_v2: {
267 const auto *p = reinterpret_cast<const cuMemsetD16_v2_params *>(params);
268 return std::make_tuple(p->N, CuptiTracerEventType::Memset, false);
269 }
270 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD32_v2: {
271 const auto *p = reinterpret_cast<const cuMemsetD32_v2_params *>(params);
272 return std::make_tuple(p->N, CuptiTracerEventType::Memset, false);
273 }
274 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8_v2: {
275 const auto *p = reinterpret_cast<const cuMemsetD2D8_v2_params *>(params);
276 return std::make_tuple(p->dstPitch * p->Height,
277 CuptiTracerEventType::Memset, false);
278 }
279 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16_v2: {
280 const auto *p = reinterpret_cast<const cuMemsetD2D16_v2_params *>(params);
281 return std::make_tuple(p->dstPitch * p->Height,
282 CuptiTracerEventType::Memset, false);
283 }
284 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32_v2: {
285 const auto *p = reinterpret_cast<const cuMemsetD2D32_v2_params *>(params);
286 return std::make_tuple(p->dstPitch * p->Height,
287 CuptiTracerEventType::Memset, false);
288 }
289 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD8Async: {
290 const auto *p = reinterpret_cast<const cuMemsetD8Async_params *>(params);
291 return std::make_tuple(p->N, CuptiTracerEventType::Memset, true);
292 }
293 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD16Async: {
294 const auto *p = reinterpret_cast<const cuMemsetD16Async_params *>(params);
295 return std::make_tuple(p->N, CuptiTracerEventType::Memset, true);
296 }
297 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD32Async: {
298 const auto *p = reinterpret_cast<const cuMemsetD32Async_params *>(params);
299 return std::make_tuple(p->N, CuptiTracerEventType::Memset, true);
300 }
301 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8Async: {
302 const auto *p =
303 reinterpret_cast<const cuMemsetD2D8Async_params *>(params);
304 return std::make_tuple(p->dstPitch * p->Height,
305 CuptiTracerEventType::Memset, true);
306 }
307 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16Async: {
308 const auto *p =
309 reinterpret_cast<const cuMemsetD2D16Async_params *>(params);
310 return std::make_tuple(p->dstPitch * p->Height,
311 CuptiTracerEventType::Memset, true);
312 }
313 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32Async: {
314 const auto *p =
315 reinterpret_cast<const cuMemsetD2D32Async_params *>(params);
316 return std::make_tuple(p->dstPitch * p->Height,
317 CuptiTracerEventType::Memset, true);
318 }
319 default: {
320 LOG(ERROR) << "Unsupported memset activity observed: " << cbid;
321 return std::make_tuple(0, CuptiTracerEventType::Unsupported, false);
322 }
323 }
324 }
325
326 // Cupti callback corresponding to a driver or runtime API. This global function
327 // is invoked twice for each API: at entry and at exit. The cbdata
328 // parameter is guaranteed by Cupti to be thread-safe. Most invocations are
329 // dropped to the floor and entry/exit is tracked for the APIs we deem
330 // performance-relevant.
ApiCallback(void * user_data,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)331 void CUPTIAPI ApiCallback(void *user_data, CUpti_CallbackDomain domain,
332 CUpti_CallbackId cbid,
333 const CUpti_CallbackData *cbdata) {
334 CuptiTracer *tracer = reinterpret_cast<CuptiTracer *>(user_data);
335 tracer->HandleCallback(domain, cbid, cbdata).IgnoreError();
336 }
337
338 // Callback which is invoked when an empty buffer is requested by CUPTI.
339 // Allocates an empty aligned-memory buffer. The buffer is used by CUPTI as a
340 // ring buffer where device maintains activity profiles that have been
341 // collected.
RequestCuptiActivityBuffer(uint8_t ** buffer,size_t * size,size_t * maxNumRecords)342 void CUPTIAPI RequestCuptiActivityBuffer(uint8_t **buffer, size_t *size,
343 size_t *maxNumRecords) {
344 CuptiTracer::GetCuptiTracerSingleton()->RequestActivityBuffer(buffer, size);
345 VLOG(3) << "Requested CUPTI Buffer, buffer=" << std::hex
346 << reinterpret_cast<uintptr_t>(*buffer) << std::dec
347 << " size=" << *size;
348 // Request CUPTI to fill as many records as possible in the buffer.
349 *maxNumRecords = 0;
350 }
351
352 // Callback which is invoked when a buffer containing activity records is
353 // available from CUPTI. Processes the buffer after reading activity records
354 // from it.
ProcessCuptiActivityBuffer(CUcontext context,uint32_t stream_id,uint8_t * buffer,size_t size,size_t valid_size)355 void CUPTIAPI ProcessCuptiActivityBuffer(CUcontext context, uint32_t stream_id,
356 uint8_t *buffer, size_t size,
357 size_t valid_size) {
358 VLOG(3) << "Processing CUPTI Buffer, buffer:" << std::hex
359 << reinterpret_cast<uintptr_t>(buffer) << std::dec
360 << " size: " << size << " valid_size: " << valid_size;
361 VLOG(3) << "Activity profile for stream " << stream_id;
362
363 Status status = CuptiTracer::GetCuptiTracerSingleton()->ProcessActivityBuffer(
364 context, stream_id, buffer, valid_size);
365 if (!status.ok()) {
366 LOG(ERROR) << status;
367 }
368 }
369
AddKernelEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)370 void AddKernelEventUponApiExit(CuptiTraceCollector *collector, uint32 device_id,
371 const CUpti_CallbackData *cbdata,
372 uint64 start_time, uint64 end_time) {
373 CuptiTracerEvent event{};
374 event.type = CuptiTracerEventType::Kernel;
375 event.source = CuptiTracerEventSource::DriverCallback;
376 event.name = cbdata->symbolName ? cbdata->symbolName : cbdata->functionName;
377 event.start_time_ns = start_time;
378 event.end_time_ns = end_time;
379 event.thread_id = Env::Default()->GetCurrentThreadId();
380 event.device_id = device_id;
381 event.context_id = cbdata->contextUid;
382 event.correlation_id = cbdata->correlationId;
383 VLOG(3) << "Cuda Kernel launch API exit. name=" << event.name;
384 collector->AddEvent(std::move(event));
385 }
386
387 // Performs the actual callback for both normal and P2P memcpy operations.
PopulateMemcpyCallbackEvent(CuptiTracerEventType type,const CUpti_CallbackData * cbdata,size_t num_bytes,uint32 src_device,uint32 dst_device,bool async,uint64 start_time,uint64 end_time)388 CuptiTracerEvent PopulateMemcpyCallbackEvent(
389 CuptiTracerEventType type, const CUpti_CallbackData *cbdata,
390 size_t num_bytes, uint32 src_device, uint32 dst_device, bool async,
391 uint64 start_time, uint64 end_time) {
392 CuptiTracerEvent event{};
393 event.type = type;
394 event.source = CuptiTracerEventSource::DriverCallback;
395 event.start_time_ns = start_time;
396 event.end_time_ns = end_time;
397 event.thread_id = Env::Default()->GetCurrentThreadId();
398 event.device_id = src_device;
399 event.context_id = cbdata->contextUid;
400 event.correlation_id = cbdata->correlationId;
401 event.memcpy_info.num_bytes = num_bytes;
402 event.memcpy_info.destination = dst_device;
403 event.memcpy_info.async = async;
404 // These are not populated during callback for API activities.
405 event.memcpy_info.copy_kind = CUPTI_ACTIVITY_MEMCPY_KIND_UNKNOWN;
406 event.memcpy_info.dst_mem_kind = CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN;
407 event.memcpy_info.src_mem_kind = CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN;
408 return event;
409 }
410
AddNormalMemcpyEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)411 void AddNormalMemcpyEventUponApiExit(CuptiTraceCollector *collector,
412 uint32 device_id, CUpti_CallbackId cbid,
413 const CUpti_CallbackData *cbdata,
414 uint64 start_time, uint64 end_time) {
415 size_t num_bytes;
416 CuptiTracerEventType type;
417 bool async;
418 std::tie(num_bytes, type, async) =
419 DecodeDriverMemcpy(cbid, cbdata->functionParams);
420
421 VLOG(3) << "Cuda Memcpy API exit. sz=" << num_bytes;
422 CuptiTracerEvent event =
423 PopulateMemcpyCallbackEvent(type, cbdata, num_bytes, device_id, device_id,
424 async, start_time, end_time);
425 collector->AddEvent(std::move(event));
426 }
427
AddCuMemsetEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)428 void AddCuMemsetEventUponApiExit(CuptiTraceCollector *collector,
429 uint32 device_id, CUpti_CallbackId cbid,
430 const CUpti_CallbackData *cbdata,
431 uint64 start_time, uint64 end_time) {
432 // We are casting all variants of cuMemset to cuMemsetD8 for accessing the
433 // first member attribute, a CUdeviceptr.
434 const auto *params =
435 static_cast<const cuMemsetD8_v2_params *>(cbdata->functionParams);
436 size_t num_bytes;
437 bool async;
438 CuptiTracerEventType type;
439 std::tie(num_bytes, type, async) =
440 DecodeDriverMemset(cbid, cbdata->functionParams);
441
442 CuptiTracerEvent event{};
443 event.type = type;
444 event.source = CuptiTracerEventSource::DriverCallback;
445 event.start_time_ns = start_time;
446 event.end_time_ns = end_time;
447 event.thread_id = Env::Default()->GetCurrentThreadId();
448 event.device_id = device_id;
449 event.context_id = cbdata->contextUid;
450 event.correlation_id = cbdata->correlationId;
451 event.memset_info.num_bytes = num_bytes;
452 // memset_info.kind cannot be determined from API.
453 event.memset_info.async = async;
454 VLOG(3) << "Cuda Memset API exit."
455 << " dptr=" << reinterpret_cast<void *>(params->dstDevice)
456 << " sz=" << num_bytes;
457 collector->AddEvent(std::move(event));
458 }
459
AddP2PMemcpyEventUponApiExit(CuptiTraceCollector * collector,CuptiInterface * cupti_interface,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)460 void AddP2PMemcpyEventUponApiExit(CuptiTraceCollector *collector,
461 CuptiInterface *cupti_interface,
462 uint32 device_id, CUpti_CallbackId cbid,
463 const CUpti_CallbackData *cbdata,
464 uint64 start_time, uint64 end_time) {
465 size_t num_bytes;
466 CuptiTracerEventType type;
467 bool async;
468 std::tie(num_bytes, type, async) =
469 DecodeDriverMemcpy(cbid, cbdata->functionParams);
470
471 uint32 dst_device = -1, src_device = -1;
472 const auto *p2p_params =
473 static_cast<const cuMemcpyPeer_params *>(cbdata->functionParams);
474 cupti_interface->GetDeviceId(p2p_params->srcContext, &src_device);
475 cupti_interface->GetDeviceId(p2p_params->dstContext, &dst_device);
476 VLOG(3) << "Cuda P2P Memcpy API exit, src: " << src_device
477 << " dst: " << dst_device << " size:" << num_bytes;
478 CuptiTracerEvent event =
479 PopulateMemcpyCallbackEvent(type, cbdata, num_bytes, src_device,
480 dst_device, async, start_time, end_time);
481 collector->AddEvent(std::move(event));
482 }
483
AddCuMemAllocEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)484 void AddCuMemAllocEventUponApiExit(CuptiTraceCollector *collector,
485 uint32 device_id, CUpti_CallbackId cbid,
486 const CUpti_CallbackData *cbdata,
487 uint64 start_time, uint64 end_time) {
488 const auto *params =
489 static_cast<const cuMemAlloc_v2_params *>(cbdata->functionParams);
490 CuptiTracerEvent event{};
491 event.type = CuptiTracerEventType::MemoryAlloc;
492 event.source = CuptiTracerEventSource::DriverCallback;
493 event.name = cbdata->functionName;
494 event.start_time_ns = start_time;
495 event.end_time_ns = end_time;
496 event.thread_id = Env::Default()->GetCurrentThreadId();
497 event.device_id = device_id;
498 event.context_id = cbdata->contextUid;
499 event.correlation_id = cbdata->correlationId;
500 event.memalloc_info.num_bytes = params->bytesize;
501 VLOG(3) << "Cuda MemAlloc API exit."
502 << " dptr=" << reinterpret_cast<void *>(*params->dptr)
503 << " sz=" << params->bytesize;
504 collector->AddEvent(std::move(event));
505 }
506
AddCuMemAllocPitchEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)507 void AddCuMemAllocPitchEventUponApiExit(CuptiTraceCollector *collector,
508 uint32 device_id, CUpti_CallbackId cbid,
509 const CUpti_CallbackData *cbdata,
510 uint64 start_time, uint64 end_time) {
511 const auto *params =
512 static_cast<const cuMemAllocPitch_v2_params *>(cbdata->functionParams);
513 CuptiTracerEvent event{};
514 event.type = CuptiTracerEventType::MemoryAlloc;
515 event.source = CuptiTracerEventSource::DriverCallback;
516 event.name = cbdata->functionName;
517 event.start_time_ns = start_time;
518 event.end_time_ns = end_time;
519 event.thread_id = Env::Default()->GetCurrentThreadId();
520 event.device_id = device_id;
521 event.context_id = cbdata->contextUid;
522 event.correlation_id = cbdata->correlationId;
523 const size_t size_in_bytes = *params->pPitch * params->Height;
524 event.memalloc_info.num_bytes = size_in_bytes;
525 VLOG(3) << "Cuda MemAllocPitch API exit."
526 << " dptr=" << reinterpret_cast<void *>(*params->dptr)
527 << " sz=" << size_in_bytes;
528 collector->AddEvent(std::move(event));
529 }
530
AddCuMemFreeEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)531 void AddCuMemFreeEventUponApiExit(CuptiTraceCollector *collector,
532 uint32 device_id, CUpti_CallbackId cbid,
533 const CUpti_CallbackData *cbdata,
534 uint64 start_time, uint64 end_time) {
535 const auto *params =
536 static_cast<const cuMemFree_v2_params *>(cbdata->functionParams);
537 CuptiTracerEvent event{};
538 event.type = CuptiTracerEventType::MemoryFree;
539 event.source = CuptiTracerEventSource::DriverCallback;
540 event.name = cbdata->functionName;
541 event.start_time_ns = start_time;
542 event.end_time_ns = end_time;
543 event.thread_id = Env::Default()->GetCurrentThreadId();
544 event.device_id = device_id;
545 event.context_id = cbdata->contextUid;
546 event.correlation_id = cbdata->correlationId;
547 VLOG(3) << "Cuda MemFree API exit."
548 << " dptr=" << reinterpret_cast<void *>(params->dptr);
549 collector->AddEvent(std::move(event));
550 }
551
AddGenericEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)552 void AddGenericEventUponApiExit(CuptiTraceCollector *collector,
553 uint32 device_id, CUpti_CallbackId cbid,
554 const CUpti_CallbackData *cbdata,
555 uint64 start_time, uint64 end_time) {
556 CuptiTracerEvent event{};
557 event.type = CuptiTracerEventType::Generic;
558 event.source = CuptiTracerEventSource::DriverCallback;
559 event.name = cbdata->functionName;
560 event.start_time_ns = start_time;
561 event.end_time_ns = end_time;
562 event.thread_id = Env::Default()->GetCurrentThreadId();
563 event.device_id = device_id;
564 event.context_id = cbdata->contextUid;
565 event.correlation_id = cbdata->correlationId;
566 VLOG(3) << "Observed generic API exit."
567 << " name=" << cbdata->functionName;
568 collector->AddEvent(std::move(event));
569 }
570
AddKernelActivityEvent(CuptiTraceCollector * collector,const CuptiActivityKernelTy * kernel)571 void AddKernelActivityEvent(CuptiTraceCollector *collector,
572 const CuptiActivityKernelTy *kernel) {
573 CuptiTracerEvent event{};
574 event.type = CuptiTracerEventType::Kernel;
575 event.source = CuptiTracerEventSource::Activity;
576 event.name = kernel->name;
577 event.start_time_ns = kernel->start;
578 event.end_time_ns = kernel->end;
579 event.device_id = kernel->deviceId;
580 event.context_id = kernel->contextId;
581 event.stream_id = kernel->streamId;
582 event.correlation_id = kernel->correlationId;
583 AnnotationMap::AnnotationInfo info = collector->annotation_map()->LookUp(
584 event.device_id, event.correlation_id);
585 event.annotation = info.annotation;
586 event.nvtx_range = info.nvtx_range;
587 event.kernel_info.registers_per_thread = kernel->registersPerThread;
588 event.kernel_info.static_shared_memory_usage = kernel->staticSharedMemory;
589 event.kernel_info.dynamic_shared_memory_usage = kernel->dynamicSharedMemory;
590 event.kernel_info.block_x = kernel->blockX;
591 event.kernel_info.block_y = kernel->blockY;
592 event.kernel_info.block_z = kernel->blockZ;
593 event.kernel_info.grid_x = kernel->gridX;
594 event.kernel_info.grid_y = kernel->gridY;
595 event.kernel_info.grid_z = kernel->gridZ;
596 #if TF_CUPTI_HAS_CHANNEL_ID
597 event.kernel_info.channel_id = kernel->channelID;
598 event.kernel_info.channel_type = kernel->channelType;
599 #endif
600 collector->AddEvent(std::move(event));
601 }
602
AddMemcpyActivityEvent(CuptiTraceCollector * collector,const CuptiActivityMemcpyTy * memcpy)603 void AddMemcpyActivityEvent(CuptiTraceCollector *collector,
604 const CuptiActivityMemcpyTy *memcpy) {
605 CuptiTracerEvent event{};
606 switch (memcpy->copyKind) {
607 case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
608 event.type = CuptiTracerEventType::MemcpyH2D;
609 event.name = "MemcpyH2D";
610 break;
611 case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
612 event.type = CuptiTracerEventType::MemcpyD2H;
613 event.name = "MemcpyD2H";
614 break;
615 case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
616 event.type = CuptiTracerEventType::MemcpyD2D;
617 event.name = "MemcpyD2D";
618 break;
619 case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP:
620 event.type = CuptiTracerEventType::MemcpyP2P;
621 event.name = "MemcpyP2P";
622 break;
623 default:
624 event.type = CuptiTracerEventType::MemcpyOther;
625 event.name = "MemcpyOther";
626 break;
627 }
628
629 event.source = CuptiTracerEventSource::Activity;
630 event.start_time_ns = memcpy->start;
631 event.end_time_ns = memcpy->end;
632 event.device_id = memcpy->deviceId;
633 event.context_id = memcpy->contextId;
634 event.stream_id = memcpy->streamId;
635 event.correlation_id = memcpy->correlationId;
636 AnnotationMap::AnnotationInfo info = collector->annotation_map()->LookUp(
637 event.device_id, event.correlation_id);
638 event.annotation = info.annotation;
639 event.memcpy_info.copy_kind = memcpy->copyKind;
640 event.memcpy_info.num_bytes = memcpy->bytes;
641 event.memcpy_info.destination = memcpy->deviceId;
642 event.memcpy_info.async = memcpy->flags & CUPTI_ACTIVITY_FLAG_MEMCPY_ASYNC;
643 event.memcpy_info.src_mem_kind = memcpy->srcKind;
644 event.memcpy_info.dst_mem_kind = memcpy->dstKind;
645 #if TF_CUPTI_HAS_CHANNEL_ID
646 event.memcpy_info.channel_id = memcpy->channelID;
647 event.memcpy_info.channel_type = memcpy->channelType;
648 #endif
649 collector->AddEvent(std::move(event));
650 }
651
652 // Invokes callback upon peer-2-peer memcpy between different GPU devices.
AddMemcpyP2PActivityEvent(CuptiTraceCollector * collector,const CuptiActivityMemcpyP2PTy * memcpy)653 void AddMemcpyP2PActivityEvent(CuptiTraceCollector *collector,
654 const CuptiActivityMemcpyP2PTy *memcpy) {
655 CuptiTracerEvent event{};
656 event.type = CuptiTracerEventType::MemcpyP2P;
657 event.name = "MemcpyP2P";
658 event.source = CuptiTracerEventSource::Activity;
659 event.start_time_ns = memcpy->start;
660 event.end_time_ns = memcpy->end;
661 event.device_id = memcpy->srcDeviceId;
662 event.context_id = memcpy->contextId;
663 event.stream_id = memcpy->streamId;
664 event.correlation_id = memcpy->correlationId;
665 AnnotationMap::AnnotationInfo info = collector->annotation_map()->LookUp(
666 event.device_id, event.correlation_id);
667 event.annotation = info.annotation;
668 event.memcpy_info.copy_kind = CUPTI_ACTIVITY_MEMCPY_KIND_PTOP;
669 event.memcpy_info.num_bytes = memcpy->bytes;
670 event.memcpy_info.destination = memcpy->dstDeviceId;
671 event.memcpy_info.async = memcpy->flags & CUPTI_ACTIVITY_FLAG_MEMCPY_ASYNC;
672 event.memcpy_info.src_mem_kind = memcpy->srcKind;
673 event.memcpy_info.dst_mem_kind = memcpy->dstKind;
674 #if TF_CUPTI_HAS_CHANNEL_ID
675 event.memcpy_info.channel_id = memcpy->channelID;
676 event.memcpy_info.channel_type = memcpy->channelType;
677 #endif
678 collector->AddEvent(std::move(event));
679 }
680
AddCuptiOverheadActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityOverhead * overhead)681 void AddCuptiOverheadActivityEvent(CuptiTraceCollector *collector,
682 const CUpti_ActivityOverhead *overhead) {
683 CuptiTracerEvent event{};
684 event.type = CuptiTracerEventType::Overhead;
685 event.name = getActivityOverheadKindString(overhead->overheadKind);
686 event.source = CuptiTracerEventSource::Activity;
687 event.start_time_ns = overhead->start;
688 event.end_time_ns = overhead->end;
689 // If the overhead is not related to a device, we assign it to device 0.
690 event.device_id = 0;
691 // NOTE: no correlation id.
692 switch (overhead->objectKind) {
693 case CUPTI_ACTIVITY_OBJECT_UNKNOWN:
694 // Don't know how to deal with such activities because of we need either
695 // attribute it to a GPU stream or a CPU thread.
696 return;
697
698 case CUPTI_ACTIVITY_OBJECT_THREAD:
699 case CUPTI_ACTIVITY_OBJECT_PROCESS:
700 event.thread_id = overhead->objectId.pt.threadId;
701 break;
702 case CUPTI_ACTIVITY_OBJECT_STREAM:
703 event.stream_id = overhead->objectId.dcs.streamId;
704 TF_FALLTHROUGH_INTENDED;
705 case CUPTI_ACTIVITY_OBJECT_DEVICE:
706 case CUPTI_ACTIVITY_OBJECT_CONTEXT:
707 event.device_id = overhead->objectId.dcs.deviceId;
708 break;
709 default:
710 LOG(ERROR) << "Unexpected object kind: " << overhead->objectKind;
711 return;
712 }
713 collector->AddEvent(std::move(event));
714 }
715
AddUnifiedMemoryActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityUnifiedMemoryCounter2 * record)716 void AddUnifiedMemoryActivityEvent(
717 CuptiTraceCollector *collector,
718 const CUpti_ActivityUnifiedMemoryCounter2 *record) {
719 VLOG(3) << "Cuda Unified Memory Activity, kind: " << record->counterKind
720 << " src: " << record->srcId << " dst: " << record->dstId;
721 CuptiTracerEvent event{};
722 event.type = CuptiTracerEventType::UnifiedMemory;
723 event.name = getActivityUnifiedMemoryKindString(record->counterKind);
724 event.source = CuptiTracerEventSource::Activity;
725 event.start_time_ns = record->start;
726 if (record->counterKind ==
727 CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT ||
728 record->counterKind ==
729 CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING ||
730 record->counterKind ==
731 CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP ||
732 record->end <= record->start) {
733 // If the end time is not valid, trim it so that it can be shown on the UI.
734 event.end_time_ns = record->start + 1;
735 } else {
736 event.end_time_ns = record->end;
737 }
738 event.device_id = record->srcId;
739 // NOTE: not context id and correlation id.
740
741 // For visualization purpose, we assign a pseudo stream id for each
742 // record->counterKind of unified memory related events.
743 constexpr int kPseudoStreamId = 0x10000000;
744 event.stream_id = kPseudoStreamId + record->counterKind;
745 event.memcpy_info.copy_kind = CUPTI_ACTIVITY_MEMCPY_KIND_UNKNOWN;
746 // Check whether the activity is byte transfer.
747 if (record->counterKind ==
748 CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD ||
749 record->counterKind ==
750 CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH ||
751 record->counterKind ==
752 CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOD) {
753 event.memcpy_info.num_bytes = record->value;
754 } else {
755 event.memcpy_info.num_bytes = 0;
756 }
757 event.memcpy_info.destination = record->dstId;
758 event.memcpy_info.async = false;
759 collector->AddEvent(std::move(event));
760 }
761
AddMemoryActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityMemory * memory)762 void AddMemoryActivityEvent(CuptiTraceCollector *collector,
763 const CUpti_ActivityMemory *memory) {
764 CuptiTracerEvent event{};
765 event.name = absl::StrCat("Memory ", GetMemoryKindName(memory->memoryKind));
766 event.type = CuptiTracerEventType::MemoryResidency;
767 event.source = CuptiTracerEventSource::Activity;
768 event.start_time_ns = memory->start;
769 event.end_time_ns = std::max(memory->end, memory->start + 1);
770 event.device_id = memory->deviceId;
771 event.context_id = memory->contextId;
772 // Assign to default stream (0) so that event is included during Flush().
773 event.stream_id = 0;
774 event.memory_residency_info.num_bytes = memory->bytes;
775 event.memory_residency_info.mem_kind = memory->memoryKind;
776 event.memory_residency_info.address = memory->address;
777 VLOG(5) << "Cuda activity " << event.name
778 << " addr: " << reinterpret_cast<void *>(memory->address)
779 << " bytes: " << memory->bytes;
780 collector->AddEvent(std::move(event));
781 }
782
AddMemsetActivityEvent(CuptiTraceCollector * collector,const CuptiActivityMemsetTy * memset)783 void AddMemsetActivityEvent(CuptiTraceCollector *collector,
784 const CuptiActivityMemsetTy *memset) {
785 auto mem_kind = memset->memoryKind;
786 CuptiTracerEvent event{};
787 event.type = CuptiTracerEventType::Memset;
788 event.source = CuptiTracerEventSource::Activity;
789 event.name = absl::StrCat("Memset ", mem_kind);
790 event.start_time_ns = memset->start;
791 event.end_time_ns = std::max(memset->end, memset->start + 1);
792 event.device_id = memset->deviceId;
793 event.correlation_id = memset->correlationId;
794 event.context_id = memset->contextId;
795 event.stream_id = memset->streamId;
796 event.memset_info.num_bytes = memset->bytes;
797 event.memset_info.mem_kind = mem_kind;
798 event.memset_info.async = (memset->flags & CUPTI_ACTIVITY_FLAG_MEMSET_ASYNC);
799 #if TF_CUPTI_HAS_CHANNEL_ID
800 event.memset_info.channel_id = memset->channelID;
801 event.memset_info.channel_type = memset->channelType;
802 #endif
803 VLOG(5) << "Cuda activity " << event.name << " bytes: " << memset->bytes
804 << " async: " << event.memset_info.async;
805 collector->AddEvent(std::move(event));
806 }
807
AddSynchronizationActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivitySynchronization * sync)808 void AddSynchronizationActivityEvent(
809 CuptiTraceCollector *collector, const CUpti_ActivitySynchronization *sync) {
810 CuptiTracerEvent event{};
811 event.type = CuptiTracerEventType::Generic;
812 event.source = CuptiTracerEventSource::Activity;
813 switch (sync->type) {
814 case CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_EVENT_SYNCHRONIZE:
815 event.name = "cuEventSynchronize";
816 break;
817 case CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_WAIT_EVENT:
818 event.name = "cuStreamWaitEvent";
819 break;
820 case CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_SYNCHRONIZE:
821 event.name = "cuStreamSynchronize";
822 break;
823 case CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_CONTEXT_SYNCHRONIZE:
824 event.name = "cuCtxSynchronize";
825 break;
826 default:
827 event.name = "unknown synchronization event";
828 break;
829 }
830 event.start_time_ns = sync->start;
831 event.end_time_ns = std::max(sync->end, sync->start + 1);
832 event.correlation_id = sync->correlationId;
833 event.context_id = sync->contextId;
834 VLOG(5) << "Cuda activity " << event.name;
835 collector->AddEvent(std::move(event));
836 }
837
838 // This hook uses cupti activity api to measure device side activities.
839 class CuptiDriverApiHookWithActivityApi : public CuptiDriverApiHook {
840 public:
CuptiDriverApiHookWithActivityApi(const CuptiTracerOptions & option,CuptiInterface * cupti_interface,CuptiTraceCollector * collector)841 CuptiDriverApiHookWithActivityApi(const CuptiTracerOptions &option,
842 CuptiInterface *cupti_interface,
843 CuptiTraceCollector *collector)
844 : option_(option),
845 cupti_interface_(cupti_interface),
846 collector_(collector) {}
847
OnDriverApiEnter(int device_id,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)848 Status OnDriverApiEnter(int device_id, CUpti_CallbackDomain domain,
849 CUpti_CallbackId cbid,
850 const CUpti_CallbackData *cbdata) override {
851 // Stash away the current Cupti timestamp into cbdata.
852 *cbdata->correlationData =
853 option_.required_callback_api_events ? CuptiTracer::GetTimestamp() : 0;
854 return OkStatus();
855 }
OnDriverApiExit(int device_id,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)856 Status OnDriverApiExit(int device_id, CUpti_CallbackDomain domain,
857 CUpti_CallbackId cbid,
858 const CUpti_CallbackData *cbdata) override {
859 // If we are not collecting CPU events from Callback API, we can return now.
860 if (!option_.required_callback_api_events) {
861 return OkStatus();
862 }
863
864 // Grab timestamp for API exit. API entry timestamp saved in cbdata.
865 uint64 end_tsc = CuptiTracer::GetTimestamp();
866 uint64 start_tsc = *cbdata->correlationData;
867 TrackContext(cbid, cbdata->context);
868 return AddDriverApiCallbackEvent(collector_, cupti_interface_, device_id,
869 start_tsc, end_tsc, domain, cbid, cbdata);
870 }
SyncAndFlush()871 Status SyncAndFlush() override {
872 if (option_.sync_devices_before_stop) {
873 CuptiApiTracingDisabler disabler;
874 absl::MutexLock lock(&mutex_);
875 for (auto &ctx : contexts_) {
876 cuCtxPushCurrent(ctx);
877 cuCtxSynchronize(); // Ignore error here for best effort.
878 CUcontext current;
879 cuCtxPopCurrent(¤t);
880 }
881 }
882 return OkStatus();
883 }
884
885 private:
TrackContext(CUpti_CallbackId cbid,CUcontext ctx)886 void TrackContext(CUpti_CallbackId cbid, CUcontext ctx) {
887 if (!option_.sync_devices_before_stop) return;
888 if (ctx == nullptr) return;
889 absl::MutexLock lock(&mutex_);
890 if (cbid == CUPTI_DRIVER_TRACE_CBID_cuCtxDestroy_v2 ||
891 cbid == CUPTI_DRIVER_TRACE_CBID_cuCtxDestroy) {
892 contexts_.erase(ctx);
893 } else {
894 contexts_.emplace(ctx);
895 }
896 }
897
898 const CuptiTracerOptions option_;
899 CuptiInterface *cupti_interface_;
900 CuptiTraceCollector *collector_;
901 absl::Mutex mutex_;
902 absl::flat_hash_set<CUcontext> contexts_ TF_GUARDED_BY(mutex_);
903
904 TF_DISALLOW_COPY_AND_ASSIGN(CuptiDriverApiHookWithActivityApi);
905 };
906
907 struct KernelRecord {
908 const char *kernel_name;
909 // TODO(csigg): cuStreamGetCtx introduced in CUDA 9.2 would allow us to only
910 // record the stream and infer the context during collection.
911 CUcontext context;
912 CUstream stream;
913 uint32 correlation_id;
914 CUevent start_event;
915 CUevent stop_event;
916 KernelDetails details;
917 uint64 start_timestamp;
918 };
919
920 struct MemcpyRecord {
921 CuptiTracerEventType type;
922 size_t size_bytes;
923 CUcontext context;
924 CUstream stream;
925 uint32 correlation_id;
926 bool async;
927 CUevent start_event;
928 CUevent stop_event;
929 uint64 start_timestamp;
930 };
931
CreateAndRecordEvent(CUevent * event,CUstream stream)932 Status CreateAndRecordEvent(CUevent *event, CUstream stream) {
933 CuptiApiTracingDisabler disabler;
934 TF_RETURN_IF_ERROR(ToStatus(cuEventCreate(event, CU_EVENT_DEFAULT)));
935 return ToStatus(cuEventRecord(*event, stream));
936 }
937
938 #if CUDA_VERSION >= 10000
939 // Maintain and restore current thread's CUDA context.
940 // Note: cuStreamGetCtx only available after CUDA 9.2.
941 class ScopedCudaContext {
942 public:
ScopedCudaContext(CUstream stream)943 explicit ScopedCudaContext(CUstream stream) : stream_(stream) {
944 CuptiApiTracingDisabler disabler; // don't trace cuda call in this func.
945 CUcontext context;
946 if (cuStreamGetCtx(stream, &context) != CUDA_SUCCESS) return;
947 context_ = context;
948 uint32 device_ordinal;
949 if (cuptiGetDeviceId(context, &device_ordinal) != CUPTI_SUCCESS) return;
950 device_ordinal_ = device_ordinal;
951 context_pushed_ = cuCtxPushCurrent(context) == CUDA_SUCCESS;
952 }
~ScopedCudaContext()953 ~ScopedCudaContext() {
954 if (!context_pushed_) return;
955 CuptiApiTracingDisabler disabler; // don't trace cuda call in this func.
956 cuCtxPopCurrent(&*context_);
957 }
958
959 // If successful, return the device ordinal of the relevant cuda stream.
960 // Otherwise absl::nullopt;
GetDeviceOrdinal()961 absl::optional<uint32> GetDeviceOrdinal() { return device_ordinal_; }
962
963 // If successful, return the cuda context of the relevant cuda stream.
964 // Otherwise absl::nullopt;
GetContext()965 absl::optional<CUcontext> GetContext() { return context_; }
966
967 private:
968 CUstream stream_;
969 absl::optional<CUcontext> context_;
970 absl::optional<uint32> device_ordinal_;
971 bool context_pushed_ = false;
972 };
973 #endif
974
975 // Stores a series of kernel and memcpy records.
976 class CudaEventRecorder {
977 public:
CudaEventRecorder(CuptiInterface * cupti_interface,CuptiTraceCollector * collector,int ordinal)978 CudaEventRecorder(CuptiInterface *cupti_interface,
979 CuptiTraceCollector *collector, int ordinal)
980 : cupti_interface_(cupti_interface),
981 collector_(collector),
982 ordinal_(ordinal) {
983 device_name_ = absl::StrCat("gpu ", ordinal); // default.
984 CUdevice device;
985 if (cuDeviceGet(&device, ordinal) == CUDA_SUCCESS) {
986 char name[100];
987 if (cuDeviceGetName(name, sizeof(name), device) == CUDA_SUCCESS) {
988 device_name_ = name;
989 }
990 }
991 }
992
993 // Registers the start of a kernel launch. The returned index should be passed
994 // to StopKernel() after the kernel launch has completed.
995 template <typename T>
StartKernel(const char * kernel_name,CUcontext context,uint32 correlation_id,const T * params)996 size_t StartKernel(const char *kernel_name, CUcontext context,
997 uint32 correlation_id, const T *params) {
998 CUstream stream = params->hStream;
999 KernelRecord record = {kernel_name, context, stream, correlation_id};
1000 record.details.registers_per_thread = 0; // unknown.
1001 record.details.static_shared_memory_usage = params->sharedMemBytes;
1002 record.details.dynamic_shared_memory_usage = 0; // unknown
1003 record.details.block_x = params->blockDimX;
1004 record.details.block_y = params->blockDimY;
1005 record.details.block_z = params->blockDimZ;
1006 record.details.grid_x = params->gridDimX;
1007 record.details.grid_y = params->gridDimY;
1008 record.details.grid_z = params->gridDimZ;
1009 record.start_timestamp = CuptiTracer::GetTimestamp();
1010 LogIfError(CreateAndRecordEvent(&record.start_event, stream));
1011 absl::MutexLock lock(&mutex_);
1012 if (stopped_) return -1;
1013 kernel_records_.push_back(record);
1014 return kernel_records_.size() - 1;
1015 }
StopKernel(size_t index)1016 uint64 StopKernel(size_t index) {
1017 absl::MutexLock lock(&mutex_);
1018 if (index >= kernel_records_.size()) return 0;
1019 auto &record = kernel_records_[index];
1020 LogIfError(CreateAndRecordEvent(&record.stop_event, record.stream));
1021 return record.start_timestamp;
1022 }
1023
1024 // Registers the start of a copy operation. The returned index should be
1025 // passed to StopMemcpy() after the memcpy has completed.
StartMemcpy(CuptiTracerEventType type,size_t size_bytes,CUcontext context,CUstream stream,uint32 correlation_id,bool async)1026 size_t StartMemcpy(CuptiTracerEventType type, size_t size_bytes,
1027 CUcontext context, CUstream stream, uint32 correlation_id,
1028 bool async) {
1029 MemcpyRecord record = {type, size_bytes, context,
1030 stream, correlation_id, async};
1031 record.start_timestamp = CuptiTracer::GetTimestamp();
1032 LogIfError(CreateAndRecordEvent(&record.start_event, stream));
1033 absl::MutexLock lock(&mutex_);
1034 if (stopped_) return -1;
1035 memcpy_records_.push_back(record);
1036 return memcpy_records_.size() - 1;
1037 }
StopMemcpy(size_t index)1038 uint64 StopMemcpy(size_t index) {
1039 absl::MutexLock lock(&mutex_);
1040 if (index >= memcpy_records_.size()) return 0;
1041 auto &record = memcpy_records_[index];
1042 LogIfError(CreateAndRecordEvent(&record.stop_event, record.stream));
1043 return record.start_timestamp;
1044 }
1045
Stop()1046 Status Stop() {
1047 {
1048 absl::MutexLock lock(&mutex_);
1049 stopped_ = true;
1050 LOG(INFO) << "Collecting " << kernel_records_.size()
1051 << " kernel records, " << memcpy_records_.size()
1052 << " memcpy records.";
1053
1054 // Gather all profiled streams and contexts.
1055 for (const auto &record : kernel_records_) {
1056 TF_RETURN_IF_ERROR(
1057 AddStreamInfo(record.context, record.stream, "Kernel"));
1058 }
1059 for (const auto &record : memcpy_records_) {
1060 TF_RETURN_IF_ERROR(AddStreamInfo(record.context, record.stream,
1061 GetTraceEventTypeName(record.type)));
1062 }
1063 }
1064
1065 // Synchronize all contexts, record end events, synchronize again.
1066 // This scheme is an unreliable measure to associate a event with the wall
1067 // time. There are chances that other threads might enque kernels which
1068 // delay the second synchronization.
1069 TF_RETURN_IF_ERROR(Synchronize());
1070 for (auto &pair : context_infos_) {
1071 TF_RETURN_IF_ERROR(ToStatus(cuCtxSetCurrent(pair.first)));
1072 TF_RETURN_IF_ERROR(CreateAndRecordEvent(&pair.second.end_event, nullptr));
1073 }
1074
1075 TF_RETURN_IF_ERROR(Synchronize());
1076 end_walltime_us_ = Env::Default()->NowMicros();
1077 return OkStatus();
1078 }
1079
Flush(AnnotationMap * annotation_map)1080 Status Flush(AnnotationMap *annotation_map) {
1081 auto kernel_records = ConsumeKernelRecords();
1082 auto memcpy_records = ConsumeMemcpyRecords();
1083 for (const auto &record : kernel_records) {
1084 TF_RETURN_IF_ERROR(SaveRecord(record, annotation_map));
1085 }
1086 for (const auto &record : memcpy_records) {
1087 TF_RETURN_IF_ERROR(SaveRecord(record, annotation_map));
1088 }
1089 return OkStatus();
1090 }
1091
ConsumeKernelRecords()1092 std::vector<KernelRecord> ConsumeKernelRecords() {
1093 absl::MutexLock lock(&mutex_);
1094 return std::move(kernel_records_);
1095 }
ConsumeMemcpyRecords()1096 std::vector<MemcpyRecord> ConsumeMemcpyRecords() {
1097 absl::MutexLock lock(&mutex_);
1098 return std::move(memcpy_records_);
1099 }
1100
1101 private:
1102 struct ContextInfo {
1103 uint32 context_id = 0;
1104 int num_streams = 0;
1105 CUevent end_event;
1106 };
1107
1108 struct StreamInfo {
1109 uint32 stream_id = 0;
1110 std::string name;
1111 int index; // 0 is reserved for null stream.
1112 const ContextInfo *ctx_info;
1113 };
1114
1115 // Synchronizes all contexts.
Synchronize() const1116 Status Synchronize() const {
1117 CuptiApiTracingDisabler disabler;
1118 for (const auto &pair : context_infos_) {
1119 TF_RETURN_IF_ERROR(ToStatus(cuCtxSetCurrent(pair.first)));
1120 TF_RETURN_IF_ERROR(ToStatus(cuCtxSynchronize()));
1121 }
1122 return OkStatus();
1123 }
1124
1125 // Returns element from context_infos_, adding it if not yet present.
GetContextInfo(CUcontext context,ContextInfo ** ctx_info_ptr)1126 Status GetContextInfo(CUcontext context, ContextInfo **ctx_info_ptr) {
1127 auto it = context_infos_.find(context);
1128
1129 if (it == context_infos_.end()) {
1130 uint32 context_id = 0;
1131 RETURN_IF_CUPTI_ERROR(
1132 cupti_interface_->GetContextId(context, &context_id));
1133 ContextInfo ctx_info = {context_id};
1134 it = context_infos_.emplace(context, ctx_info).first;
1135 }
1136
1137 *ctx_info_ptr = &it->second;
1138 return OkStatus();
1139 }
1140
1141 // Adds element to stream_infos_ if not yet present. If present, clear name
1142 // if it doesn't match parameter.
AddStreamInfo(CUcontext context,CUstream stream,absl::string_view name)1143 Status AddStreamInfo(CUcontext context, CUstream stream,
1144 absl::string_view name) {
1145 StreamKey key(context, stream);
1146 auto it = stream_infos_.find(key);
1147 if (it != stream_infos_.end()) {
1148 if (it->second.name != name) {
1149 it->second.name.clear(); // Stream with inconsistent names, clear it.
1150 }
1151 return OkStatus();
1152 }
1153
1154 ContextInfo *ctx_info;
1155 TF_RETURN_IF_ERROR(GetContextInfo(context, &ctx_info));
1156 int index = stream ? ++ctx_info->num_streams : 0;
1157 uint32 stream_id = 0;
1158 #if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
1159 RETURN_IF_CUPTI_ERROR(
1160 cupti_interface_->GetStreamIdEx(context, stream, 1, &stream_id));
1161 #else
1162 RETURN_IF_CUPTI_ERROR(
1163 cupti_interface_->GetStreamIdEx(context, stream, 0, &stream_id));
1164 #endif
1165
1166 StreamInfo stream_info = {stream_id, static_cast<std::string>(name), index,
1167 ctx_info};
1168 stream_infos_.emplace(key, stream_info);
1169 return OkStatus();
1170 }
1171
1172 // Returns time in microseconds between events recorded on the GPU.
GetElapsedTimeUs(CUevent start,CUevent stop)1173 static uint64_t GetElapsedTimeUs(CUevent start, CUevent stop) {
1174 CuptiApiTracingDisabler disabler;
1175 float elapsed_ms = 0.0f;
1176 LogIfError(ToStatus(cuEventElapsedTime(&elapsed_ms, start, stop)));
1177 return static_cast<uint64>(
1178 std::llroundf(1000 * std::max(elapsed_ms, 0.0f)));
1179 }
1180
SaveRecord(const KernelRecord & record,AnnotationMap * annotation_map) const1181 Status SaveRecord(const KernelRecord &record,
1182 AnnotationMap *annotation_map) const {
1183 if (!record.start_event || !record.stop_event) {
1184 return OkStatus();
1185 }
1186 const auto &stream_info =
1187 stream_infos_.at(StreamKey(record.context, record.stream));
1188 auto start_us =
1189 GetElapsedTimeUs(record.start_event, stream_info.ctx_info->end_event);
1190 auto elapsed_us = GetElapsedTimeUs(record.start_event, record.stop_event);
1191
1192 std::string annotation;
1193
1194 CuptiTracerEvent event{};
1195 event.type = CuptiTracerEventType::Kernel;
1196 event.source = CuptiTracerEventSource::Activity; // on gpu device.
1197 event.name = record.kernel_name;
1198 event.start_time_ns = (end_walltime_us_ - start_us) * 1000;
1199 event.end_time_ns = event.start_time_ns + elapsed_us * 1000;
1200 event.device_id = ordinal_;
1201 event.context_id = stream_info.ctx_info->context_id;
1202 event.stream_id = stream_info.stream_id;
1203 event.correlation_id = record.correlation_id;
1204 AnnotationMap::AnnotationInfo info = collector_->annotation_map()->LookUp(
1205 event.device_id, event.correlation_id);
1206 event.annotation = info.annotation;
1207 event.kernel_info = record.details;
1208 collector_->AddEvent(std::move(event));
1209 return OkStatus();
1210 }
1211
SaveRecord(const MemcpyRecord & record,AnnotationMap * annotation_map) const1212 Status SaveRecord(const MemcpyRecord &record,
1213 AnnotationMap *annotation_map) const {
1214 if (!record.start_event || !record.stop_event) {
1215 return OkStatus();
1216 }
1217 const auto &stream_info =
1218 stream_infos_.at(StreamKey(record.context, record.stream));
1219 auto start_us =
1220 GetElapsedTimeUs(record.start_event, stream_info.ctx_info->end_event);
1221 auto elapsed_us = GetElapsedTimeUs(record.start_event, record.stop_event);
1222
1223 CuptiTracerEvent event{};
1224 event.type = record.type;
1225 event.name = GetTraceEventTypeName(event.type);
1226 event.source = CuptiTracerEventSource::Activity;
1227 event.start_time_ns = (end_walltime_us_ - start_us) * 1000;
1228 event.end_time_ns = event.start_time_ns + elapsed_us * 1000;
1229 event.device_id = ordinal_;
1230 event.context_id = stream_info.ctx_info->context_id;
1231 event.stream_id = stream_info.stream_id;
1232 event.correlation_id = record.correlation_id;
1233 AnnotationMap::AnnotationInfo info = collector_->annotation_map()->LookUp(
1234 event.device_id, event.correlation_id);
1235 event.annotation = info.annotation;
1236 event.memcpy_info.num_bytes = record.size_bytes;
1237 // TODO: support MemcpyD2D where destination != source;
1238 event.memcpy_info.destination = ordinal_;
1239 event.memcpy_info.async = record.async;
1240 // TODO: set src_mem_kind and dst_mem_kind.
1241 collector_->AddEvent(std::move(event));
1242 return OkStatus();
1243 }
1244
1245 absl::Mutex mutex_;
1246 bool stopped_ TF_GUARDED_BY(mutex_) = false;
1247 std::vector<KernelRecord> kernel_records_ TF_GUARDED_BY(mutex_);
1248 std::vector<MemcpyRecord> memcpy_records_ TF_GUARDED_BY(mutex_);
1249
1250 CuptiInterface *cupti_interface_;
1251 CuptiTraceCollector *collector_;
1252 const int ordinal_;
1253 std::string device_name_;
1254 uint64 end_walltime_us_;
1255 // Include context in key to distinguish null streams.
1256 using StreamKey = std::pair<CUcontext, CUstream>;
1257
1258 absl::node_hash_map<CUcontext, ContextInfo> context_infos_;
1259 absl::flat_hash_map<StreamKey, StreamInfo> stream_infos_;
1260 };
1261
1262 // This hook uses cuda events to measure device side activities.
1263 class CuptiDriverApiHookWithCudaEvent : public CuptiDriverApiHook {
1264 public:
CuptiDriverApiHookWithCudaEvent(const CuptiTracerOptions & option,CuptiInterface * cupti_interface,CuptiTraceCollector * collector)1265 CuptiDriverApiHookWithCudaEvent(const CuptiTracerOptions &option,
1266 CuptiInterface *cupti_interface,
1267 CuptiTraceCollector *collector)
1268 : option_(option),
1269 cupti_interface_(cupti_interface),
1270 collector_(collector) {
1271 int num_gpus = CuptiTracer::NumGpus();
1272 cuda_event_recorders_.reserve(num_gpus);
1273 for (int i = 0; i < num_gpus; ++i) {
1274 cuda_event_recorders_.emplace_back(
1275 std::make_unique<CudaEventRecorder>(cupti_interface, collector, i));
1276 }
1277 }
~CuptiDriverApiHookWithCudaEvent()1278 ~CuptiDriverApiHookWithCudaEvent() {
1279 for (auto *callback_context : callback_contexts_) delete callback_context;
1280 }
1281
  Status OnDriverApiEnter(int device_id, CUpti_CallbackDomain domain,
                          CUpti_CallbackId cbid,
                          const CUpti_CallbackData *cbdata) override {
    auto *recorder = cuda_event_recorders_[device_id].get();
    switch (cbid) {
      case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel: {
        DCHECK_NE(cbdata->symbolName, nullptr);
        const auto *params =
            static_cast<const cuLaunchKernel_params *>(cbdata->functionParams);
        *cbdata->correlationData = recorder->StartKernel<cuLaunchKernel_params>(
            cbdata->symbolName, cbdata->context, cbdata->correlationId, params);
        break;
      }
      case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel: {
        DCHECK_NE(cbdata->symbolName, nullptr);
        const auto *params =
            static_cast<const cuLaunchCooperativeKernel_params *>(
                cbdata->functionParams);
        *cbdata->correlationData =
            recorder->StartKernel<cuLaunchCooperativeKernel_params>(
                cbdata->symbolName, cbdata->context, cbdata->correlationId,
                params);
        break;
      }
      case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice: {
#if CUDA_VERSION >= 10000
        const auto *params =
            static_cast<const cuLaunchCooperativeKernelMultiDevice_params *>(
                cbdata->functionParams);
        std::vector<uint32> record_indices;
        record_indices.reserve(params->numDevices);
        *cbdata->correlationData = -1;  // Invalid value.
        const auto &annotation = AnnotationStack::Get();
        for (int i = 0; i < params->numDevices; ++i) {
          CUstream stream = params->launchParamsList[i].hStream;
          ScopedCudaContext scoped_cuda_context(stream);
          auto dev_id = scoped_cuda_context.GetDeviceOrdinal();
          auto context = scoped_cuda_context.GetContext();
          if (!dev_id) return errors::Internal("Invalid CUDA stream");
          // Annotations are per device, so we need to populate the annotation
          // map for each device involved in the launch.
          collector_->annotation_map()->Add(*dev_id, cbdata->correlationId,
                                            annotation, "");
          record_indices.push_back(
              cuda_event_recorders_[*dev_id]->StartKernel<CUDA_LAUNCH_PARAMS>(
                  "CooperativeKernelMultiDevice", *context,
                  cbdata->correlationId, &(params->launchParamsList[i])));
        }
        auto *callback_context =
            new CuptiApiCallbackContext(std::move(record_indices));
        callback_contexts_.insert(callback_context);
        *cbdata->correlationData = reinterpret_cast<uint64>(callback_context);
#else
        VLOG(1) << "Unhandled cuLaunchCooperativeKernelMultiDevice.";
#endif
      } break;
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpy: {
        const auto *params =
            static_cast<const cuMemcpy_params *>(cbdata->functionParams);
        StartMemcpy<cuMemcpy_params>(GetMemcpyType(params->src, params->dst),
                                     cbdata, recorder);
        break;
      }
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync: {
        const auto *params =
            static_cast<const cuMemcpyAsync_params *>(cbdata->functionParams);
        StartMemcpyAsync<cuMemcpyAsync_params>(
            GetMemcpyType(params->src, params->dst), cbdata, recorder);
        break;
      }
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2:
        StartMemcpy<cuMemcpyHtoD_v2_params>(CuptiTracerEventType::MemcpyH2D,
                                            cbdata, recorder);
        break;
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2:
        StartMemcpyAsync<cuMemcpyHtoDAsync_v2_params>(
            CuptiTracerEventType::MemcpyH2D, cbdata, recorder);
        break;
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2:
        StartMemcpy<cuMemcpyDtoH_v2_params>(CuptiTracerEventType::MemcpyD2H,
                                            cbdata, recorder);
        break;
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2:
        StartMemcpyAsync<cuMemcpyDtoHAsync_v2_params>(
            CuptiTracerEventType::MemcpyD2H, cbdata, recorder);
        break;
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2:
        StartMemcpy<cuMemcpyDtoD_v2_params>(CuptiTracerEventType::MemcpyD2D,
                                            cbdata, recorder);
        break;
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2:
        StartMemcpyAsync<cuMemcpyDtoDAsync_v2_params>(
            CuptiTracerEventType::MemcpyD2D, cbdata, recorder);
        break;
      default:
        VLOG(1) << "Unexpected callback id: " << cbid;
        break;
    }
    return OkStatus();
  }

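  // Called at driver API exit: stops the device-side timing started at entry
  // (looked up via cbdata->correlationData) and, if callback API events are
  // requested, also records a host-side event spanning API entry to exit.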
  Status OnDriverApiExit(int device_id, CUpti_CallbackDomain domain,
                         CUpti_CallbackId cbid,
                         const CUpti_CallbackData *cbdata) override {
    auto *recorder = cuda_event_recorders_[device_id].get();
    if (*cbdata->correlationData == static_cast<size_t>(-1)) return OkStatus();
    uint64 start_tsc = 0;
    switch (cbid) {
      case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel:
      case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel:
        start_tsc = recorder->StopKernel(*cbdata->correlationData);
        break;
      case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice: {
#if CUDA_VERSION >= 10000
        auto *callback_context = reinterpret_cast<CuptiApiCallbackContext *>(
            *cbdata->correlationData);
        callback_contexts_.erase(callback_context);
        auto record_indices = std::move(callback_context->record_indices);
        delete callback_context;
        const auto *params =
            static_cast<const cuLaunchCooperativeKernelMultiDevice_params *>(
                cbdata->functionParams);
        if (record_indices.size() != params->numDevices)
          return errors::Internal("Invalid correlation data");
        for (int i = 0; i < params->numDevices; ++i) {
          CUstream stream = params->launchParamsList[i].hStream;
          ScopedCudaContext scoped_cuda_context(stream);
          auto dev_id = scoped_cuda_context.GetDeviceOrdinal();
          if (!dev_id) return errors::Internal("Invalid CUDA stream");
          start_tsc =
              cuda_event_recorders_[*dev_id]->StopKernel(record_indices[i]);
        }
#endif
      } break;
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpy:
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync:
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2:
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2:
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2:
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2:
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2:
      case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2:
        start_tsc = recorder->StopMemcpy(*cbdata->correlationData);
        break;
      default:
        VLOG(1) << "Unexpected callback id: " << cbid;
        // TODO: figure out how to get start timestamp in this case.
        return OkStatus();
    }
    // If we are not collecting CPU events from the Callback API, we can
    // return now.
    if (!option_.required_callback_api_events) {
      return OkStatus();
    }

    // Grab timestamp for API exit. API entry timestamp saved in cbdata.
    uint64 end_tsc = CuptiTracer::GetTimestamp();
    return AddDriverApiCallbackEvent(collector_, cupti_interface_, device_id,
                                     start_tsc, end_tsc, domain, cbid, cbdata);
  }
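  // Stops each per-device CudaEventRecorder and flushes its records to the
  // collector.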
  Status SyncAndFlush() override {
    for (auto &recorder : cuda_event_recorders_) {
      TF_RETURN_IF_ERROR(recorder->Stop());
    }
    for (auto &recorder : cuda_event_recorders_) {
      TF_RETURN_IF_ERROR(recorder->Flush(collector_->annotation_map()));
    }
    return OkStatus();
  }

 private:
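  // Starts device-side timing for a synchronous memcpy driver call; the
  // record index returned by the recorder is stashed in
  // cbdata->correlationData for the matching exit callback.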
  template <typename T>
  static void StartMemcpy(CuptiTracerEventType type,
                          const CUpti_CallbackData *cbdata,
                          CudaEventRecorder *recorder) {
    const auto *params = static_cast<const T *>(cbdata->functionParams);
    *cbdata->correlationData =
        recorder->StartMemcpy(type, params->ByteCount, cbdata->context, nullptr,
                              cbdata->correlationId, /*async*/ false);
  }

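  // Same as StartMemcpy, but for the asynchronous memcpy driver calls: the
  // stream is taken from the callback parameters and the record is marked
  // async.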
  template <typename T>
  static void StartMemcpyAsync(CuptiTracerEventType type,
                               const CUpti_CallbackData *cbdata,
                               CudaEventRecorder *recorder) {
    const auto *params = static_cast<const T *>(cbdata->functionParams);
    *cbdata->correlationData = recorder->StartMemcpy(
        type, params->ByteCount, cbdata->context, params->hStream,
        cbdata->correlationId, /*async*/ true);
  }

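  // Queries the driver for the memory type (host/device) of a pointer.
  // Pointers unknown to CUDA are treated as host memory.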
  static CUmemorytype GetMemoryType(CUdeviceptr ptr) {
    CuptiApiTracingDisabler disabler;
    CUmemorytype mem_type = CU_MEMORYTYPE_HOST;
    auto status =
        cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptr);
    if (status == CUDA_ERROR_INVALID_VALUE) {
      // Pointer not registered with CUDA, must be host memory.
      return CU_MEMORYTYPE_HOST;
    }
    LogIfError(ToStatus(status));
    return mem_type;
  }

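  // Classifies a memcpy as H2D, D2H, or D2D from the memory types of its
  // source and destination pointers; anything else falls back to MemcpyOther.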
  static CuptiTracerEventType GetMemcpyType(CUdeviceptr src, CUdeviceptr dst) {
    CUmemorytype src_type = GetMemoryType(src);
    CUmemorytype dst_type = GetMemoryType(dst);
    // TODO: handle CU_MEMORYTYPE_ARRAY case
    if (src_type == CU_MEMORYTYPE_HOST && dst_type == CU_MEMORYTYPE_DEVICE) {
      return CuptiTracerEventType::MemcpyH2D;
    } else if (src_type == CU_MEMORYTYPE_DEVICE &&
               dst_type == CU_MEMORYTYPE_HOST) {
      return CuptiTracerEventType::MemcpyD2H;
    } else if (src_type == CU_MEMORYTYPE_DEVICE &&
               dst_type == CU_MEMORYTYPE_DEVICE) {
      return CuptiTracerEventType::MemcpyD2D;
    }
    return CuptiTracerEventType::MemcpyOther;
  }

  // Each cuLaunchCooperativeKernelMultiDevice call adds an entry to the
  // record array of every participating device, so we keep all of those
  // record indices together here. This structure is allocated at API entry
  // and freed at API exit. However, since there is no guarantee that the
  // entry/exit callbacks arrive in pairs, we also track the set of in-flight
  // contexts to avoid memory leaks.
  struct CuptiApiCallbackContext {
    explicit CuptiApiCallbackContext(std::vector<uint32> &&r)
        : record_indices(std::move(r)) {}
    std::vector<uint32> record_indices;
  };

  const CuptiTracerOptions option_;
  CuptiInterface *cupti_interface_;
  CuptiTraceCollector *collector_;
  absl::node_hash_set<CuptiApiCallbackContext *> callback_contexts_;
  std::vector<std::unique_ptr<CudaEventRecorder>> cuda_event_recorders_;
  TF_DISALLOW_COPY_AND_ASSIGN(CuptiDriverApiHookWithCudaEvent);
};

/*static*/ std::string ErrorWithHostname(absl::string_view error_message) {
  return absl::StrCat(port::Hostname(), ": ", error_message);
}

}  // namespace

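// Converts a completed driver API callback into the appropriate collector
// event (kernel, memcpy, memset, alloc/free, or generic) based on the
// callback id.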
/*static*/ Status CuptiDriverApiHook::AddDriverApiCallbackEvent(
    CuptiTraceCollector *collector, CuptiInterface *cupti_interface,
    int device_id, uint64 start_tsc, uint64 end_tsc,
    CUpti_CallbackDomain domain, CUpti_CallbackId cbid,
    const CUpti_CallbackData *cbdata) {
  switch (cbid) {
    case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel:
    case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel:
    case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice:
      AddKernelEventUponApiExit(collector, device_id, cbdata, start_tsc,
                                end_tsc);
      break;
    case CUPTI_DRIVER_TRACE_CBID_cuMemcpy:
    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync:
    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2:
    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2:
    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2:
    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2:
    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2:
    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2:
    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2:
    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2:
    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2:
    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2:
    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2:
    case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2:
    case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2:
    case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2:
    case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2:
    case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2:
    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2:
    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2:
      // This would be the place to populate the memcpy API activity's src and
      // dst memory kinds by casting cbdata->functionParams. We do not do so
      // because querying the memory aperture of each argument would incur
      // significant overhead.
      AddNormalMemcpyEventUponApiExit(collector, device_id, cbid, cbdata,
                                      start_tsc, end_tsc);
      break;
    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer:
    case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync:
      AddP2PMemcpyEventUponApiExit(collector, cupti_interface, device_id, cbid,
                                   cbdata, start_tsc, end_tsc);
      break;
    case CUPTI_DRIVER_TRACE_CBID_cuMemAlloc_v2:
      AddCuMemAllocEventUponApiExit(collector, device_id, cbid, cbdata,
                                    start_tsc, end_tsc);
      break;
    case CUPTI_DRIVER_TRACE_CBID_cuMemAllocPitch_v2:
      AddCuMemAllocPitchEventUponApiExit(collector, device_id, cbid, cbdata,
                                         start_tsc, end_tsc);
      break;
    case CUPTI_DRIVER_TRACE_CBID_cuMemFree_v2:
      AddCuMemFreeEventUponApiExit(collector, device_id, cbid, cbdata,
                                   start_tsc, end_tsc);
      break;
    case CUPTI_DRIVER_TRACE_CBID_cuMemsetD8_v2:
    case CUPTI_DRIVER_TRACE_CBID_cuMemsetD16_v2:
    case CUPTI_DRIVER_TRACE_CBID_cuMemsetD32_v2:
    case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8_v2:
    case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16_v2:
    case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32_v2:
    case CUPTI_DRIVER_TRACE_CBID_cuMemsetD8Async:
    case CUPTI_DRIVER_TRACE_CBID_cuMemsetD16Async:
    case CUPTI_DRIVER_TRACE_CBID_cuMemsetD32Async:
    case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8Async:
    case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16Async:
    case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32Async:
      AddCuMemsetEventUponApiExit(collector, device_id, cbid, cbdata, start_tsc,
                                  end_tsc);
      break;
    default:
      AddGenericEventUponApiExit(collector, device_id, cbid, cbdata, start_tsc,
                                 end_tsc);
      break;
  }
  return OkStatus();
}

const char *GetTraceEventTypeName(const CuptiTracerEventType &type) {
  // Do not use a default clause so that this gives a build error when
  // CuptiTracerEventType is extended but this switch is not.
  switch (type) {
    case CuptiTracerEventType::MemcpyH2D:
      return "MemcpyH2D";
    case CuptiTracerEventType::MemcpyD2H:
      return "MemcpyD2H";
    case CuptiTracerEventType::MemcpyD2D:
      return "MemcpyD2D";
    case CuptiTracerEventType::MemcpyP2P:
      return "MemcpyP2P";
    case CuptiTracerEventType::MemcpyOther:
      return "MemcpyOther";
    case CuptiTracerEventType::Kernel:
      return "Compute";
    case CuptiTracerEventType::MemoryAlloc:
      return "MemoryAlloc";
    case CuptiTracerEventType::MemoryFree:
      return "MemoryFree";
    case CuptiTracerEventType::Memset:
      return "Memset";
    case CuptiTracerEventType::Overhead:
      return "Overhead";
    case CuptiTracerEventType::UnifiedMemory:
      return "UnifiedMemory";
    case CuptiTracerEventType::Generic:
      return "Generic";
    case CuptiTracerEventType::MemoryResidency:
      return "MemoryResidency";
    case CuptiTracerEventType::Unsupported:
      return "";
  }
}

CuptiTracer::CuptiTracer(CuptiInterface *cupti_interface)
    : num_gpus_(NumGpus()),
      cupti_interface_(cupti_interface),
      buffer_pool_(kBufferSizeInBytes) {}

/* static */ CuptiTracer *CuptiTracer::GetCuptiTracerSingleton() {
  static auto *singleton = new CuptiTracer(GetCuptiInterface());
  return singleton;
}

bool CuptiTracer::IsAvailable() const {
  return NumGpus() && !activity_tracing_enabled_ && !api_tracing_enabled_;
}

int CuptiTracer::NumGpus() {
  static int num_gpus = []() -> int {
    if (cuInit(0) != CUDA_SUCCESS) {
      return 0;
    }
    int gpu_count;
    if (cuDeviceGetCount(&gpu_count) != CUDA_SUCCESS) {
      return 0;
    }
    LOG(INFO) << "Profiler found " << gpu_count << " GPUs";
    return gpu_count;
  }();
  return num_gpus;
}

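// Installs the driver API hook (CUDA-event based or Activity API based,
// depending on the options), enables API tracing, optionally enables
// activity tracing, and turns on annotation collection.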
void CuptiTracer::Enable(const CuptiTracerOptions &option,
                         CuptiTraceCollector *collector) {
  option_ = option;
  collector_ = collector;
  if (option_->enable_event_based_activity) {
    option_->enable_activity_api = false;
    cupti_driver_api_hook_.reset(new CuptiDriverApiHookWithCudaEvent(
        option, cupti_interface_, collector));
  } else {
    cupti_driver_api_hook_.reset(new CuptiDriverApiHookWithActivityApi(
        option, cupti_interface_, collector));
  }

  Status status = EnableApiTracing();
  need_root_access_ |= status.code() == error::PERMISSION_DENIED;
  if (!status.ok()) return;

  if (option_->enable_activity_api) {
    EnableActivityTracing().IgnoreError();
  }
  tensorflow::profiler::AnnotationStack::Enable(true);
}

void CuptiTracer::Disable() {
  DisableApiTracing().IgnoreError();
  if (option_->enable_activity_api) {
    DisableActivityTracing().IgnoreError();
  }
  cupti_interface_->CleanUp();
  Finalize().IgnoreError();
  cupti_driver_api_hook_->SyncAndFlush().IgnoreError();
  collector_->Flush();
  collector_ = nullptr;
  option_.reset();
  cupti_driver_api_hook_.reset();
  tensorflow::profiler::AnnotationStack::Enable(false);
}

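// Subscribes to CUPTI driver API callbacks. Either the explicitly selected
// callback ids or the whole driver API domain is enabled, plus the NVTX
// domain when NVTX tracking is requested.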
Status CuptiTracer::EnableApiTracing() {
  if (api_tracing_enabled_) return OkStatus();

  VLOG(1) << "Enable subscriber";
  // Subscribe can return CUPTI_ERROR_MAX_LIMIT_REACHED: an application that
  // calls CUPTI APIs cannot be profiled at the same time with NVIDIA tools
  // such as nvprof, Nvidia Visual Profiler, Nsight Compute, or Nsight Systems.
  RETURN_IF_CUPTI_ERROR(cupti_interface_->Subscribe(
      &subscriber_, (CUpti_CallbackFunc)ApiCallback, this));
  api_tracing_enabled_ = true;

  if (!option_->cbids_selected.empty()) {
    for (auto cbid : option_->cbids_selected) {
      RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableCallback(
          1 /* ENABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API, cbid));
    }
  } else {  // select all callback ids.
    RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
        1 /* ENABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API));
  }

  if (option_->enable_nvtx_tracking) {
    RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
        1 /* ENABLE */, subscriber_, CUPTI_CB_DOMAIN_NVTX));
  }
  return OkStatus();
}

Status CuptiTracer::DisableApiTracing() {
  if (!api_tracing_enabled_) return OkStatus();

  api_tracing_enabled_ = false;

  if (!option_->cbids_selected.empty()) {
    for (auto cbid : option_->cbids_selected) {
      RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableCallback(
          0 /* DISABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API, cbid));
    }
  } else {
    RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
        0 /* DISABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API));
  }

  if (option_->enable_nvtx_tracking) {
    RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
        0 /* DISABLE */, subscriber_, CUPTI_CB_DOMAIN_NVTX));
  }

  VLOG(1) << "Disable subscriber";
  RETURN_IF_CUPTI_ERROR(cupti_interface_->Unsubscribe(subscriber_));
  return OkStatus();
}

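// Registers the activity-buffer callbacks and enables the selected CUPTI
// activity kinds; unified-memory counters need extra configuration before
// they can be enabled.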
Status CuptiTracer::EnableActivityTracing() {
  if (!option_->activities_selected.empty()) {
    // Initialize callback functions for Cupti Activity API.
    VLOG(1) << "Registering CUPTI activity callbacks";
    RETURN_IF_CUPTI_ERROR(cupti_interface_->ActivityRegisterCallbacks(
        RequestCuptiActivityBuffer, ProcessCuptiActivityBuffer));

    VLOG(1) << "Enabling activity tracing for "
            << option_->activities_selected.size() << " activities";
    for (auto activity : option_->activities_selected) {
      VLOG(1) << "Enabling activity tracing for: " << activity;
      if (activity == CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER) {
        ConfigureActivityUnifiedMemoryCounter(true);
      }
      RETURN_IF_CUPTI_ERROR(cupti_interface_->ActivityEnable(activity));
    }
  }
  activity_tracing_enabled_ = true;
  return OkStatus();
}

Status CuptiTracer::DisableActivityTracing() {
  if (activity_tracing_enabled_) {
    VLOG(1) << "Disabling activity tracing for "
            << option_->activities_selected.size() << " activities";
    for (auto activity : option_->activities_selected) {
      VLOG(1) << "Disabling activity tracing for: " << activity;
      if (activity == CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER) {
        ConfigureActivityUnifiedMemoryCounter(false);
      }
      RETURN_IF_CUPTI_ERROR(cupti_interface_->ActivityDisable(activity));
    }
    option_->activities_selected.clear();

    VLOG(1) << "Flushing CUPTI activity buffer";
    RETURN_IF_CUPTI_ERROR(
        cupti_interface_->ActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED));
    LOG(INFO) << "CUPTI activity buffer flushed";
  }
  activity_tracing_enabled_ = false;
  return OkStatus();
}

Status CuptiTracer::Finalize() {
  if (option_->cupti_finalize) {
    VLOG(1) << "CuptiFinalize";
    RETURN_IF_CUPTI_ERROR(cupti_interface_->Finalize());
  }
  return OkStatus();
}

/*static*/ uint64 CuptiTracer::GetTimestamp() {
  uint64_t tsc;
  CuptiInterface *cupti_interface = GetCuptiInterface();
  if (cupti_interface && cupti_interface->GetTimestamp(&tsc) == CUPTI_SUCCESS) {
    return tsc;
  }
  // Return 0 on error. If an activity timestamp is 0, the activity will be
  // dropped during time normalization.
  return 0;
}

Status CuptiTracer::HandleNVTXCallback(CUpti_CallbackId cbid,
                                       const CUpti_CallbackData *cbdata) {
  const CUpti_NvtxData *pdata =
      reinterpret_cast<const CUpti_NvtxData *>(cbdata);
  if (cbid == CUPTI_CBID_NVTX_nvtxDomainRangePushEx) {
    const nvtxDomainRangePushEx_params *params =
        reinterpret_cast<const nvtxDomainRangePushEx_params *>(
            pdata->functionParams);
    // TODO(profiler): The messageType is actually NVTX_MESSAGE_TYPE_REGISTERED
    // (which is 3). However, we cannot recover the registered string from
    // nvtxDomainRegisterStringA_params; reinterpreting the payload as ASCII
    // happens to work.
    NVTXRangeTracker::EnterRange(params->core.eventAttrib->message.ascii);
  } else if (cbid == CUPTI_CBID_NVTX_nvtxDomainRangePop) {
    NVTXRangeTracker::ExitRange();
  }
  return OkStatus();
}

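// Entry point for CUPTI driver API and NVTX callbacks. Filters out internal
// CUDA calls made by the profiler itself, resolves the device id from the
// callback context, records annotations on API exit, and dispatches to the
// installed driver API hook.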
Status CuptiTracer::HandleCallback(CUpti_CallbackDomain domain,
                                   CUpti_CallbackId cbid,
                                   const CUpti_CallbackData *cbdata) {
  if (!api_tracing_enabled_) return OkStatus();    // already unsubscribed.
  if (!cupti_driver_api_hook_) return OkStatus();  // already unsubscribed.
  if (domain == CUPTI_CB_DOMAIN_NVTX) return HandleNVTXCallback(cbid, cbdata);
  if (domain != CUPTI_CB_DOMAIN_DRIVER_API) return OkStatus();
  if (internalCuCall) return OkStatus();

  if (cbdata->context == nullptr) {
    // The API callback arrived before any CUDA context was created.
    // This is expected to be rare, and we ignore this case.
    VLOG(3) << "API callback received before creation of CUDA context\n";
    return errors::Internal("cupti callback without context");
  }

  // Grab a correct device ID.
  uint32 device_id = -1;
  RETURN_IF_CUPTI_ERROR(
      cupti_interface_->GetDeviceId(cbdata->context, &device_id));
  if (device_id >= num_gpus_) {
    return errors::Internal("Invalid device id:", device_id);
  }

  if (cbdata->callbackSite == CUPTI_API_ENTER) {
    TF_RETURN_IF_ERROR(cupti_driver_api_hook_->OnDriverApiEnter(
        device_id, domain, cbid, cbdata));
  } else if (cbdata->callbackSite == CUPTI_API_EXIT) {
    // Set up the map from correlation id to annotation string.
    const auto &annotation = AnnotationStack::Get();
    if (!annotation.empty()) {
      if (cbid ==
          CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice) {
        // This API call launches kernels on multiple devices, so we need to
        // populate the annotation map for each device.
        for (int i = 0; i < num_gpus_; ++i) {
          collector_->annotation_map()->Add(i, cbdata->correlationId,
                                            annotation, "");
        }
      } else {
        absl::string_view nvtx_range = NVTXRangeTracker::CurrentRange();
        collector_->annotation_map()->Add(device_id, cbdata->correlationId,
                                          annotation, nvtx_range);
      }
    }

    TF_RETURN_IF_ERROR(cupti_driver_api_hook_->OnDriverApiExit(
        device_id, domain, cbid, cbdata));
  }
  return OkStatus();
}

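// Enables or disables the unified-memory counters for host-to-device and
// device-to-host byte transfers, logging a descriptive error if the platform
// or device does not support unified-memory profiling.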
void CuptiTracer::ConfigureActivityUnifiedMemoryCounter(bool enable) {
  CUpti_ActivityUnifiedMemoryCounterConfig config[2];
  // Experiments show that, for now, only measurements from these two
  // activities are trustworthy. Others, such as GPU page faults, may be
  // problematic.
  config[0].kind =
      CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD;
  config[1].kind =
      CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH;

  for (size_t i = 0; i < 2; i++) {
    config[i].enable = enable;
  }

  CUptiResult res;

  res = cupti_interface_->ActivityConfigureUnifiedMemoryCounter(config, 2);
  if (res == CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED) {
    LOG(ERROR) << "Unified memory is not supported on the "
                  "underlying platform.\n";
  } else if (res == CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_DEVICE) {
    LOG(ERROR) << "Unified memory is not supported on the device.\n";
  } else if (res == CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_NON_P2P_DEVICES) {
    LOG(ERROR) << "Unified memory is not supported on the "
                  "non-P2P multi-gpu setup.\n";
  } else if (res != CUPTI_SUCCESS) {
    const char *errstr = "";
    cuptiGetResultString(res, &errstr);
    LOG(ERROR) << "Error while enabling unified memory profiling: " << errstr;
  } else {
    VLOG(1) << "Configuring Unified memory profiling: " << res;
  }
}

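// Hands CUPTI a buffer from the pool to fill with activity records; if no
// buffer can be allocated, a zero size tells CUPTI to drop the records.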
void CuptiTracer::RequestActivityBuffer(uint8_t **buffer, size_t *size) {
  *buffer = buffer_pool_.GetOrCreateBuffer();
  if (*buffer == nullptr) {
    LOG(WARNING)
        << "CUPTI Buffer not allocated, activity records will be dropped";
    *size = 0;
    return;
  }
  *size = buffer_pool_.GetBufferSizeInBytes();
}

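// Walks a completed CUPTI activity buffer, converts each supported record
// kind into a collector event, reports dropped records, and returns the
// buffer to the pool.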
Status CuptiTracer::ProcessActivityBuffer(CUcontext context, uint32_t stream_id,
                                          uint8_t *buffer, size_t size) {
  auto buffer_cleanup =
      gtl::MakeCleanup([&]() { buffer_pool_.ReclaimBuffer(buffer); });
  if (size == 0) {
    return OkStatus();
  }
  if (!activity_tracing_enabled_) {
    LOG(WARNING) << "CUPTI activity buffer is reclaimed after flush.";
    return OkStatus();
  }
  if (cupti_interface_->Disabled()) return errors::Internal("Disabled.");

  CUpti_Activity *record = nullptr;
  while (true) {
    CUptiResult status =
        cupti_interface_->ActivityGetNextRecord(buffer, size, &record);
    if (status == CUPTI_SUCCESS) {
      switch (record->kind) {
        case CUPTI_ACTIVITY_KIND_KERNEL:  // sequential
        case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL:
          AddKernelActivityEvent(
              collector_, reinterpret_cast<CuptiActivityKernelTy *>(record));
          break;
        case CUPTI_ACTIVITY_KIND_MEMCPY:
          AddMemcpyActivityEvent(
              collector_, reinterpret_cast<CuptiActivityMemcpyTy *>(record));
          break;
        case CUPTI_ACTIVITY_KIND_MEMCPY2:
          AddMemcpyP2PActivityEvent(
              collector_, reinterpret_cast<CuptiActivityMemcpyP2PTy *>(record));
          break;
        case CUPTI_ACTIVITY_KIND_OVERHEAD:
          AddCuptiOverheadActivityEvent(
              collector_, reinterpret_cast<CUpti_ActivityOverhead *>(record));
          break;
        case CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER:
          AddUnifiedMemoryActivityEvent(
              collector_,
              reinterpret_cast<CUpti_ActivityUnifiedMemoryCounter2 *>(record));
          break;
        case CUPTI_ACTIVITY_KIND_MEMORY: {
          AddMemoryActivityEvent(
              collector_, reinterpret_cast<CUpti_ActivityMemory *>(record));
        } break;
        case CUPTI_ACTIVITY_KIND_MEMSET:
          AddMemsetActivityEvent(
              collector_, reinterpret_cast<CuptiActivityMemsetTy *>(record));
          break;
        case CUPTI_ACTIVITY_KIND_SYNCHRONIZATION:
          AddSynchronizationActivityEvent(
              collector_,
              reinterpret_cast<CUpti_ActivitySynchronization *>(record));
          break;
        default:
          VLOG(3) << "Activity type " << record->kind << " is not supported.";
          break;
      }
    } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
      break;
    } else {
      return errors::Internal("Parse cupti activity buffer error.");
    }
  }

  // Report dropped records.
  size_t dropped;
  RETURN_IF_CUPTI_ERROR(cupti_interface_->ActivityGetNumDroppedRecords(
      context, stream_id, &dropped));
  if (dropped != 0) {
    uint32 device_id = -1;
    RETURN_IF_CUPTI_ERROR(cupti_interface_->GetDeviceId(context, &device_id));
    collector_->OnEventsDropped("cupti activity buffer full", dropped);
  }
  return OkStatus();
}

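// Returns a hostname-prefixed error string describing why GPU profiling is
// unavailable (no GPU, missing root access for libcupti, or libcupti failed
// to load), or an empty string if profiling should work.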
/*static*/ std::string CuptiTracer::ErrorIfAny() {
  if (CuptiTracer::NumGpus() == 0) {
    return ErrorWithHostname("No GPU detected.");
  } else if (CuptiTracer::GetCuptiTracerSingleton()->NeedRootAccess()) {
    return ErrorWithHostname(
        "Insufficient privilege to run libcupti (you need root permission).");
  } else if (CuptiTracer::GetTimestamp() == 0) {
    return ErrorWithHostname(
        "Failed to load libcupti (is it installed and accessible?)");
  }
  return "";
}

}  // namespace profiler
}  // namespace tensorflow