1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/core/profiler/internal/gpu/cupti_tracer.h"
17
18 #include "absl/container/flat_hash_map.h"
19 #include "absl/container/flat_hash_set.h"
20 #include "absl/container/node_hash_map.h"
21 #include "absl/container/node_hash_set.h"
22 #include "third_party/gpus/cuda/extras/CUPTI/include/cupti_activity.h"
23 #include "third_party/gpus/cuda/extras/CUPTI/include/generated_nvtx_meta.h"
24 #include "tensorflow/core/lib/gtl/cleanup.h"
25 #include "tensorflow/core/platform/env.h"
26 #include "tensorflow/core/platform/errors.h"
27 #include "tensorflow/core/platform/host_info.h"
28 #include "tensorflow/core/platform/logging.h"
29 #include "tensorflow/core/platform/macros.h"
30 #include "tensorflow/core/profiler/internal/cpu/annotation_stack.h"
31 #include "tensorflow/core/profiler/internal/gpu/cupti_collector.h"
32 #include "tensorflow/core/profiler/internal/gpu/nvtx_utils.h"
33
34 namespace tensorflow {
35 namespace profiler {
36
37 namespace {
38
// Per-thread count of CuptiApiTracingDisabler objects that are currently alive
// on this thread. Zero means no disabler scope is active.
static thread_local int internalCuCall = 0;

// Scope guard that temporarily disables CUPTI API tracing for the current
// thread for the lifetime of the object. Intended to wrap API calls that are
// initiated by the tracer itself. Construction increments the per-thread
// counter above and destruction decrements it, so scopes may nest.
class CuptiApiTracingDisabler {
 public:
  CuptiApiTracingDisabler() { internalCuCall++; }
  ~CuptiApiTracingDisabler() { internalCuCall--; }

  // Not copyable: an implicitly generated copy would not increment the counter
  // but its destructor would still decrement it, leaving the count unbalanced.
  CuptiApiTracingDisabler(const CuptiApiTracingDisabler &) = delete;
  CuptiApiTracingDisabler &operator=(const CuptiApiTracingDisabler &) = delete;
};
48
ToStatus(CUptiResult result)49 Status ToStatus(CUptiResult result) {
50 if (result == CUPTI_SUCCESS) {
51 return Status::OK();
52 }
53 const char *str = nullptr;
54 cuptiGetResultString(result, &str);
55 return errors::Unavailable("CUPTI error: ", str ? str : "<unknown>");
56 }
57
ToStatus(CUresult result)58 Status ToStatus(CUresult result) {
59 if (result == CUDA_SUCCESS) {
60 return Status::OK();
61 }
62 const char *str = nullptr;
63 cuGetErrorName(result, &str);
64 return errors::Unavailable("CUDA error: ", str ? str : "<unknown>");
65 }
66
LogIfError(const Status & status)67 inline void LogIfError(const Status &status) {
68 if (status.ok()) return;
69 LOG(ERROR) << status.error_message();
70 }
71
72 // Maps an OverheadKind enum to a const string.
getActivityOverheadKindString(CUpti_ActivityOverheadKind kind)73 const char *getActivityOverheadKindString(CUpti_ActivityOverheadKind kind) {
74 switch (kind) {
75 case CUPTI_ACTIVITY_OVERHEAD_DRIVER_COMPILER:
76 return "COMPILER";
77 case CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH:
78 return "BUFFER_FLUSH";
79 case CUPTI_ACTIVITY_OVERHEAD_CUPTI_INSTRUMENTATION:
80 return "INSTRUMENTATION";
81 case CUPTI_ACTIVITY_OVERHEAD_CUPTI_RESOURCE:
82 return "RESOURCE";
83 default:
84 break;
85 }
86 return "<UNKNOWN>";
87 }
88
getActivityUnifiedMemoryKindString(CUpti_ActivityUnifiedMemoryCounterKind kind)89 const char *getActivityUnifiedMemoryKindString(
90 CUpti_ActivityUnifiedMemoryCounterKind kind) {
91 switch (kind) {
92 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD:
93 return "UM_BYTES_TRANSFER_HTOD";
94 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH:
95 return "UM_BYTES_TRANSFER_DTOH";
96 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT:
97 return "UM_CPU_PAGE_FAULT";
98 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT:
99 return "UM_GPU_PAGE_FAULT";
100 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING:
101 return "UM_THRASHING";
102 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING:
103 return "UM_THROTTLING";
104 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP:
105 return "UM_REMOTE_MAP";
106 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOD:
107 return "UM_BYTES_TRANSFER_DTOD";
108 default:
109 break;
110 }
111 return "<UNKNOWN>";
112 }
113
// CUPTI_ERROR_INSUFFICIENT_PRIVILEGES was introduced in CUDA 10.1. Define it
// for older toolkits so that the check in RETURN_IF_CUPTI_ERROR still compiles.
#if CUDA_VERSION <= 10000
#define CUPTI_ERROR_INSUFFICIENT_PRIVILEGES 35
#endif

// Evaluates `expr`, which must yield a CUptiResult, exactly once. On any result
// other than CUPTI_SUCCESS it logs the failing expression together with the
// CUPTI error string and returns from the enclosing function:
//   - errors::PermissionDenied for CUPTI_ERROR_INSUFFICIENT_PRIVILEGES,
//   - errors::Internal (including the error string) otherwise.
// Requires a `cupti_interface_` member to be in scope at the expansion site
// and the enclosing function to return Status.
#define RETURN_IF_CUPTI_ERROR(expr)                                          \
  do {                                                                       \
    CUptiResult status = (expr);                                             \
    if (ABSL_PREDICT_FALSE(status != CUPTI_SUCCESS)) {                       \
      const char *errstr = "";                                               \
      cupti_interface_->GetResultString(status, &errstr);                    \
      LOG(ERROR) << "function " << #expr << " failed with error " << errstr; \
      if (status == CUPTI_ERROR_INSUFFICIENT_PRIVILEGES) {                   \
        return errors::PermissionDenied("CUPTI need root access!");          \
      } else {                                                               \
        return errors::Internal("CUPTI call error: ", errstr);               \
      }                                                                      \
    }                                                                        \
  } while (false)
133
Bytes2D(const CUDA_MEMCPY2D * p)134 size_t Bytes2D(const CUDA_MEMCPY2D *p) { return p->Height * p->WidthInBytes; }
135
Bytes3D(const CUDA_MEMCPY3D * p)136 size_t Bytes3D(const CUDA_MEMCPY3D *p) {
137 return p->Depth * p->Height * p->WidthInBytes;
138 }
139
140 template <typename CudaMemcpy>
MemcpyKind(const CudaMemcpy * p)141 CuptiTracerEventType MemcpyKind(const CudaMemcpy *p) {
142 if (p->srcMemoryType == CU_MEMORYTYPE_HOST &&
143 p->dstMemoryType == CU_MEMORYTYPE_DEVICE) {
144 return CuptiTracerEventType::MemcpyH2D;
145 }
146 if (p->srcMemoryType == CU_MEMORYTYPE_DEVICE &&
147 p->dstMemoryType == CU_MEMORYTYPE_HOST) {
148 return CuptiTracerEventType::MemcpyD2H;
149 }
150 if (p->srcMemoryType == CU_MEMORYTYPE_DEVICE &&
151 p->dstMemoryType == CU_MEMORYTYPE_DEVICE) {
152 return CuptiTracerEventType::MemcpyD2D;
153 }
154 return CuptiTracerEventType::Unsupported;
155 }
156
157 std::tuple<size_t /*bytes*/, CuptiTracerEventType, bool /*async*/>
DecodeDriverMemcpy(CUpti_CallbackId cbid,const void * params)158 DecodeDriverMemcpy(CUpti_CallbackId cbid, const void *params) {
159 switch (cbid) {
160 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2: {
161 const auto *p = reinterpret_cast<const cuMemcpyHtoD_v2_params *>(params);
162 return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyH2D,
163 false);
164 }
165 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2: {
166 const auto *p =
167 reinterpret_cast<const cuMemcpyHtoDAsync_v2_params *>(params);
168 return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyH2D,
169 true);
170 }
171 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2: {
172 const auto *p = reinterpret_cast<const cuMemcpyDtoH_v2_params *>(params);
173 return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyD2H,
174 false);
175 }
176 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2: {
177 const auto *p =
178 reinterpret_cast<const cuMemcpyDtoHAsync_v2_params *>(params);
179 return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyD2H,
180 true);
181 }
182 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2: {
183 const auto *p = reinterpret_cast<const cuMemcpyDtoD_v2_params *>(params);
184 return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyD2D,
185 false);
186 }
187 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2: {
188 const auto *p =
189 reinterpret_cast<const cuMemcpyDtoDAsync_v2_params *>(params);
190 return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyD2D,
191 true);
192 }
193 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy: {
194 const auto *p = reinterpret_cast<const cuMemcpy_params *>(params);
195 return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyOther,
196 false);
197 }
198 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync: {
199 const auto *p = reinterpret_cast<const cuMemcpyAsync_params *>(params);
200 return std::make_tuple(p->ByteCount, CuptiTracerEventType::MemcpyOther,
201 true);
202 }
203 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2: {
204 const auto *p = reinterpret_cast<const cuMemcpy2D_v2_params *>(params);
205 return std::make_tuple(Bytes2D(p->pCopy), MemcpyKind(p->pCopy), false);
206 }
207 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2: {
208 const auto *p =
209 reinterpret_cast<const cuMemcpy2DAsync_v2_params *>(params);
210 return std::make_tuple(Bytes2D(p->pCopy), MemcpyKind(p->pCopy), true);
211 }
212 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2: {
213 const auto *p = reinterpret_cast<const cuMemcpy3D_v2_params *>(params);
214 return std::make_tuple(Bytes3D(p->pCopy), MemcpyKind(p->pCopy), true);
215 }
216 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2: {
217 const auto *p =
218 reinterpret_cast<const cuMemcpy3DAsync_v2_params *>(params);
219 return std::make_tuple(Bytes3D(p->pCopy), MemcpyKind(p->pCopy), true);
220 }
221 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer: {
222 const auto *p2p_params =
223 reinterpret_cast<const cuMemcpyPeer_params *>(params);
224 return std::make_tuple(p2p_params->ByteCount,
225 CuptiTracerEventType::MemcpyP2P, false);
226 }
227 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync: {
228 const auto *p2p_params =
229 reinterpret_cast<const cuMemcpyPeerAsync_params *>(params);
230 return std::make_tuple(p2p_params->ByteCount,
231 CuptiTracerEventType::MemcpyP2P, true);
232 }
233 default: {
234 LOG(ERROR) << "Unsupported memcpy activity observed: " << cbid;
235 return std::make_tuple(0, CuptiTracerEventType::Unsupported, false);
236 }
237 }
238 }
239
240 std::tuple<size_t /*bytes*/, CuptiTracerEventType, bool /*async*/>
DecodeDriverMemset(CUpti_CallbackId cbid,const void * params)241 DecodeDriverMemset(CUpti_CallbackId cbid, const void *params) {
242 switch (cbid) {
243 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD8_v2: {
244 const auto *p = reinterpret_cast<const cuMemsetD8_v2_params *>(params);
245 return std::make_tuple(p->N, CuptiTracerEventType::Memset, false);
246 }
247 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD16_v2: {
248 const auto *p = reinterpret_cast<const cuMemsetD16_v2_params *>(params);
249 return std::make_tuple(p->N, CuptiTracerEventType::Memset, false);
250 }
251 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD32_v2: {
252 const auto *p = reinterpret_cast<const cuMemsetD32_v2_params *>(params);
253 return std::make_tuple(p->N, CuptiTracerEventType::Memset, false);
254 }
255 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8_v2: {
256 const auto *p = reinterpret_cast<const cuMemsetD2D8_v2_params *>(params);
257 return std::make_tuple(p->dstPitch * p->Height,
258 CuptiTracerEventType::Memset, false);
259 }
260 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16_v2: {
261 const auto *p = reinterpret_cast<const cuMemsetD2D16_v2_params *>(params);
262 return std::make_tuple(p->dstPitch * p->Height,
263 CuptiTracerEventType::Memset, false);
264 }
265 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32_v2: {
266 const auto *p = reinterpret_cast<const cuMemsetD2D32_v2_params *>(params);
267 return std::make_tuple(p->dstPitch * p->Height,
268 CuptiTracerEventType::Memset, false);
269 }
270 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD8Async: {
271 const auto *p = reinterpret_cast<const cuMemsetD8Async_params *>(params);
272 return std::make_tuple(p->N, CuptiTracerEventType::Memset, true);
273 }
274 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD16Async: {
275 const auto *p = reinterpret_cast<const cuMemsetD16Async_params *>(params);
276 return std::make_tuple(p->N, CuptiTracerEventType::Memset, true);
277 }
278 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD32Async: {
279 const auto *p = reinterpret_cast<const cuMemsetD32Async_params *>(params);
280 return std::make_tuple(p->N, CuptiTracerEventType::Memset, true);
281 }
282 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8Async: {
283 const auto *p =
284 reinterpret_cast<const cuMemsetD2D8Async_params *>(params);
285 return std::make_tuple(p->dstPitch * p->Height,
286 CuptiTracerEventType::Memset, true);
287 }
288 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16Async: {
289 const auto *p =
290 reinterpret_cast<const cuMemsetD2D16Async_params *>(params);
291 return std::make_tuple(p->dstPitch * p->Height,
292 CuptiTracerEventType::Memset, true);
293 }
294 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32Async: {
295 const auto *p =
296 reinterpret_cast<const cuMemsetD2D32Async_params *>(params);
297 return std::make_tuple(p->dstPitch * p->Height,
298 CuptiTracerEventType::Memset, true);
299 }
300 default: {
301 LOG(ERROR) << "Unsupported memset activity observed: " << cbid;
302 return std::make_tuple(0, CuptiTracerEventType::Unsupported, false);
303 }
304 }
305 }
306
307 // Cupti callback corresponding to a driver or runtime API. This global function
308 // is invoked twice for each API: at entry and at exit. The cbdata
309 // parameter is guaranteed by Cupti to be thread-safe. Most invocations are
310 // dropped to the floor and entry/exit is tracked for the APIs we deem
311 // performance-relevant.
ApiCallback(void * user_data,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)312 void CUPTIAPI ApiCallback(void *user_data, CUpti_CallbackDomain domain,
313 CUpti_CallbackId cbid,
314 const CUpti_CallbackData *cbdata) {
315 CuptiTracer *tracer = reinterpret_cast<CuptiTracer *>(user_data);
316 tracer->HandleCallback(domain, cbid, cbdata).IgnoreError();
317 }
318
319 // Callback which is invoked when an empty buffer is requested by CUPTI.
320 // Allocates an empty aligned-memory buffer. The buffer is used by CUPTI as a
321 // ring buffer where device maintains activity profiles that have been
322 // collected.
RequestCuptiActivityBuffer(uint8_t ** buffer,size_t * size,size_t * maxNumRecords)323 void CUPTIAPI RequestCuptiActivityBuffer(uint8_t **buffer, size_t *size,
324 size_t *maxNumRecords) {
325 CuptiTracer::GetCuptiTracerSingleton()->RequestActivityBuffer(buffer, size);
326 VLOG(3) << "Requested CUPTI Buffer, buffer=" << std::hex
327 << reinterpret_cast<uintptr_t>(*buffer) << std::dec
328 << " size=" << *size;
329 // Request CUPTI to fill as many records as possible in the buffer.
330 *maxNumRecords = 0;
331 }
332
333 // Callback which is invoked when a buffer containing activity records is
334 // available from CUPTI. Processes the buffer after reading activity records
335 // from it.
ProcessCuptiActivityBuffer(CUcontext context,uint32_t stream_id,uint8_t * buffer,size_t size,size_t valid_size)336 void CUPTIAPI ProcessCuptiActivityBuffer(CUcontext context, uint32_t stream_id,
337 uint8_t *buffer, size_t size,
338 size_t valid_size) {
339 VLOG(3) << "Processing CUPTI Buffer, buffer:" << std::hex
340 << reinterpret_cast<uintptr_t>(buffer) << std::dec
341 << " size: " << size << " valid_size: " << valid_size;
342 VLOG(3) << "Activity profile for stream " << stream_id;
343
344 Status status = CuptiTracer::GetCuptiTracerSingleton()->ProcessActivityBuffer(
345 context, stream_id, buffer, valid_size);
346 if (!status.ok()) {
347 LOG(ERROR) << status;
348 }
349 }
350
AddKernelEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)351 void AddKernelEventUponApiExit(CuptiTraceCollector *collector, uint32 device_id,
352 const CUpti_CallbackData *cbdata,
353 uint64 start_time, uint64 end_time) {
354 CuptiTracerEvent event{};
355 event.type = CuptiTracerEventType::Kernel;
356 event.source = CuptiTracerEventSource::DriverCallback;
357 event.name = cbdata->symbolName ? cbdata->symbolName : cbdata->functionName;
358 event.start_time_ns = start_time;
359 event.end_time_ns = end_time;
360 event.thread_id = Env::Default()->GetCurrentThreadId();
361 event.device_id = device_id;
362 event.context_id = cbdata->contextUid;
363 event.correlation_id = cbdata->correlationId;
364 VLOG(3) << "Cuda Kernel launch API exit. name=" << event.name;
365 collector->AddEvent(std::move(event));
366 }
367
368 // Performs the actual callback for both normal and P2P memcpy operations.
PopulateMemcpyCallbackEvent(CuptiTracerEventType type,const CUpti_CallbackData * cbdata,size_t num_bytes,uint32 src_device,uint32 dst_device,bool async,uint64 start_time,uint64 end_time)369 CuptiTracerEvent PopulateMemcpyCallbackEvent(
370 CuptiTracerEventType type, const CUpti_CallbackData *cbdata,
371 size_t num_bytes, uint32 src_device, uint32 dst_device, bool async,
372 uint64 start_time, uint64 end_time) {
373 CuptiTracerEvent event{};
374 event.type = type;
375 event.source = CuptiTracerEventSource::DriverCallback;
376 event.start_time_ns = start_time;
377 event.end_time_ns = end_time;
378 event.thread_id = Env::Default()->GetCurrentThreadId();
379 event.device_id = src_device;
380 event.context_id = cbdata->contextUid;
381 event.correlation_id = cbdata->correlationId;
382 event.memcpy_info.num_bytes = num_bytes;
383 event.memcpy_info.destination = dst_device;
384 event.memcpy_info.async = async;
385 // These are not populated during callback for API activities.
386 event.memcpy_info.copy_kind = CUPTI_ACTIVITY_MEMCPY_KIND_UNKNOWN;
387 event.memcpy_info.dst_mem_kind = CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN;
388 event.memcpy_info.src_mem_kind = CUPTI_ACTIVITY_MEMORY_KIND_UNKNOWN;
389 return event;
390 }
391
AddNormalMemcpyEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)392 void AddNormalMemcpyEventUponApiExit(CuptiTraceCollector *collector,
393 uint32 device_id, CUpti_CallbackId cbid,
394 const CUpti_CallbackData *cbdata,
395 uint64 start_time, uint64 end_time) {
396 size_t num_bytes;
397 CuptiTracerEventType type;
398 bool async;
399 std::tie(num_bytes, type, async) =
400 DecodeDriverMemcpy(cbid, cbdata->functionParams);
401
402 VLOG(3) << "Cuda Memcpy API exit. sz=" << num_bytes;
403 CuptiTracerEvent event =
404 PopulateMemcpyCallbackEvent(type, cbdata, num_bytes, device_id, device_id,
405 async, start_time, end_time);
406 collector->AddEvent(std::move(event));
407 }
408
AddCuMemsetEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)409 void AddCuMemsetEventUponApiExit(CuptiTraceCollector *collector,
410 uint32 device_id, CUpti_CallbackId cbid,
411 const CUpti_CallbackData *cbdata,
412 uint64 start_time, uint64 end_time) {
413 // We are casting all variants of cuMemset to cuMemsetD8 for accessing the
414 // first member attribute, a CUdeviceptr.
415 const auto *params =
416 static_cast<const cuMemsetD8_v2_params *>(cbdata->functionParams);
417 size_t num_bytes;
418 bool async;
419 CuptiTracerEventType type;
420 std::tie(num_bytes, type, async) =
421 DecodeDriverMemset(cbid, cbdata->functionParams);
422
423 CuptiTracerEvent event{};
424 event.type = type;
425 event.source = CuptiTracerEventSource::DriverCallback;
426 event.start_time_ns = start_time;
427 event.end_time_ns = end_time;
428 event.thread_id = Env::Default()->GetCurrentThreadId();
429 event.device_id = device_id;
430 event.context_id = cbdata->contextUid;
431 event.correlation_id = cbdata->correlationId;
432 event.memset_info.num_bytes = num_bytes;
433 // memset_info.kind cannot be determined from API.
434 event.memset_info.async = async;
435 VLOG(3) << "Cuda Memset API exit."
436 << " dptr=" << reinterpret_cast<void *>(params->dstDevice)
437 << " sz=" << num_bytes;
438 collector->AddEvent(std::move(event));
439 }
440
AddP2PMemcpyEventUponApiExit(CuptiTraceCollector * collector,CuptiInterface * cupti_interface,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)441 void AddP2PMemcpyEventUponApiExit(CuptiTraceCollector *collector,
442 CuptiInterface *cupti_interface,
443 uint32 device_id, CUpti_CallbackId cbid,
444 const CUpti_CallbackData *cbdata,
445 uint64 start_time, uint64 end_time) {
446 size_t num_bytes;
447 CuptiTracerEventType type;
448 bool async;
449 std::tie(num_bytes, type, async) =
450 DecodeDriverMemcpy(cbid, cbdata->functionParams);
451
452 uint32 dst_device = -1, src_device = -1;
453 const auto *p2p_params =
454 static_cast<const cuMemcpyPeer_params *>(cbdata->functionParams);
455 cupti_interface->GetDeviceId(p2p_params->srcContext, &src_device);
456 cupti_interface->GetDeviceId(p2p_params->dstContext, &dst_device);
457 VLOG(3) << "Cuda P2P Memcpy API exit, src: " << src_device
458 << " dst: " << dst_device << " size:" << num_bytes;
459 CuptiTracerEvent event =
460 PopulateMemcpyCallbackEvent(type, cbdata, num_bytes, src_device,
461 dst_device, async, start_time, end_time);
462 collector->AddEvent(std::move(event));
463 }
464
AddCuMemAllocEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)465 void AddCuMemAllocEventUponApiExit(CuptiTraceCollector *collector,
466 uint32 device_id, CUpti_CallbackId cbid,
467 const CUpti_CallbackData *cbdata,
468 uint64 start_time, uint64 end_time) {
469 const auto *params =
470 static_cast<const cuMemAlloc_v2_params *>(cbdata->functionParams);
471 CuptiTracerEvent event{};
472 event.type = CuptiTracerEventType::MemoryAlloc;
473 event.source = CuptiTracerEventSource::DriverCallback;
474 event.name = cbdata->functionName;
475 event.start_time_ns = start_time;
476 event.end_time_ns = end_time;
477 event.thread_id = Env::Default()->GetCurrentThreadId();
478 event.device_id = device_id;
479 event.context_id = cbdata->contextUid;
480 event.correlation_id = cbdata->correlationId;
481 event.memalloc_info.num_bytes = params->bytesize;
482 VLOG(3) << "Cuda MemAlloc API exit."
483 << " dptr=" << reinterpret_cast<void *>(*params->dptr)
484 << " sz=" << params->bytesize;
485 collector->AddEvent(std::move(event));
486 }
487
AddCuMemAllocPitchEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)488 void AddCuMemAllocPitchEventUponApiExit(CuptiTraceCollector *collector,
489 uint32 device_id, CUpti_CallbackId cbid,
490 const CUpti_CallbackData *cbdata,
491 uint64 start_time, uint64 end_time) {
492 const auto *params =
493 static_cast<const cuMemAllocPitch_v2_params *>(cbdata->functionParams);
494 CuptiTracerEvent event{};
495 event.type = CuptiTracerEventType::MemoryAlloc;
496 event.source = CuptiTracerEventSource::DriverCallback;
497 event.name = cbdata->functionName;
498 event.start_time_ns = start_time;
499 event.end_time_ns = end_time;
500 event.thread_id = Env::Default()->GetCurrentThreadId();
501 event.device_id = device_id;
502 event.context_id = cbdata->contextUid;
503 event.correlation_id = cbdata->correlationId;
504 const size_t size_in_bytes = *params->pPitch * params->Height;
505 event.memalloc_info.num_bytes = size_in_bytes;
506 VLOG(3) << "Cuda MemAllocPitch API exit."
507 << " dptr=" << reinterpret_cast<void *>(*params->dptr)
508 << " sz=" << size_in_bytes;
509 collector->AddEvent(std::move(event));
510 }
511
AddCuMemFreeEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)512 void AddCuMemFreeEventUponApiExit(CuptiTraceCollector *collector,
513 uint32 device_id, CUpti_CallbackId cbid,
514 const CUpti_CallbackData *cbdata,
515 uint64 start_time, uint64 end_time) {
516 const auto *params =
517 static_cast<const cuMemFree_v2_params *>(cbdata->functionParams);
518 CuptiTracerEvent event{};
519 event.type = CuptiTracerEventType::MemoryFree;
520 event.source = CuptiTracerEventSource::DriverCallback;
521 event.name = cbdata->functionName;
522 event.start_time_ns = start_time;
523 event.end_time_ns = end_time;
524 event.thread_id = Env::Default()->GetCurrentThreadId();
525 event.device_id = device_id;
526 event.context_id = cbdata->contextUid;
527 event.correlation_id = cbdata->correlationId;
528 VLOG(3) << "Cuda MemFree API exit."
529 << " dptr=" << reinterpret_cast<void *>(params->dptr);
530 collector->AddEvent(std::move(event));
531 }
532
AddGenericEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)533 void AddGenericEventUponApiExit(CuptiTraceCollector *collector,
534 uint32 device_id, CUpti_CallbackId cbid,
535 const CUpti_CallbackData *cbdata,
536 uint64 start_time, uint64 end_time) {
537 CuptiTracerEvent event{};
538 event.type = CuptiTracerEventType::Generic;
539 event.source = CuptiTracerEventSource::DriverCallback;
540 event.name = cbdata->functionName;
541 event.start_time_ns = start_time;
542 event.end_time_ns = end_time;
543 event.thread_id = Env::Default()->GetCurrentThreadId();
544 event.device_id = device_id;
545 event.context_id = cbdata->contextUid;
546 event.correlation_id = cbdata->correlationId;
547 VLOG(3) << "Observed generic API exit."
548 << " name=" << cbdata->functionName;
549 collector->AddEvent(std::move(event));
550 }
551
AddKernelActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityKernel4 * kernel)552 void AddKernelActivityEvent(CuptiTraceCollector *collector,
553 const CUpti_ActivityKernel4 *kernel) {
554 CuptiTracerEvent event{};
555 event.type = CuptiTracerEventType::Kernel;
556 event.source = CuptiTracerEventSource::Activity;
557 event.name = kernel->name;
558 event.start_time_ns = kernel->start;
559 event.end_time_ns = kernel->end;
560 event.device_id = kernel->deviceId;
561 event.context_id = kernel->contextId;
562 event.stream_id = kernel->streamId;
563 event.correlation_id = kernel->correlationId;
564 AnnotationMap::AnnotationInfo info = collector->annotation_map()->LookUp(
565 event.device_id, event.correlation_id);
566 event.annotation = info.annotation;
567 event.nvtx_range = info.nvtx_range;
568 event.kernel_info.registers_per_thread = kernel->registersPerThread;
569 event.kernel_info.static_shared_memory_usage = kernel->staticSharedMemory;
570 event.kernel_info.dynamic_shared_memory_usage = kernel->dynamicSharedMemory;
571 event.kernel_info.block_x = kernel->blockX;
572 event.kernel_info.block_y = kernel->blockY;
573 event.kernel_info.block_z = kernel->blockZ;
574 event.kernel_info.grid_x = kernel->gridX;
575 event.kernel_info.grid_y = kernel->gridY;
576 event.kernel_info.grid_z = kernel->gridZ;
577 collector->AddEvent(std::move(event));
578 }
579
AddMemcpyActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityMemcpy * memcpy)580 void AddMemcpyActivityEvent(CuptiTraceCollector *collector,
581 const CUpti_ActivityMemcpy *memcpy) {
582 CuptiTracerEvent event{};
583 switch (memcpy->copyKind) {
584 case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
585 event.type = CuptiTracerEventType::MemcpyH2D;
586 event.name = "MemcpyH2D";
587 break;
588 case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
589 event.type = CuptiTracerEventType::MemcpyD2H;
590 event.name = "MemcpyD2H";
591 break;
592 case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
593 event.type = CuptiTracerEventType::MemcpyD2D;
594 event.name = "MemcpyD2D";
595 break;
596 case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP:
597 event.type = CuptiTracerEventType::MemcpyP2P;
598 event.name = "MemcpyP2P";
599 break;
600 default:
601 event.type = CuptiTracerEventType::MemcpyOther;
602 event.name = "MemcpyOther";
603 break;
604 }
605
606 event.source = CuptiTracerEventSource::Activity;
607 event.start_time_ns = memcpy->start;
608 event.end_time_ns = memcpy->end;
609 event.device_id = memcpy->deviceId;
610 event.context_id = memcpy->contextId;
611 event.stream_id = memcpy->streamId;
612 event.correlation_id = memcpy->correlationId;
613 AnnotationMap::AnnotationInfo info = collector->annotation_map()->LookUp(
614 event.device_id, event.correlation_id);
615 event.annotation = info.annotation;
616 event.memcpy_info.copy_kind = memcpy->copyKind;
617 event.memcpy_info.num_bytes = memcpy->bytes;
618 event.memcpy_info.destination = memcpy->deviceId;
619 event.memcpy_info.async = memcpy->flags & CUPTI_ACTIVITY_FLAG_MEMCPY_ASYNC;
620 event.memcpy_info.src_mem_kind = memcpy->srcKind;
621 event.memcpy_info.dst_mem_kind = memcpy->dstKind;
622 collector->AddEvent(std::move(event));
623 }
624
625 // Invokes callback upon peer-2-peer memcpy between different GPU devices.
AddMemcpy2ActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityMemcpy2 * memcpy2)626 void AddMemcpy2ActivityEvent(CuptiTraceCollector *collector,
627 const CUpti_ActivityMemcpy2 *memcpy2) {
628 CuptiTracerEvent event{};
629 event.type = CuptiTracerEventType::MemcpyP2P;
630 event.name = "MemcpyP2P";
631 event.source = CuptiTracerEventSource::Activity;
632 event.start_time_ns = memcpy2->start;
633 event.end_time_ns = memcpy2->end;
634 event.device_id = memcpy2->srcDeviceId;
635 event.context_id = memcpy2->contextId;
636 event.stream_id = memcpy2->streamId;
637 event.correlation_id = memcpy2->correlationId;
638 AnnotationMap::AnnotationInfo info = collector->annotation_map()->LookUp(
639 event.device_id, event.correlation_id);
640 event.annotation = info.annotation;
641 event.memcpy_info.copy_kind = CUPTI_ACTIVITY_MEMCPY_KIND_PTOP;
642 event.memcpy_info.num_bytes = memcpy2->bytes;
643 event.memcpy_info.destination = memcpy2->dstDeviceId;
644 event.memcpy_info.async = memcpy2->flags & CUPTI_ACTIVITY_FLAG_MEMCPY_ASYNC;
645 event.memcpy_info.src_mem_kind = memcpy2->srcKind;
646 event.memcpy_info.dst_mem_kind = memcpy2->dstKind;
647 collector->AddEvent(std::move(event));
648 }
649
AddCuptiOverheadActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityOverhead * overhead)650 void AddCuptiOverheadActivityEvent(CuptiTraceCollector *collector,
651 const CUpti_ActivityOverhead *overhead) {
652 CuptiTracerEvent event{};
653 event.type = CuptiTracerEventType::Overhead;
654 event.name = getActivityOverheadKindString(overhead->overheadKind);
655 event.source = CuptiTracerEventSource::Activity;
656 event.start_time_ns = overhead->start;
657 event.end_time_ns = overhead->end;
658 // If the overhead is not related to a device, we assign it to device 0.
659 event.device_id = 0;
660 // NOTE: no correlation id.
661 switch (overhead->objectKind) {
662 case CUPTI_ACTIVITY_OBJECT_UNKNOWN:
663 // Don't know how to deal with such activities because of we need either
664 // attribute it to a GPU stream or a CPU thread.
665 return;
666
667 case CUPTI_ACTIVITY_OBJECT_THREAD:
668 case CUPTI_ACTIVITY_OBJECT_PROCESS:
669 event.thread_id = overhead->objectId.pt.threadId;
670 break;
671 case CUPTI_ACTIVITY_OBJECT_STREAM:
672 event.stream_id = overhead->objectId.dcs.streamId;
673 TF_FALLTHROUGH_INTENDED;
674 case CUPTI_ACTIVITY_OBJECT_DEVICE:
675 case CUPTI_ACTIVITY_OBJECT_CONTEXT:
676 event.device_id = overhead->objectId.dcs.deviceId;
677 break;
678 default:
679 LOG(ERROR) << "Unexpected object kind: " << overhead->objectKind;
680 return;
681 }
682 collector->AddEvent(std::move(event));
683 }
684
AddUnifiedMemoryActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityUnifiedMemoryCounter2 * record)685 void AddUnifiedMemoryActivityEvent(
686 CuptiTraceCollector *collector,
687 const CUpti_ActivityUnifiedMemoryCounter2 *record) {
688 VLOG(3) << "Cuda Unified Memory Activity, kind: " << record->counterKind
689 << " src: " << record->srcId << " dst: " << record->dstId;
690 CuptiTracerEvent event{};
691 event.type = CuptiTracerEventType::UnifiedMemory;
692 event.name = getActivityUnifiedMemoryKindString(record->counterKind);
693 event.source = CuptiTracerEventSource::Activity;
694 event.start_time_ns = record->start;
695 if (record->counterKind ==
696 CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT ||
697 record->counterKind ==
698 CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING ||
699 record->counterKind ==
700 CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP ||
701 record->end <= record->start) {
702 // If the end time is not valid, trim it so that it can be shown on the UI.
703 event.end_time_ns = record->start + 1;
704 } else {
705 event.end_time_ns = record->end;
706 }
707 event.device_id = record->srcId;
708 // NOTE: not context id and correlation id.
709
710 // For visualization purpose, we assign a pseudo stream id for each
711 // record->counterKind of unified memory related events.
712 constexpr int kPseudoStreamId = 0x10000000;
713 event.stream_id = kPseudoStreamId + record->counterKind;
714 event.memcpy_info.copy_kind = CUPTI_ACTIVITY_MEMCPY_KIND_UNKNOWN;
715 // Check whether the activity is byte transfer.
716 if (record->counterKind ==
717 CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD ||
718 record->counterKind ==
719 CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH ||
720 record->counterKind ==
721 CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOD) {
722 event.memcpy_info.num_bytes = record->value;
723 } else {
724 event.memcpy_info.num_bytes = 0;
725 }
726 event.memcpy_info.destination = record->dstId;
727 event.memcpy_info.async = false;
728 collector->AddEvent(std::move(event));
729 }
730
AddMemoryActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityMemory * memory)731 void AddMemoryActivityEvent(CuptiTraceCollector *collector,
732 const CUpti_ActivityMemory *memory) {
733 CuptiTracerEvent event{};
734 event.name = absl::StrCat("Memory ", GetMemoryKindName(memory->memoryKind));
735 event.type = CuptiTracerEventType::MemoryResidency;
736 event.source = CuptiTracerEventSource::Activity;
737 event.start_time_ns = memory->start;
738 event.end_time_ns = std::max(memory->end, memory->start + 1);
739 event.device_id = memory->deviceId;
740 event.context_id = memory->contextId;
741 // Assign to default stream (0) so that event is included during Flush().
742 event.stream_id = 0;
743 event.memory_residency_info.num_bytes = memory->bytes;
744 event.memory_residency_info.mem_kind = memory->memoryKind;
745 event.memory_residency_info.address = memory->address;
746 VLOG(5) << "Cuda activity " << event.name
747 << " addr: " << reinterpret_cast<void *>(memory->address)
748 << " bytes: " << memory->bytes;
749 collector->AddEvent(std::move(event));
750 }
751
AddMemsetActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityMemset * memset)752 void AddMemsetActivityEvent(CuptiTraceCollector *collector,
753 const CUpti_ActivityMemset *memset) {
754 auto mem_kind = memset->memoryKind;
755 CuptiTracerEvent event{};
756 event.type = CuptiTracerEventType::Memset;
757 event.source = CuptiTracerEventSource::Activity;
758 event.name = absl::StrCat("Memset ", mem_kind);
759 event.start_time_ns = memset->start;
760 event.end_time_ns = std::max(memset->end, memset->start + 1);
761 event.device_id = memset->deviceId;
762 event.correlation_id = memset->correlationId;
763 event.context_id = memset->contextId;
764 event.stream_id = memset->streamId;
765 event.memset_info.num_bytes = memset->bytes;
766 event.memset_info.mem_kind = mem_kind;
767 event.memset_info.async = (memset->flags & CUPTI_ACTIVITY_FLAG_MEMSET_ASYNC);
768 VLOG(5) << "Cuda activity " << event.name << " bytes: " << memset->bytes
769 << " async: " << event.memset_info.async;
770 collector->AddEvent(std::move(event));
771 }
772
AddSynchronizationActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivitySynchronization * sync)773 void AddSynchronizationActivityEvent(
774 CuptiTraceCollector *collector, const CUpti_ActivitySynchronization *sync) {
775 CuptiTracerEvent event{};
776 event.type = CuptiTracerEventType::Generic;
777 event.source = CuptiTracerEventSource::Activity;
778 switch (sync->type) {
779 case CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_EVENT_SYNCHRONIZE:
780 event.name = "cuEventSynchronize";
781 break;
782 case CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_WAIT_EVENT:
783 event.name = "cuStreamWaitEvent";
784 break;
785 case CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_SYNCHRONIZE:
786 event.name = "cuStreamSynchronize";
787 break;
788 case CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_CONTEXT_SYNCHRONIZE:
789 event.name = "cuCtxSynchronize";
790 break;
791 default:
792 event.name = "unknown synchronization event";
793 break;
794 }
795 event.start_time_ns = sync->start;
796 event.end_time_ns = std::max(sync->end, sync->start + 1);
797 event.correlation_id = sync->correlationId;
798 event.context_id = sync->contextId;
799 VLOG(5) << "Cuda activity " << event.name;
800 collector->AddEvent(std::move(event));
801 }
802
803 // This hook uses cupti activity api to measure device side activities.
804 class CuptiDriverApiHookWithActivityApi : public CuptiDriverApiHook {
805 public:
CuptiDriverApiHookWithActivityApi(const CuptiTracerOptions & option,CuptiInterface * cupti_interface,CuptiTraceCollector * collector)806 CuptiDriverApiHookWithActivityApi(const CuptiTracerOptions &option,
807 CuptiInterface *cupti_interface,
808 CuptiTraceCollector *collector)
809 : option_(option),
810 cupti_interface_(cupti_interface),
811 collector_(collector) {}
812
OnDriverApiEnter(int device_id,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)813 Status OnDriverApiEnter(int device_id, CUpti_CallbackDomain domain,
814 CUpti_CallbackId cbid,
815 const CUpti_CallbackData *cbdata) override {
816 // Stash away the current Cupti timestamp into cbdata.
817 *cbdata->correlationData =
818 option_.required_callback_api_events ? CuptiTracer::GetTimestamp() : 0;
819 return Status::OK();
820 }
OnDriverApiExit(int device_id,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)821 Status OnDriverApiExit(int device_id, CUpti_CallbackDomain domain,
822 CUpti_CallbackId cbid,
823 const CUpti_CallbackData *cbdata) override {
824 // If we are not collecting CPU events from Callback API, we can return now.
825 if (!option_.required_callback_api_events) {
826 return Status::OK();
827 }
828
829 // Grab timestamp for API exit. API entry timestamp saved in cbdata.
830 uint64 end_tsc = CuptiTracer::GetTimestamp();
831 uint64 start_tsc = *cbdata->correlationData;
832 TrackContext(cbid, cbdata->context);
833 return AddDriverApiCallbackEvent(collector_, cupti_interface_, device_id,
834 start_tsc, end_tsc, domain, cbid, cbdata);
835 }
SyncAndFlush()836 Status SyncAndFlush() override {
837 if (option_.sync_devices_before_stop) {
838 CuptiApiTracingDisabler disabler;
839 absl::MutexLock lock(&mutex_);
840 for (auto &ctx : contexts_) {
841 cuCtxPushCurrent(ctx);
842 cuCtxSynchronize(); // Ignore error here for best effort.
843 CUcontext current;
844 cuCtxPopCurrent(¤t);
845 }
846 }
847 return Status::OK();
848 }
849
850 private:
TrackContext(CUpti_CallbackId cbid,CUcontext ctx)851 void TrackContext(CUpti_CallbackId cbid, CUcontext ctx) {
852 if (!option_.sync_devices_before_stop) return;
853 if (ctx == NULL) return;
854 absl::MutexLock lock(&mutex_);
855 if (cbid == CUPTI_DRIVER_TRACE_CBID_cuCtxDestroy_v2 ||
856 cbid == CUPTI_DRIVER_TRACE_CBID_cuCtxDestroy) {
857 contexts_.erase(ctx);
858 } else {
859 contexts_.emplace(ctx);
860 }
861 }
862
863 const CuptiTracerOptions option_;
864 CuptiInterface *cupti_interface_;
865 CuptiTraceCollector *collector_;
866 absl::Mutex mutex_;
867 absl::flat_hash_set<CUcontext> contexts_ TF_GUARDED_BY(mutex_);
868
869 TF_DISALLOW_COPY_AND_ASSIGN(CuptiDriverApiHookWithActivityApi);
870 };
871
872 struct KernelRecord {
873 const char *kernel_name;
874 // TODO(csigg): cuStreamGetCtx introduced in CUDA 9.2 would allow us to only
875 // record the stream and infer the context during collection.
876 CUcontext context;
877 CUstream stream;
878 uint32 correlation_id;
879 CUevent start_event;
880 CUevent stop_event;
881 KernelDetails details;
882 uint64 start_timestamp;
883 };
884
885 struct MemcpyRecord {
886 CuptiTracerEventType type;
887 size_t size_bytes;
888 CUcontext context;
889 CUstream stream;
890 uint32 correlation_id;
891 bool async;
892 CUevent start_event;
893 CUevent stop_event;
894 uint64 start_timestamp;
895 };
896
CreateAndRecordEvent(CUevent * event,CUstream stream)897 Status CreateAndRecordEvent(CUevent *event, CUstream stream) {
898 CuptiApiTracingDisabler disabler;
899 TF_RETURN_IF_ERROR(ToStatus(cuEventCreate(event, CU_EVENT_DEFAULT)));
900 return ToStatus(cuEventRecord(*event, stream));
901 }
902
#if CUDA_VERSION >= 10000
// Maintain and restore current thread's CUDA context.
// Note: cuStreamGetCtx only available after CUDA 9.2.
//
// On construction, looks up the context that owns `stream`, resolves its
// device ordinal and pushes the context onto the calling thread's context
// stack. On destruction, pops the context again if (and only if) the push
// succeeded. Instances are not copyable, so the pushed context is popped
// exactly once.
class ScopedCudaContext {
 public:
  explicit ScopedCudaContext(CUstream stream) : stream_(stream) {
    CuptiApiTracingDisabler disabler;  // don't trace cuda call in this func.
    CUcontext context;
    if (cuStreamGetCtx(stream, &context) != CUDA_SUCCESS) return;
    context_ = context;
    uint32 device_ordinal;
    if (cuptiGetDeviceId(context, &device_ordinal) != CUPTI_SUCCESS) return;
    device_ordinal_ = device_ordinal;
    context_pushed_ = cuCtxPushCurrent(context) == CUDA_SUCCESS;
  }
  ~ScopedCudaContext() {
    if (!context_pushed_) return;
    CuptiApiTracingDisabler disabler;  // don't trace cuda call in this func.
    cuCtxPopCurrent(&*context_);
  }

  // If successful, return the device ordinal of the relevant cuda stream.
  // Otherwise absl::nullopt;
  absl::optional<uint32> GetDeviceOrdinal() const { return device_ordinal_; }

  // If successful, return the cuda context of the relevant cuda stream.
  // Otherwise absl::nullopt;
  absl::optional<CUcontext> GetContext() const { return context_; }

 private:
  CUstream stream_;
  // Set once the owning context of `stream_` has been resolved.
  absl::optional<CUcontext> context_;
  // Set once the device of `context_` has been resolved.
  absl::optional<uint32> device_ordinal_;
  // True iff the constructor pushed `context_` and the destructor must pop.
  bool context_pushed_ = false;

  // A copy would pop the context a second time in its own destructor.
  TF_DISALLOW_COPY_AND_ASSIGN(ScopedCudaContext);
};
#endif
939
940 // Stores a series of kernel and memcpy records.
941 class CudaEventRecorder {
942 public:
CudaEventRecorder(CuptiInterface * cupti_interface,CuptiTraceCollector * collector,int ordinal)943 CudaEventRecorder(CuptiInterface *cupti_interface,
944 CuptiTraceCollector *collector, int ordinal)
945 : cupti_interface_(cupti_interface),
946 collector_(collector),
947 ordinal_(ordinal) {
948 device_name_ = absl::StrCat("gpu ", ordinal); // default.
949 CUdevice device;
950 if (cuDeviceGet(&device, ordinal) == CUDA_SUCCESS) {
951 char name[100];
952 if (cuDeviceGetName(name, sizeof(name), device) == CUDA_SUCCESS) {
953 device_name_ = name;
954 }
955 }
956 }
957
958 // Registers the start of a kernel launch. The returned index should be passed
959 // to StopKernel() after the kernel launch has completed.
960 template <typename T>
StartKernel(const char * kernel_name,CUcontext context,uint32 correlation_id,const T * params)961 size_t StartKernel(const char *kernel_name, CUcontext context,
962 uint32 correlation_id, const T *params) {
963 CUstream stream = params->hStream;
964 KernelRecord record = {kernel_name, context, stream, correlation_id};
965 record.details.registers_per_thread = 0; // unknown.
966 record.details.static_shared_memory_usage = params->sharedMemBytes;
967 record.details.dynamic_shared_memory_usage = 0; // unknown
968 record.details.block_x = params->blockDimX;
969 record.details.block_y = params->blockDimY;
970 record.details.block_z = params->blockDimZ;
971 record.details.grid_x = params->gridDimX;
972 record.details.grid_y = params->gridDimY;
973 record.details.grid_z = params->gridDimZ;
974 record.start_timestamp = CuptiTracer::GetTimestamp();
975 LogIfError(CreateAndRecordEvent(&record.start_event, stream));
976 absl::MutexLock lock(&mutex_);
977 if (stopped_) return -1;
978 kernel_records_.push_back(record);
979 return kernel_records_.size() - 1;
980 }
StopKernel(size_t index)981 uint64 StopKernel(size_t index) {
982 absl::MutexLock lock(&mutex_);
983 if (index >= kernel_records_.size()) return 0;
984 auto &record = kernel_records_[index];
985 LogIfError(CreateAndRecordEvent(&record.stop_event, record.stream));
986 return record.start_timestamp;
987 }
988
989 // Registers the start of a copy operation. The returned index should be
990 // passed to StopMemcpy() after the memcpy has completed.
StartMemcpy(CuptiTracerEventType type,size_t size_bytes,CUcontext context,CUstream stream,uint32 correlation_id,bool async)991 size_t StartMemcpy(CuptiTracerEventType type, size_t size_bytes,
992 CUcontext context, CUstream stream, uint32 correlation_id,
993 bool async) {
994 MemcpyRecord record = {type, size_bytes, context,
995 stream, correlation_id, async};
996 record.start_timestamp = CuptiTracer::GetTimestamp();
997 LogIfError(CreateAndRecordEvent(&record.start_event, stream));
998 absl::MutexLock lock(&mutex_);
999 if (stopped_) return -1;
1000 memcpy_records_.push_back(record);
1001 return memcpy_records_.size() - 1;
1002 }
StopMemcpy(size_t index)1003 uint64 StopMemcpy(size_t index) {
1004 absl::MutexLock lock(&mutex_);
1005 if (index >= memcpy_records_.size()) return 0;
1006 auto &record = memcpy_records_[index];
1007 LogIfError(CreateAndRecordEvent(&record.stop_event, record.stream));
1008 return record.start_timestamp;
1009 }
1010
Stop()1011 Status Stop() {
1012 {
1013 absl::MutexLock lock(&mutex_);
1014 stopped_ = true;
1015 LOG(INFO) << "Collecting " << kernel_records_.size()
1016 << " kernel records, " << memcpy_records_.size()
1017 << " memcpy records.";
1018
1019 // Gather all profiled streams and contexts.
1020 for (const auto &record : kernel_records_) {
1021 TF_RETURN_IF_ERROR(
1022 AddStreamInfo(record.context, record.stream, "Kernel"));
1023 }
1024 for (const auto &record : memcpy_records_) {
1025 TF_RETURN_IF_ERROR(AddStreamInfo(record.context, record.stream,
1026 GetTraceEventTypeName(record.type)));
1027 }
1028 }
1029
1030 // Synchronize all contexts, record end events, synchronize again.
1031 // This scheme is an unreliable measure to associate a event with the wall
1032 // time. There are chances that other threads might enque kernels which
1033 // delay the second synchronization.
1034 TF_RETURN_IF_ERROR(Synchronize());
1035 for (auto &pair : context_infos_) {
1036 TF_RETURN_IF_ERROR(ToStatus(cuCtxSetCurrent(pair.first)));
1037 TF_RETURN_IF_ERROR(CreateAndRecordEvent(&pair.second.end_event, nullptr));
1038 }
1039
1040 TF_RETURN_IF_ERROR(Synchronize());
1041 end_walltime_us_ = Env::Default()->NowMicros();
1042 return Status::OK();
1043 }
1044
Flush(AnnotationMap * annotation_map)1045 Status Flush(AnnotationMap *annotation_map) {
1046 auto kernel_records = ConsumeKernelRecords();
1047 auto memcpy_records = ConsumeMemcpyRecords();
1048 for (const auto &record : kernel_records) {
1049 TF_RETURN_IF_ERROR(SaveRecord(record, annotation_map));
1050 }
1051 for (const auto &record : memcpy_records) {
1052 TF_RETURN_IF_ERROR(SaveRecord(record, annotation_map));
1053 }
1054 return Status::OK();
1055 }
1056
ConsumeKernelRecords()1057 std::vector<KernelRecord> ConsumeKernelRecords() {
1058 absl::MutexLock lock(&mutex_);
1059 return std::move(kernel_records_);
1060 }
ConsumeMemcpyRecords()1061 std::vector<MemcpyRecord> ConsumeMemcpyRecords() {
1062 absl::MutexLock lock(&mutex_);
1063 return std::move(memcpy_records_);
1064 }
1065
1066 private:
1067 struct ContextInfo {
1068 uint32 context_id = 0;
1069 int num_streams = 0;
1070 CUevent end_event;
1071 };
1072
1073 struct StreamInfo {
1074 uint32 stream_id = 0;
1075 std::string name;
1076 int index; // 0 is reserved for null stream.
1077 const ContextInfo *ctx_info;
1078 };
1079
1080 // Synchronizes all contexts.
Synchronize() const1081 Status Synchronize() const {
1082 CuptiApiTracingDisabler disabler;
1083 for (const auto &pair : context_infos_) {
1084 TF_RETURN_IF_ERROR(ToStatus(cuCtxSetCurrent(pair.first)));
1085 TF_RETURN_IF_ERROR(ToStatus(cuCtxSynchronize()));
1086 }
1087 return Status::OK();
1088 }
1089
1090 // Returns element from context_infos_, adding it if not yet present.
GetContextInfo(CUcontext context,ContextInfo ** ctx_info_ptr)1091 Status GetContextInfo(CUcontext context, ContextInfo **ctx_info_ptr) {
1092 auto it = context_infos_.find(context);
1093
1094 if (it == context_infos_.end()) {
1095 uint32 context_id = 0;
1096 RETURN_IF_CUPTI_ERROR(
1097 cupti_interface_->GetContextId(context, &context_id));
1098 ContextInfo ctx_info = {context_id};
1099 it = context_infos_.emplace(context, ctx_info).first;
1100 }
1101
1102 *ctx_info_ptr = &it->second;
1103 return Status::OK();
1104 }
1105
1106 // Adds element to stream_infos_ if not yet present. If present, clear name
1107 // if it doesn't match parameter.
AddStreamInfo(CUcontext context,CUstream stream,absl::string_view name)1108 Status AddStreamInfo(CUcontext context, CUstream stream,
1109 absl::string_view name) {
1110 StreamKey key(context, stream);
1111 auto it = stream_infos_.find(key);
1112 if (it != stream_infos_.end()) {
1113 if (it->second.name != name) {
1114 it->second.name.clear(); // Stream with inconsistent names, clear it.
1115 }
1116 return Status::OK();
1117 }
1118
1119 ContextInfo *ctx_info;
1120 TF_RETURN_IF_ERROR(GetContextInfo(context, &ctx_info));
1121 int index = stream ? ++ctx_info->num_streams : 0;
1122 uint32 stream_id = 0;
1123 #if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
1124 RETURN_IF_CUPTI_ERROR(
1125 cupti_interface_->GetStreamIdEx(context, stream, 1, &stream_id));
1126 #else
1127 RETURN_IF_CUPTI_ERROR(
1128 cupti_interface_->GetStreamIdEx(context, stream, 0, &stream_id));
1129 #endif
1130
1131 StreamInfo stream_info = {stream_id, static_cast<std::string>(name), index,
1132 ctx_info};
1133 stream_infos_.emplace(key, stream_info);
1134 return Status::OK();
1135 }
1136
1137 // Returns time in microseconds between events recorded on the GPU.
GetElapsedTimeUs(CUevent start,CUevent stop)1138 static uint64_t GetElapsedTimeUs(CUevent start, CUevent stop) {
1139 CuptiApiTracingDisabler disabler;
1140 float elapsed_ms = 0.0f;
1141 LogIfError(ToStatus(cuEventElapsedTime(&elapsed_ms, start, stop)));
1142 return static_cast<uint64>(
1143 std::llroundf(1000 * std::max(elapsed_ms, 0.0f)));
1144 }
1145
SaveRecord(const KernelRecord & record,AnnotationMap * annotation_map) const1146 Status SaveRecord(const KernelRecord &record,
1147 AnnotationMap *annotation_map) const {
1148 if (!record.start_event || !record.stop_event) {
1149 return Status::OK();
1150 }
1151 const auto &stream_info =
1152 stream_infos_.at(StreamKey(record.context, record.stream));
1153 auto start_us =
1154 GetElapsedTimeUs(record.start_event, stream_info.ctx_info->end_event);
1155 auto elapsed_us = GetElapsedTimeUs(record.start_event, record.stop_event);
1156
1157 std::string annotation;
1158
1159 CuptiTracerEvent event{};
1160 event.type = CuptiTracerEventType::Kernel;
1161 event.source = CuptiTracerEventSource::Activity; // on gpu device.
1162 event.name = record.kernel_name;
1163 event.start_time_ns = (end_walltime_us_ - start_us) * 1000;
1164 event.end_time_ns = event.start_time_ns + elapsed_us * 1000;
1165 event.device_id = ordinal_;
1166 event.context_id = stream_info.ctx_info->context_id;
1167 event.stream_id = stream_info.stream_id;
1168 event.correlation_id = record.correlation_id;
1169 AnnotationMap::AnnotationInfo info = collector_->annotation_map()->LookUp(
1170 event.device_id, event.correlation_id);
1171 event.annotation = info.annotation;
1172 event.kernel_info = record.details;
1173 collector_->AddEvent(std::move(event));
1174 return Status::OK();
1175 }
1176
SaveRecord(const MemcpyRecord & record,AnnotationMap * annotation_map) const1177 Status SaveRecord(const MemcpyRecord &record,
1178 AnnotationMap *annotation_map) const {
1179 if (!record.start_event || !record.stop_event) {
1180 return Status::OK();
1181 }
1182 const auto &stream_info =
1183 stream_infos_.at(StreamKey(record.context, record.stream));
1184 auto start_us =
1185 GetElapsedTimeUs(record.start_event, stream_info.ctx_info->end_event);
1186 auto elapsed_us = GetElapsedTimeUs(record.start_event, record.stop_event);
1187
1188 CuptiTracerEvent event{};
1189 event.type = record.type;
1190 event.name = GetTraceEventTypeName(event.type);
1191 event.source = CuptiTracerEventSource::Activity;
1192 event.start_time_ns = (end_walltime_us_ - start_us) * 1000;
1193 event.end_time_ns = event.start_time_ns + elapsed_us * 1000;
1194 event.device_id = ordinal_;
1195 event.context_id = stream_info.ctx_info->context_id;
1196 event.stream_id = stream_info.stream_id;
1197 event.correlation_id = record.correlation_id;
1198 AnnotationMap::AnnotationInfo info = collector_->annotation_map()->LookUp(
1199 event.device_id, event.correlation_id);
1200 event.annotation = info.annotation;
1201 event.memcpy_info.num_bytes = record.size_bytes;
1202 // TODO: support MemcpyD2D where destination != source;
1203 event.memcpy_info.destination = ordinal_;
1204 event.memcpy_info.async = record.async;
1205 // TODO: set src_mem_kind and dst_mem_kind.
1206 collector_->AddEvent(std::move(event));
1207 return Status::OK();
1208 }
1209
1210 absl::Mutex mutex_;
1211 bool stopped_ TF_GUARDED_BY(mutex_) = false;
1212 std::vector<KernelRecord> kernel_records_ TF_GUARDED_BY(mutex_);
1213 std::vector<MemcpyRecord> memcpy_records_ TF_GUARDED_BY(mutex_);
1214
1215 CuptiInterface *cupti_interface_;
1216 CuptiTraceCollector *collector_;
1217 const int ordinal_;
1218 std::string device_name_;
1219 uint64 end_walltime_us_;
1220 // Include context in key to distinguish null streams.
1221 using StreamKey = std::pair<CUcontext, CUstream>;
1222
1223 absl::node_hash_map<CUcontext, ContextInfo> context_infos_;
1224 absl::flat_hash_map<StreamKey, StreamInfo> stream_infos_;
1225 };
1226
1227 // This hook uses cuda events to measure device side activities.
1228 class CuptiDriverApiHookWithCudaEvent : public CuptiDriverApiHook {
1229 public:
CuptiDriverApiHookWithCudaEvent(const CuptiTracerOptions & option,CuptiInterface * cupti_interface,CuptiTraceCollector * collector)1230 CuptiDriverApiHookWithCudaEvent(const CuptiTracerOptions &option,
1231 CuptiInterface *cupti_interface,
1232 CuptiTraceCollector *collector)
1233 : option_(option),
1234 cupti_interface_(cupti_interface),
1235 collector_(collector) {
1236 int num_gpus = CuptiTracer::NumGpus();
1237 cuda_event_recorders_.reserve(num_gpus);
1238 for (int i = 0; i < num_gpus; ++i) {
1239 cuda_event_recorders_.emplace_back(
1240 absl::make_unique<CudaEventRecorder>(cupti_interface, collector, i));
1241 }
1242 }
~CuptiDriverApiHookWithCudaEvent()1243 ~CuptiDriverApiHookWithCudaEvent() {
1244 for (auto *callback_context : callback_contexts_) delete callback_context;
1245 }
1246
OnDriverApiEnter(int device_id,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)1247 Status OnDriverApiEnter(int device_id, CUpti_CallbackDomain domain,
1248 CUpti_CallbackId cbid,
1249 const CUpti_CallbackData *cbdata) override {
1250 auto *recorder = cuda_event_recorders_[device_id].get();
1251 switch (cbid) {
1252 case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel: {
1253 DCHECK_NE(cbdata->symbolName, nullptr);
1254 const auto *params =
1255 static_cast<const cuLaunchKernel_params *>(cbdata->functionParams);
1256 *cbdata->correlationData = recorder->StartKernel<cuLaunchKernel_params>(
1257 cbdata->symbolName, cbdata->context, cbdata->correlationId, params);
1258 break;
1259 }
1260 case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel: {
1261 DCHECK_NE(cbdata->symbolName, nullptr);
1262 const auto *params =
1263 static_cast<const cuLaunchCooperativeKernel_params *>(
1264 cbdata->functionParams);
1265 *cbdata->correlationData =
1266 recorder->StartKernel<cuLaunchCooperativeKernel_params>(
1267 cbdata->symbolName, cbdata->context, cbdata->correlationId,
1268 params);
1269 break;
1270 }
1271 case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice: {
1272 #if CUDA_VERSION >= 10000
1273 const auto *params =
1274 static_cast<const cuLaunchCooperativeKernelMultiDevice_params *>(
1275 cbdata->functionParams);
1276 std::vector<uint32> record_indices;
1277 record_indices.reserve(params->numDevices);
1278 *cbdata->correlationData = -1; // Invalid value.
1279 const auto &annotation = AnnotationStack::Get();
1280 for (int i = 0; i < params->numDevices; ++i) {
1281 CUstream stream = params->launchParamsList[i].hStream;
1282 ScopedCudaContext scoped_cuda_context(stream);
1283 auto dev_id = scoped_cuda_context.GetDeviceOrdinal();
1284 auto context = scoped_cuda_context.GetContext();
1285 if (!dev_id) return errors::Internal("Invalid CUDA stream");
1286 // Because annotation are per device, therefore we need to populate
1287 // annotation for each device involved.
1288 collector_->annotation_map()->Add(*dev_id, cbdata->correlationId,
1289 annotation, "");
1290 record_indices.push_back(
1291 cuda_event_recorders_[*dev_id]->StartKernel<CUDA_LAUNCH_PARAMS>(
1292 "CooperativeKernelMultiDevice", *context,
1293 cbdata->correlationId, &(params->launchParamsList[i])));
1294 }
1295 auto *callback_context =
1296 new CuptiApiCallbackContext(std::move(record_indices));
1297 callback_contexts_.insert(callback_context);
1298 *cbdata->correlationData = reinterpret_cast<uint64>(callback_context);
1299 #else
1300 VLOG(1) << "Unhandled cuLaunchCooperativeKernelMultiDevice.";
1301 #endif
1302 } break;
1303 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy: {
1304 const auto *params =
1305 static_cast<const cuMemcpy_params *>(cbdata->functionParams);
1306 StartMemcpy<cuMemcpy_params>(GetMemcpyType(params->src, params->dst),
1307 cbdata, recorder);
1308 break;
1309 }
1310 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync: {
1311 const auto *params =
1312 static_cast<const cuMemcpyAsync_params *>(cbdata->functionParams);
1313 StartMemcpyAsync<cuMemcpyAsync_params>(
1314 GetMemcpyType(params->src, params->dst), cbdata, recorder);
1315 break;
1316 }
1317 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2:
1318 StartMemcpy<cuMemcpyHtoD_v2_params>(CuptiTracerEventType::MemcpyH2D,
1319 cbdata, recorder);
1320 break;
1321 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2:
1322 StartMemcpyAsync<cuMemcpyHtoDAsync_v2_params>(
1323 CuptiTracerEventType::MemcpyH2D, cbdata, recorder);
1324 break;
1325 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2:
1326 StartMemcpy<cuMemcpyDtoH_v2_params>(CuptiTracerEventType::MemcpyD2H,
1327 cbdata, recorder);
1328 break;
1329 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2:
1330 StartMemcpyAsync<cuMemcpyDtoHAsync_v2_params>(
1331 CuptiTracerEventType::MemcpyD2H, cbdata, recorder);
1332 break;
1333 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2:
1334 StartMemcpy<cuMemcpyDtoD_v2_params>(CuptiTracerEventType::MemcpyD2D,
1335 cbdata, recorder);
1336 break;
1337 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2:
1338 StartMemcpyAsync<cuMemcpyDtoDAsync_v2_params>(
1339 CuptiTracerEventType::MemcpyD2D, cbdata, recorder);
1340 break;
1341 default:
1342 VLOG(1) << "Unexpected callback id: " << cbid;
1343 break;
1344 }
1345 return Status::OK();
1346 }
1347
OnDriverApiExit(int device_id,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)1348 Status OnDriverApiExit(int device_id, CUpti_CallbackDomain domain,
1349 CUpti_CallbackId cbid,
1350 const CUpti_CallbackData *cbdata) override {
1351 auto *recorder = cuda_event_recorders_[device_id].get();
1352 if (*cbdata->correlationData == static_cast<size_t>(-1))
1353 return Status::OK();
1354 uint64 start_tsc = 0;
1355 switch (cbid) {
1356 case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel:
1357 case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel:
1358 start_tsc = recorder->StopKernel(*cbdata->correlationData);
1359 break;
1360 case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice: {
1361 #if CUDA_VERSION >= 10000
1362 auto *callback_context = reinterpret_cast<CuptiApiCallbackContext *>(
1363 *cbdata->correlationData);
1364 callback_contexts_.erase(callback_context);
1365 auto record_indices = std::move(callback_context->record_indices);
1366 delete callback_context;
1367 const auto *params =
1368 static_cast<const cuLaunchCooperativeKernelMultiDevice_params *>(
1369 cbdata->functionParams);
1370 if (record_indices.size() != params->numDevices)
1371 return errors::Internal("Invalid correlation data");
1372 for (int i = 0; i < params->numDevices; ++i) {
1373 CUstream stream = params->launchParamsList[i].hStream;
1374 ScopedCudaContext scoped_cuda_context(stream);
1375 auto dev_id = scoped_cuda_context.GetDeviceOrdinal();
1376 if (!dev_id) return errors::Internal("Invalid CUDA stream");
1377 start_tsc =
1378 cuda_event_recorders_[*dev_id]->StopKernel(record_indices[i]);
1379 }
1380 #endif
1381 } break;
1382 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy:
1383 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync:
1384 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2:
1385 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2:
1386 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2:
1387 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2:
1388 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2:
1389 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2:
1390 start_tsc = recorder->StopMemcpy(*cbdata->correlationData);
1391 break;
1392 default:
1393 VLOG(1) << "Unexpected callback id: " << cbid;
1394 // TODO: figure out how to get start timestamp in this case.
1395 return Status::OK();
1396 }
1397 // If we are not collecting CPU events from Callback API, we can return now.
1398 if (!option_.required_callback_api_events) {
1399 return Status::OK();
1400 }
1401
1402 // Grab timestamp for API exit. API entry timestamp saved in cbdata.
1403 uint64 end_tsc = CuptiTracer::GetTimestamp();
1404 return AddDriverApiCallbackEvent(collector_, cupti_interface_, device_id,
1405 start_tsc, end_tsc, domain, cbid, cbdata);
1406 }
SyncAndFlush()1407 Status SyncAndFlush() override {
1408 for (auto &recorder : cuda_event_recorders_) {
1409 TF_RETURN_IF_ERROR(recorder->Stop());
1410 }
1411 for (auto &recorder : cuda_event_recorders_) {
1412 TF_RETURN_IF_ERROR(recorder->Flush(collector_->annotation_map()));
1413 }
1414 return Status::OK();
1415 }
1416
1417 private:
1418 template <typename T>
StartMemcpy(CuptiTracerEventType type,const CUpti_CallbackData * cbdata,CudaEventRecorder * recorder)1419 static void StartMemcpy(CuptiTracerEventType type,
1420 const CUpti_CallbackData *cbdata,
1421 CudaEventRecorder *recorder) {
1422 const auto *params = static_cast<const T *>(cbdata->functionParams);
1423 *cbdata->correlationData =
1424 recorder->StartMemcpy(type, params->ByteCount, cbdata->context, nullptr,
1425 cbdata->correlationId, /*async*/ false);
1426 }
1427
1428 template <typename T>
StartMemcpyAsync(CuptiTracerEventType type,const CUpti_CallbackData * cbdata,CudaEventRecorder * recorder)1429 static void StartMemcpyAsync(CuptiTracerEventType type,
1430 const CUpti_CallbackData *cbdata,
1431 CudaEventRecorder *recorder) {
1432 const auto *params = static_cast<const T *>(cbdata->functionParams);
1433 *cbdata->correlationData = recorder->StartMemcpy(
1434 type, params->ByteCount, cbdata->context, params->hStream,
1435 cbdata->correlationId, /*async*/ true);
1436 }
1437
GetMemoryType(CUdeviceptr ptr)1438 static CUmemorytype GetMemoryType(CUdeviceptr ptr) {
1439 CuptiApiTracingDisabler disabler;
1440 CUmemorytype mem_type = CU_MEMORYTYPE_HOST;
1441 auto status =
1442 cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptr);
1443 if (status == CUDA_ERROR_INVALID_VALUE) {
1444 // Pointer not registered with CUDA, must be host memory.
1445 return CU_MEMORYTYPE_HOST;
1446 }
1447 LogIfError(ToStatus(status));
1448 return mem_type;
1449 }
1450
GetMemcpyType(CUdeviceptr src,CUdeviceptr dst)1451 static CuptiTracerEventType GetMemcpyType(CUdeviceptr src, CUdeviceptr dst) {
1452 CUmemorytype src_type = GetMemoryType(src);
1453 CUmemorytype dst_type = GetMemoryType(dst);
1454 // TODO: handle CU_MEMORYTYPE_ARRAY case
1455 if (src_type == CU_MEMORYTYPE_HOST && dst_type == CU_MEMORYTYPE_DEVICE) {
1456 return CuptiTracerEventType::MemcpyH2D;
1457 } else if (src_type == CU_MEMORYTYPE_DEVICE &&
1458 dst_type == CU_MEMORYTYPE_HOST) {
1459 return CuptiTracerEventType::MemcpyD2H;
1460 } else if (src_type == CU_MEMORYTYPE_DEVICE &&
1461 dst_type == CU_MEMORYTYPE_DEVICE) {
1462 return CuptiTracerEventType::MemcpyD2D;
1463 }
1464 return CuptiTracerEventType::MemcpyOther;
1465 }
1466
1467 // Each cuLaunchCooperativeKernelMultiDevice will need to add an entry in
1468 // each corresponding device, therefore we need to keep records of all
1469 // the record indices in each device's record array.
1470 // We allocate such data structure during API entry and free during API exit.
1471 // However there is no guarantee that we receive such callbacks in pairs, we
1472 // maintain a on-going API calls to make sure no memory leaks.
1473 struct CuptiApiCallbackContext {
CuptiApiCallbackContexttensorflow::profiler::__anon4531e42b0111::CuptiDriverApiHookWithCudaEvent::CuptiApiCallbackContext1474 explicit CuptiApiCallbackContext(std::vector<uint32> &&r)
1475 : record_indices(std::move(r)) {}
1476 std::vector<uint32> record_indices;
1477 };
1478
1479 const CuptiTracerOptions option_;
1480 CuptiInterface *cupti_interface_;
1481 CuptiTraceCollector *collector_;
1482 absl::node_hash_set<CuptiApiCallbackContext *> callback_contexts_;
1483 std::vector<std::unique_ptr<CudaEventRecorder>> cuda_event_recorders_;
1484 TF_DISALLOW_COPY_AND_ASSIGN(CuptiDriverApiHookWithCudaEvent);
1485 };
1486
ErrorWithHostname(absl::string_view error_message)1487 /*static*/ std::string ErrorWithHostname(absl::string_view error_message) {
1488 return absl::StrCat(port::Hostname(), ": ", error_message);
1489 }
1490
1491 } // namespace
1492
AddDriverApiCallbackEvent(CuptiTraceCollector * collector,CuptiInterface * cupti_interface,int device_id,uint64 start_tsc,uint64 end_tsc,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)1493 /*static*/ Status CuptiDriverApiHook::AddDriverApiCallbackEvent(
1494 CuptiTraceCollector *collector, CuptiInterface *cupti_interface,
1495 int device_id, uint64 start_tsc, uint64 end_tsc,
1496 CUpti_CallbackDomain domain, CUpti_CallbackId cbid,
1497 const CUpti_CallbackData *cbdata) {
1498 switch (cbid) {
1499 case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel:
1500 case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel:
1501 case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice:
1502 AddKernelEventUponApiExit(collector, device_id, cbdata, start_tsc,
1503 end_tsc);
1504 break;
1505 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy:
1506 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync:
1507 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2:
1508 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2:
1509 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2:
1510 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2:
1511 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2:
1512 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2:
1513 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2:
1514 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2:
1515 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2:
1516 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2:
1517 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2:
1518 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2:
1519 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2:
1520 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2:
1521 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2:
1522 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2:
1523 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2:
1524 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2:
1525 // This would be the place to populate the memcpy API activity's src and
1526 // dst memory kind by casting cbdata->functionParams. However, we are not
1527 // doing that because that will incur significant overhead to get the
1528 // memory aperture of each argument.
1529 AddNormalMemcpyEventUponApiExit(collector, device_id, cbid, cbdata,
1530 start_tsc, end_tsc);
1531 break;
1532 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer:
1533 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync:
1534 AddP2PMemcpyEventUponApiExit(collector, cupti_interface, device_id, cbid,
1535 cbdata, start_tsc, end_tsc);
1536 break;
1537 case CUPTI_DRIVER_TRACE_CBID_cuMemAlloc_v2:
1538 AddCuMemAllocEventUponApiExit(collector, device_id, cbid, cbdata,
1539 start_tsc, end_tsc);
1540 break;
1541 case CUPTI_DRIVER_TRACE_CBID_cuMemAllocPitch_v2:
1542 AddCuMemAllocPitchEventUponApiExit(collector, device_id, cbid, cbdata,
1543 start_tsc, end_tsc);
1544 break;
1545 case CUPTI_DRIVER_TRACE_CBID_cuMemFree_v2:
1546 AddCuMemFreeEventUponApiExit(collector, device_id, cbid, cbdata,
1547 start_tsc, end_tsc);
1548 break;
1549 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD8_v2:
1550 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD16_v2:
1551 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD32_v2:
1552 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8_v2:
1553 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16_v2:
1554 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32_v2:
1555 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD8Async:
1556 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD16Async:
1557 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD32Async:
1558 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8Async:
1559 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16Async:
1560 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32Async:
1561 AddCuMemsetEventUponApiExit(collector, device_id, cbid, cbdata, start_tsc,
1562 end_tsc);
1563 break;
1564 default:
1565 AddGenericEventUponApiExit(collector, device_id, cbid, cbdata, start_tsc,
1566 end_tsc);
1567 break;
1568 }
1569 return Status::OK();
1570 }
1571
GetTraceEventTypeName(const CuptiTracerEventType & type)1572 const char *GetTraceEventTypeName(const CuptiTracerEventType &type) {
1573 // Do not use a default so that this gives a build error when
1574 // CuptiTracerEventType is extended but this is not.
1575 switch (type) {
1576 case CuptiTracerEventType::MemcpyH2D:
1577 return "MemcpyH2D";
1578 case CuptiTracerEventType::MemcpyD2H:
1579 return "MemcpyD2H";
1580 case CuptiTracerEventType::MemcpyD2D:
1581 return "MemcpyD2D";
1582 case CuptiTracerEventType::MemcpyP2P:
1583 return "MemcpyP2P";
1584 case CuptiTracerEventType::MemcpyOther:
1585 return "MemcpyOther";
1586 case CuptiTracerEventType::Kernel:
1587 return "Compute";
1588 case CuptiTracerEventType::MemoryAlloc:
1589 return "MemoryAlloc";
1590 case CuptiTracerEventType::MemoryFree:
1591 return "MemoryFree";
1592 case CuptiTracerEventType::Memset:
1593 return "Memset";
1594 case CuptiTracerEventType::Overhead:
1595 return "Overhead";
1596 case CuptiTracerEventType::UnifiedMemory:
1597 return "UnifiedMemory";
1598 case CuptiTracerEventType::Generic:
1599 return "Generic";
1600 case CuptiTracerEventType::MemoryResidency:
1601 return "MemoryResidency";
1602 case CuptiTracerEventType::Unsupported:
1603 return "";
1604 }
1605 }
1606
CuptiTracer(CuptiInterface * cupti_interface)1607 CuptiTracer::CuptiTracer(CuptiInterface *cupti_interface)
1608 : num_gpus_(NumGpus()),
1609 cupti_interface_(cupti_interface),
1610 buffer_pool_(kBufferSizeInBytes) {}
1611
GetCuptiTracerSingleton()1612 /* static */ CuptiTracer *CuptiTracer::GetCuptiTracerSingleton() {
1613 static auto *singleton = new CuptiTracer(GetCuptiInterface());
1614 return singleton;
1615 }
1616
IsAvailable() const1617 bool CuptiTracer::IsAvailable() const {
1618 return NumGpus() && !activity_tracing_enabled_ && !api_tracing_enabled_;
1619 }
1620
NumGpus()1621 int CuptiTracer::NumGpus() {
1622 static int num_gpus = []() -> int {
1623 if (cuInit(0) != CUDA_SUCCESS) {
1624 return 0;
1625 }
1626 int gpu_count;
1627 if (cuDeviceGetCount(&gpu_count) != CUDA_SUCCESS) {
1628 return 0;
1629 }
1630 LOG(INFO) << "Profiler found " << gpu_count << " GPUs";
1631 return gpu_count;
1632 }();
1633 return num_gpus;
1634 }
1635
Enable(const CuptiTracerOptions & option,CuptiTraceCollector * collector)1636 void CuptiTracer::Enable(const CuptiTracerOptions &option,
1637 CuptiTraceCollector *collector) {
1638 option_ = option;
1639 collector_ = collector;
1640 if (option_->enable_event_based_activity) {
1641 option_->enable_activity_api = false;
1642 cupti_driver_api_hook_.reset(new CuptiDriverApiHookWithCudaEvent(
1643 option, cupti_interface_, collector));
1644 } else {
1645 cupti_driver_api_hook_.reset(new CuptiDriverApiHookWithActivityApi(
1646 option, cupti_interface_, collector));
1647 }
1648
1649 Status status = EnableApiTracing();
1650 need_root_access_ |= status.code() == error::PERMISSION_DENIED;
1651 if (!status.ok()) return;
1652
1653 if (option_->enable_activity_api) {
1654 EnableActivityTracing().IgnoreError();
1655 }
1656 tensorflow::profiler::AnnotationStack::Enable(true);
1657 }
1658
Disable()1659 void CuptiTracer::Disable() {
1660 DisableApiTracing().IgnoreError();
1661 if (option_->enable_activity_api) {
1662 DisableActivityTracing().IgnoreError();
1663 }
1664 cupti_interface_->CleanUp();
1665 Finalize().IgnoreError();
1666 cupti_driver_api_hook_->SyncAndFlush().IgnoreError();
1667 collector_->Flush();
1668 collector_ = nullptr;
1669 option_.reset();
1670 cupti_driver_api_hook_.reset();
1671 tensorflow::profiler::AnnotationStack::Enable(false);
1672 }
1673
EnableApiTracing()1674 Status CuptiTracer::EnableApiTracing() {
1675 if (api_tracing_enabled_) return Status::OK();
1676
1677 VLOG(1) << "Enable subscriber";
1678 // Subscribe can return CUPTI_ERROR_MAX_LIMIT_REACHED.
1679 // The application which calls CUPTI APIs cannot be used with Nvidia tools
1680 // like nvprof, Nvidia Visual Profiler, Nsight Compute, Nsight Systems.
1681 RETURN_IF_CUPTI_ERROR(cupti_interface_->Subscribe(
1682 &subscriber_, (CUpti_CallbackFunc)ApiCallback, this));
1683 api_tracing_enabled_ = true;
1684
1685 if (!option_->cbids_selected.empty()) {
1686 for (auto cbid : option_->cbids_selected) {
1687 RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableCallback(
1688 1 /* ENABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API, cbid));
1689 }
1690 } else { // select all callback ids.
1691 RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
1692 1 /* ENABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API));
1693 }
1694
1695 if (option_->enable_nvtx_tracking) {
1696 RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
1697 1 /* ENABLE */, subscriber_, CUPTI_CB_DOMAIN_NVTX));
1698 }
1699 return Status::OK();
1700 }
1701
DisableApiTracing()1702 Status CuptiTracer::DisableApiTracing() {
1703 if (!api_tracing_enabled_) return Status::OK();
1704
1705 api_tracing_enabled_ = false;
1706
1707 if (!option_->cbids_selected.empty()) {
1708 for (auto cbid : option_->cbids_selected) {
1709 RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableCallback(
1710 0 /* DISABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API, cbid));
1711 }
1712 } else {
1713 RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
1714 0 /* DISABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API));
1715 }
1716
1717 if (option_->enable_nvtx_tracking) {
1718 RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
1719 0 /* DISABLE */, subscriber_, CUPTI_CB_DOMAIN_NVTX));
1720 }
1721
1722 VLOG(1) << "Disable subscriber";
1723 RETURN_IF_CUPTI_ERROR(cupti_interface_->Unsubscribe(subscriber_));
1724 return Status::OK();
1725 }
1726
EnableActivityTracing()1727 Status CuptiTracer::EnableActivityTracing() {
1728 if (!option_->activities_selected.empty()) {
1729 // Initialize callback functions for Cupti Activity API.
1730 VLOG(1) << "Registering CUPTI activity callbacks";
1731 RETURN_IF_CUPTI_ERROR(cupti_interface_->ActivityRegisterCallbacks(
1732 RequestCuptiActivityBuffer, ProcessCuptiActivityBuffer));
1733
1734 VLOG(1) << "Enabling activity tracing for "
1735 << option_->activities_selected.size() << " activities";
1736 for (auto activity : option_->activities_selected) {
1737 VLOG(1) << "Enabling activity tracing for: " << activity;
1738 if (activity == CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER) {
1739 ConfigureActivityUnifiedMemoryCounter(true);
1740 }
1741 RETURN_IF_CUPTI_ERROR(cupti_interface_->ActivityEnable(activity));
1742 }
1743 }
1744 activity_tracing_enabled_ = true;
1745 return Status::OK();
1746 }
1747
DisableActivityTracing()1748 Status CuptiTracer::DisableActivityTracing() {
1749 if (activity_tracing_enabled_) {
1750 VLOG(1) << "Disabling activity tracing for "
1751 << option_->activities_selected.size() << " activities";
1752 for (auto activity : option_->activities_selected) {
1753 VLOG(1) << "Disabling activity tracing for: " << activity;
1754 if (activity == CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER) {
1755 ConfigureActivityUnifiedMemoryCounter(false);
1756 }
1757 RETURN_IF_CUPTI_ERROR(cupti_interface_->ActivityDisable(activity));
1758 }
1759 option_->activities_selected.clear();
1760
1761 VLOG(1) << "Flushing CUPTI activity buffer";
1762 RETURN_IF_CUPTI_ERROR(
1763 cupti_interface_->ActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED));
1764 LOG(INFO) << "CUPTI activity buffer flushed";
1765 }
1766 activity_tracing_enabled_ = false;
1767 return Status::OK();
1768 }
1769
Finalize()1770 Status CuptiTracer::Finalize() {
1771 if (option_->cupti_finalize) {
1772 VLOG(1) << "CuptiFinalize";
1773 RETURN_IF_CUPTI_ERROR(cupti_interface_->Finalize());
1774 }
1775 return Status::OK();
1776 }
1777
GetTimestamp()1778 /*static*/ uint64 CuptiTracer::GetTimestamp() {
1779 uint64_t tsc;
1780 CuptiInterface *cupti_interface = GetCuptiInterface();
1781 if (cupti_interface && cupti_interface->GetTimestamp(&tsc) == CUPTI_SUCCESS) {
1782 return tsc;
1783 }
1784 // Return 0 on error. If an activity timestamp is 0, the activity will be
1785 // dropped during time normalization.
1786 return 0;
1787 }
1788
HandleNVTXCallback(CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)1789 Status CuptiTracer::HandleNVTXCallback(CUpti_CallbackId cbid,
1790 const CUpti_CallbackData *cbdata) {
1791 const CUpti_NvtxData *pdata =
1792 reinterpret_cast<const CUpti_NvtxData *>(cbdata);
1793 if (cbid == CUPTI_CBID_NVTX_nvtxDomainRangePushEx) {
1794 const nvtxDomainRangePushEx_params *params =
1795 reinterpret_cast<const nvtxDomainRangePushEx_params *>(
1796 pdata->functionParams);
1797 // TODO(profiler): The messageType is actually NVTX_MESSAGE_TYPE_REGISTERED
1798 // (which is 3), However it seems to me that we can not get the registered
1799 // string from nvtxDomainRegisterStringA_params. If we reinterpret the
1800 // payload as ascii, it happen to work.
1801 NVTXRangeTracker::EnterRange(params->core.eventAttrib->message.ascii);
1802 } else if (cbid == CUPTI_CBID_NVTX_nvtxDomainRangePop) {
1803 NVTXRangeTracker::ExitRange();
1804 }
1805 return Status::OK();
1806 }
1807
HandleCallback(CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)1808 Status CuptiTracer::HandleCallback(CUpti_CallbackDomain domain,
1809 CUpti_CallbackId cbid,
1810 const CUpti_CallbackData *cbdata) {
1811 if (!api_tracing_enabled_) return Status::OK(); // already unsubscribed.
1812 if (!cupti_driver_api_hook_) return Status::OK(); // already unsubscribed.
1813 if (domain == CUPTI_CB_DOMAIN_NVTX) return HandleNVTXCallback(cbid, cbdata);
1814 if (domain != CUPTI_CB_DOMAIN_DRIVER_API) return Status::OK();
1815 if (internalCuCall) return Status::OK();
1816
1817 if (cbdata->context == nullptr) {
1818 // API callback is called before any CUDA context is created.
1819 // This is expected to be rare, and we ignore this case.
1820 VLOG(3) << "API callback received before creation of CUDA context\n";
1821 return errors::Internal("cutpi callback without context");
1822 }
1823
1824 // Grab a correct device ID.
1825 uint32 device_id = -1;
1826 RETURN_IF_CUPTI_ERROR(
1827 cupti_interface_->GetDeviceId(cbdata->context, &device_id));
1828 if (device_id >= num_gpus_) {
1829 return errors::Internal("Invalid device id:", device_id);
1830 }
1831
1832 if (cbdata->callbackSite == CUPTI_API_ENTER) {
1833 TF_RETURN_IF_ERROR(cupti_driver_api_hook_->OnDriverApiEnter(
1834 device_id, domain, cbid, cbdata));
1835 } else if (cbdata->callbackSite == CUPTI_API_EXIT) {
1836 // Set up the map from correlation id to annotation string.
1837 const auto &annotation = AnnotationStack::Get();
1838 if (!annotation.empty()) {
1839 if (cbid ==
1840 CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice) {
1841 // Kernels are launched on different devices by this API call, therefore
1842 // we need to populate per device annotation map respectively.
1843 for (int i = 0; i < num_gpus_; ++i) {
1844 collector_->annotation_map()->Add(i, cbdata->correlationId,
1845 annotation, "");
1846 }
1847 } else {
1848 absl::string_view nvtx_range = NVTXRangeTracker::CurrentRange();
1849 collector_->annotation_map()->Add(device_id, cbdata->correlationId,
1850 annotation, nvtx_range);
1851 }
1852 }
1853
1854 TF_RETURN_IF_ERROR(cupti_driver_api_hook_->OnDriverApiExit(
1855 device_id, domain, cbid, cbdata));
1856 }
1857 return Status::OK();
1858 }
1859
ConfigureActivityUnifiedMemoryCounter(bool enable)1860 void CuptiTracer::ConfigureActivityUnifiedMemoryCounter(bool enable) {
1861 CUpti_ActivityUnifiedMemoryCounterConfig config[2];
1862 // By experiments, currently only measurements from these two activities are
1863 // trustworthy. Others like GPU page fault may be problematic.
1864 config[0].kind =
1865 CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD;
1866 config[1].kind =
1867 CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH;
1868
1869 for (size_t i = 0; i < 2; i++) {
1870 config[i].enable = enable;
1871 }
1872
1873 CUptiResult res;
1874
1875 res = cupti_interface_->ActivityConfigureUnifiedMemoryCounter(config, 2);
1876 if (res == CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED) {
1877 LOG(ERROR) << "Unified memory is not supported on the "
1878 "underlying platform.\n";
1879 } else if (res == CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_DEVICE) {
1880 LOG(ERROR) << "Unified memory is not supported on the device.\n";
1881 } else if (res == CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_NON_P2P_DEVICES) {
1882 LOG(ERROR) << "Unified memory is not supported on the "
1883 "non-P2P multi-gpu setup.\n";
1884 } else if (res != CUPTI_SUCCESS) {
1885 const char *errstr = "";
1886 cuptiGetResultString(res, &errstr);
1887 LOG(ERROR) << "Error while enabling unified memory profiling: " << errstr;
1888 } else {
1889 VLOG(1) << "Configuring Unified memory profiling: " << res;
1890 }
1891 }
1892
RequestActivityBuffer(uint8_t ** buffer,size_t * size)1893 void CuptiTracer::RequestActivityBuffer(uint8_t **buffer, size_t *size) {
1894 *buffer = buffer_pool_.GetOrCreateBuffer();
1895 if (*buffer == nullptr) {
1896 LOG(WARNING)
1897 << "CUPTI Buffer not allocated, activity records will be dropped";
1898 *size = 0;
1899 return;
1900 }
1901 *size = buffer_pool_.GetBufferSizeInBytes();
1902 }
1903
ProcessActivityBuffer(CUcontext context,uint32_t stream_id,uint8_t * buffer,size_t size)1904 Status CuptiTracer::ProcessActivityBuffer(CUcontext context, uint32_t stream_id,
1905 uint8_t *buffer, size_t size) {
1906 auto buffer_cleanup =
1907 gtl::MakeCleanup([&]() { buffer_pool_.ReclaimBuffer(buffer); });
1908 if (size == 0) {
1909 return Status::OK();
1910 }
1911 if (!activity_tracing_enabled_) {
1912 LOG(WARNING) << "CUPTI activity buffer is reclaimed after flush.";
1913 return Status::OK();
1914 }
1915 if (cupti_interface_->Disabled()) return errors::Internal("Disabled.");
1916
1917 CUpti_Activity *record = nullptr;
1918 while (true) {
1919 CUptiResult status =
1920 cupti_interface_->ActivityGetNextRecord(buffer, size, &record);
1921 if (status == CUPTI_SUCCESS) {
1922 switch (record->kind) {
1923 case CUPTI_ACTIVITY_KIND_KERNEL: // sequential
1924 case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL:
1925 AddKernelActivityEvent(
1926 collector_, reinterpret_cast<CUpti_ActivityKernel4 *>(record));
1927 break;
1928 case CUPTI_ACTIVITY_KIND_MEMCPY:
1929 AddMemcpyActivityEvent(
1930 collector_, reinterpret_cast<CUpti_ActivityMemcpy *>(record));
1931 break;
1932 case CUPTI_ACTIVITY_KIND_MEMCPY2:
1933 AddMemcpy2ActivityEvent(
1934 collector_, reinterpret_cast<CUpti_ActivityMemcpy2 *>(record));
1935 break;
1936 case CUPTI_ACTIVITY_KIND_OVERHEAD:
1937 AddCuptiOverheadActivityEvent(
1938 collector_, reinterpret_cast<CUpti_ActivityOverhead *>(record));
1939 break;
1940 case CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER:
1941 AddUnifiedMemoryActivityEvent(
1942 collector_,
1943 reinterpret_cast<CUpti_ActivityUnifiedMemoryCounter2 *>(record));
1944 break;
1945 case CUPTI_ACTIVITY_KIND_MEMORY: {
1946 AddMemoryActivityEvent(
1947 collector_, reinterpret_cast<CUpti_ActivityMemory *>(record));
1948 } break;
1949 case CUPTI_ACTIVITY_KIND_MEMSET:
1950 AddMemsetActivityEvent(
1951 collector_, reinterpret_cast<CUpti_ActivityMemset *>(record));
1952 break;
1953 case CUPTI_ACTIVITY_KIND_SYNCHRONIZATION:
1954 AddSynchronizationActivityEvent(
1955 collector_,
1956 reinterpret_cast<CUpti_ActivitySynchronization *>(record));
1957 break;
1958 default:
1959 VLOG(3) << "Activity type " << record->kind << " is not supported.";
1960 break;
1961 }
1962 } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
1963 break;
1964 } else {
1965 return errors::Internal("Parse cupti activity buffer error.");
1966 }
1967 }
1968
1969 // Report dropped records.
1970 size_t dropped;
1971 RETURN_IF_CUPTI_ERROR(cupti_interface_->ActivityGetNumDroppedRecords(
1972 context, stream_id, &dropped));
1973 if (dropped != 0) {
1974 uint32 device_id = -1;
1975 RETURN_IF_CUPTI_ERROR(cupti_interface_->GetDeviceId(context, &device_id));
1976 collector_->OnEventsDropped("cupti activity buffer full", dropped);
1977 }
1978 return Status::OK();
1979 }
1980
ErrorIfAny()1981 /*static*/ std::string CuptiTracer::ErrorIfAny() {
1982 if (CuptiTracer::NumGpus() == 0) {
1983 return ErrorWithHostname("No GPU detected.");
1984 } else if (CuptiTracer::GetCuptiTracerSingleton()->NeedRootAccess()) {
1985 return ErrorWithHostname(
1986 "Insufficient privilege to run libcupti (you need root permission).");
1987 } else if (CuptiTracer::GetTimestamp() == 0) {
1988 return ErrorWithHostname(
1989 "Failed to load libcupti (is it installed and accessible?)");
1990 }
1991 return "";
1992 }
1993
1994 } // namespace profiler
1995 } // namespace tensorflow
1996