1 /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include "tensorflow/core/profiler/internal/gpu/cupti_tracer.h"
17
18 #include "absl/container/flat_hash_map.h"
19 #include "absl/container/flat_hash_set.h"
20 #include "absl/container/node_hash_map.h"
21 #include "absl/container/node_hash_set.h"
22 #include "third_party/gpus/cuda/extras/CUPTI/include/cupti_activity.h"
23 #include "third_party/gpus/cuda/extras/CUPTI/include/generated_nvtx_meta.h"
24 #include "tensorflow/core/platform/env.h"
25 #include "tensorflow/core/platform/errors.h"
26 #include "tensorflow/core/platform/host_info.h"
27 #include "tensorflow/core/platform/logging.h"
28 #include "tensorflow/core/platform/macros.h"
29 #include "tensorflow/core/platform/mem.h"
30 #include "tensorflow/core/profiler/internal/cpu/annotation_stack.h"
31 #include "tensorflow/core/profiler/internal/gpu/cupti_collector.h"
32 #include "tensorflow/core/profiler/internal/gpu/nvtx_utils.h"
33
34 namespace tensorflow {
35 namespace profiler {
36
37 namespace {
38
// Per-thread nesting counter: the number of CuptiApiTracingDisabler objects
// currently alive on this thread.
static thread_local int internalCuCall = 0;

// Scope guard that temporarily disables CUPTI API tracing for the current
// thread while it is alive. Intended to wrap API calls that are initiated by
// the profiler itself. Construction bumps the per-thread counter and
// destruction restores it, so guards may be nested.
class CuptiApiTracingDisabler {
 public:
  CuptiApiTracingDisabler() { ++internalCuCall; }
  ~CuptiApiTracingDisabler() { --internalCuCall; }
};
48
ToStatus(CUptiResult result)49 Status ToStatus(CUptiResult result) {
50 if (result == CUPTI_SUCCESS) {
51 return Status::OK();
52 }
53 const char *str = nullptr;
54 cuptiGetResultString(result, &str);
55 return errors::Unavailable("CUPTI error: ", str ? str : "<unknown>");
56 }
57
ToStatus(CUresult result)58 Status ToStatus(CUresult result) {
59 if (result == CUDA_SUCCESS) {
60 return Status::OK();
61 }
62 const char *str = nullptr;
63 cuGetErrorName(result, &str);
64 return errors::Unavailable("CUDA error: ", str ? str : "<unknown>");
65 }
66
LogIfError(const Status & status)67 inline void LogIfError(const Status &status) {
68 if (status.ok()) return;
69 LOG(ERROR) << status.error_message();
70 }
71
72 // Maps an OverheadKind enum to a const string.
getActivityOverheadKindString(CUpti_ActivityOverheadKind kind)73 const char *getActivityOverheadKindString(CUpti_ActivityOverheadKind kind) {
74 switch (kind) {
75 case CUPTI_ACTIVITY_OVERHEAD_DRIVER_COMPILER:
76 return "COMPILER";
77 case CUPTI_ACTIVITY_OVERHEAD_CUPTI_BUFFER_FLUSH:
78 return "BUFFER_FLUSH";
79 case CUPTI_ACTIVITY_OVERHEAD_CUPTI_INSTRUMENTATION:
80 return "INSTRUMENTATION";
81 case CUPTI_ACTIVITY_OVERHEAD_CUPTI_RESOURCE:
82 return "RESOURCE";
83 default:
84 break;
85 }
86 return "<UNKNOWN>";
87 }
88
getActivityUnifiedMemoryKindString(CUpti_ActivityUnifiedMemoryCounterKind kind)89 const char *getActivityUnifiedMemoryKindString(
90 CUpti_ActivityUnifiedMemoryCounterKind kind) {
91 switch (kind) {
92 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD:
93 return "UM_BYTES_TRANSFER_HTOD";
94 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH:
95 return "UM_BYTES_TRANSFER_DTOH";
96 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT:
97 return "UM_CPU_PAGE_FAULT";
98 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT:
99 return "UM_GPU_PAGE_FAULT";
100 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING:
101 return "UM_THRASHING";
102 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THROTTLING:
103 return "UM_THROTTLING";
104 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP:
105 return "UM_REMOTE_MAP";
106 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOD:
107 return "UM_BYTES_TRANSFER_DTOD";
108 default:
109 break;
110 }
111 return "<UNKNOWN>";
112 }
113
// CUPTI_ERROR_INSUFFICIENT_PRIVILEGES was introduced in CUDA 10.1. When
// building against an older toolkit (CUDA_VERSION <= 10000) the name is
// defined here with the value 35 so that code comparing against it (see
// RETURN_IF_CUPTI_ERROR) still compiles.
#if CUDA_VERSION <= 10000
#define CUPTI_ERROR_INSUFFICIENT_PRIVILEGES 35
#endif
118
// Evaluates `expr`, which must yield a CUptiResult, exactly once. If the
// result is not CUPTI_SUCCESS, logs the failing expression together with the
// error string obtained through `cupti_interface_` and returns from the
// enclosing function:
//   - errors::PermissionDenied for CUPTI_ERROR_INSUFFICIENT_PRIVILEGES,
//   - errors::Internal for every other failure.
// The expansion site must have a `cupti_interface_` in scope and must be
// inside a function returning Status.
#define RETURN_IF_CUPTI_ERROR(expr)                                          \
  do {                                                                       \
    CUptiResult status = (expr);                                             \
    if (ABSL_PREDICT_FALSE(status != CUPTI_SUCCESS)) {                       \
      const char *errstr = "";                                               \
      cupti_interface_->GetResultString(status, &errstr);                    \
      LOG(ERROR) << "function " << #expr << " failed with error " << errstr; \
      if (status == CUPTI_ERROR_INSUFFICIENT_PRIVILEGES) {                   \
        return errors::PermissionDenied("CUPTI need root access!");          \
      } else {                                                               \
        return errors::Internal("CUPTI call error: ", errstr);               \
      }                                                                      \
    }                                                                        \
  } while (false)
133
Bytes2D(const CUDA_MEMCPY2D * p)134 size_t Bytes2D(const CUDA_MEMCPY2D *p) { return p->Height * p->WidthInBytes; }
135
Bytes3D(const CUDA_MEMCPY3D * p)136 size_t Bytes3D(const CUDA_MEMCPY3D *p) {
137 return p->Depth * p->Height * p->WidthInBytes;
138 }
139
140 template <typename CudaMemcpy>
MemcpyKind(const CudaMemcpy * p)141 CuptiTracerEventType MemcpyKind(const CudaMemcpy *p) {
142 if (p->srcMemoryType == CU_MEMORYTYPE_HOST &&
143 p->dstMemoryType == CU_MEMORYTYPE_DEVICE) {
144 return CuptiTracerEventType::MemcpyH2D;
145 }
146 if (p->srcMemoryType == CU_MEMORYTYPE_DEVICE &&
147 p->dstMemoryType == CU_MEMORYTYPE_HOST) {
148 return CuptiTracerEventType::MemcpyD2H;
149 }
150 if (p->srcMemoryType == CU_MEMORYTYPE_DEVICE &&
151 p->dstMemoryType == CU_MEMORYTYPE_DEVICE) {
152 return CuptiTracerEventType::MemcpyD2D;
153 }
154 return CuptiTracerEventType::Unsupported;
155 }
156
157 std::tuple<size_t /*bytes*/, CuptiTracerEventType, bool /*async*/>
DecodeDriverMemcpy(CUpti_CallbackId cbid,const void * params)158 DecodeDriverMemcpy(CUpti_CallbackId cbid, const void *params) {
159 switch (cbid) {
160 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2: {
161 const auto *p = reinterpret_cast<const cuMemcpyHtoD_v2_params *>(params);
162 return {p->ByteCount, CuptiTracerEventType::MemcpyH2D, false};
163 }
164 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2: {
165 const auto *p =
166 reinterpret_cast<const cuMemcpyHtoDAsync_v2_params *>(params);
167 return {p->ByteCount, CuptiTracerEventType::MemcpyH2D, true};
168 }
169 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2: {
170 const auto *p = reinterpret_cast<const cuMemcpyDtoH_v2_params *>(params);
171 return {p->ByteCount, CuptiTracerEventType::MemcpyD2H, false};
172 }
173 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2: {
174 const auto *p =
175 reinterpret_cast<const cuMemcpyDtoHAsync_v2_params *>(params);
176 return {p->ByteCount, CuptiTracerEventType::MemcpyD2H, true};
177 }
178 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2: {
179 const auto *p = reinterpret_cast<const cuMemcpyDtoD_v2_params *>(params);
180 return {p->ByteCount, CuptiTracerEventType::MemcpyD2D, false};
181 }
182 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2: {
183 const auto *p =
184 reinterpret_cast<const cuMemcpyDtoDAsync_v2_params *>(params);
185 return {p->ByteCount, CuptiTracerEventType::MemcpyD2D, true};
186 }
187 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy: {
188 const auto *p = reinterpret_cast<const cuMemcpy_params *>(params);
189 return {p->ByteCount, CuptiTracerEventType::MemcpyOther, false};
190 }
191 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync: {
192 const auto *p = reinterpret_cast<const cuMemcpyAsync_params *>(params);
193 return {p->ByteCount, CuptiTracerEventType::MemcpyOther, true};
194 }
195 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2: {
196 const auto *p = reinterpret_cast<const cuMemcpy2D_v2_params *>(params);
197 return {Bytes2D(p->pCopy), MemcpyKind(p->pCopy), false};
198 }
199 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2: {
200 const auto *p =
201 reinterpret_cast<const cuMemcpy2DAsync_v2_params *>(params);
202 return {Bytes2D(p->pCopy), MemcpyKind(p->pCopy), true};
203 }
204 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2: {
205 const auto *p = reinterpret_cast<const cuMemcpy3D_v2_params *>(params);
206 return {Bytes3D(p->pCopy), MemcpyKind(p->pCopy), true};
207 }
208 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2: {
209 const auto *p =
210 reinterpret_cast<const cuMemcpy3DAsync_v2_params *>(params);
211 return {Bytes3D(p->pCopy), MemcpyKind(p->pCopy), true};
212 }
213 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer: {
214 const auto *p2p_params =
215 reinterpret_cast<const cuMemcpyPeer_params *>(params);
216 return {p2p_params->ByteCount, CuptiTracerEventType::MemcpyP2P, false};
217 }
218 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync: {
219 const auto *p2p_params =
220 reinterpret_cast<const cuMemcpyPeerAsync_params *>(params);
221 return {p2p_params->ByteCount, CuptiTracerEventType::MemcpyP2P, true};
222 }
223 default: {
224 LOG(ERROR) << "Unsupported memcpy activity observed: " << cbid;
225 return {0, CuptiTracerEventType::Unsupported, false};
226 }
227 }
228 }
229
230 std::tuple<size_t /*bytes*/, CuptiTracerEventType, bool /*async*/>
DecodeDriverMemset(CUpti_CallbackId cbid,const void * params)231 DecodeDriverMemset(CUpti_CallbackId cbid, const void *params) {
232 switch (cbid) {
233 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD8_v2: {
234 const auto *p = reinterpret_cast<const cuMemsetD8_v2_params *>(params);
235 return {p->N, CuptiTracerEventType::Memset, false};
236 }
237 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD16_v2: {
238 const auto *p = reinterpret_cast<const cuMemsetD16_v2_params *>(params);
239 return {p->N, CuptiTracerEventType::Memset, false};
240 }
241 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD32_v2: {
242 const auto *p = reinterpret_cast<const cuMemsetD32_v2_params *>(params);
243 return {p->N, CuptiTracerEventType::Memset, false};
244 }
245 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8_v2: {
246 const auto *p = reinterpret_cast<const cuMemsetD2D8_v2_params *>(params);
247 return {p->dstPitch * p->Height, CuptiTracerEventType::Memset, false};
248 }
249 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16_v2: {
250 const auto *p = reinterpret_cast<const cuMemsetD2D16_v2_params *>(params);
251 return {p->dstPitch * p->Height, CuptiTracerEventType::Memset, false};
252 }
253 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32_v2: {
254 const auto *p = reinterpret_cast<const cuMemsetD2D32_v2_params *>(params);
255 return {p->dstPitch * p->Height, CuptiTracerEventType::Memset, false};
256 }
257 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD8Async: {
258 const auto *p = reinterpret_cast<const cuMemsetD8Async_params *>(params);
259 return {p->N, CuptiTracerEventType::Memset, true};
260 }
261 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD16Async: {
262 const auto *p = reinterpret_cast<const cuMemsetD16Async_params *>(params);
263 return {p->N, CuptiTracerEventType::Memset, true};
264 }
265 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD32Async: {
266 const auto *p = reinterpret_cast<const cuMemsetD32Async_params *>(params);
267 return {p->N, CuptiTracerEventType::Memset, true};
268 }
269 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8Async: {
270 const auto *p =
271 reinterpret_cast<const cuMemsetD2D8Async_params *>(params);
272 return {p->dstPitch * p->Height, CuptiTracerEventType::Memset, true};
273 }
274 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16Async: {
275 const auto *p =
276 reinterpret_cast<const cuMemsetD2D16Async_params *>(params);
277 return {p->dstPitch * p->Height, CuptiTracerEventType::Memset, true};
278 }
279 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32Async: {
280 const auto *p =
281 reinterpret_cast<const cuMemsetD2D32Async_params *>(params);
282 return {p->dstPitch * p->Height, CuptiTracerEventType::Memset, true};
283 }
284 default: {
285 LOG(ERROR) << "Unsupported memset activity observed: " << cbid;
286 return {0, CuptiTracerEventType::Unsupported, false};
287 }
288 }
289 }
290
291 // Cupti callback corresponding to a driver or runtime API. This global function
292 // is invoked twice for each API: at entry and at exit. The cbdata
293 // parameter is guaranteed by Cupti to be thread-safe. Most invocations are
294 // dropped to the floor and entry/exit is tracked for the APIs we deem
295 // performance-relevant.
ApiCallback(void * user_data,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)296 void CUPTIAPI ApiCallback(void *user_data, CUpti_CallbackDomain domain,
297 CUpti_CallbackId cbid,
298 const CUpti_CallbackData *cbdata) {
299 CuptiTracer *tracer = reinterpret_cast<CuptiTracer *>(user_data);
300 tracer->HandleCallback(domain, cbid, cbdata).IgnoreError();
301 }
302
303 // Callback which is invoked when an empty buffer is requested by CUPTI.
304 // Allocates an empty aligned-memory buffer. The buffer is used by CUPTI as a
305 // ring buffer where device maintains activity profiles that have been
306 // collected.
AllocCuptiActivityBuffer(uint8_t ** buffer,size_t * size,size_t * maxNumRecords)307 void CUPTIAPI AllocCuptiActivityBuffer(uint8_t **buffer, size_t *size,
308 size_t *maxNumRecords) {
309 // Buffer size and alignment, 32K and 8 as in CUPTI samples.
310 constexpr size_t kBufferSize = 32 * 1024;
311 constexpr int kBufferAlignSize = 8;
312 *buffer = reinterpret_cast<uint8_t *>(
313 port::AlignedMalloc(kBufferSize, kBufferAlignSize));
314 if (*buffer == nullptr) {
315 LOG(WARNING)
316 << "Cupti Buffer not allocated, activity records will be dropped";
317 return;
318 }
319 *size = kBufferSize;
320 *maxNumRecords = 0; // Cupti to fill as many records as fit in the buffer.
321 VLOG(3) << "Allocated Cupti Buffer, buffer=" << std::hex
322 << reinterpret_cast<uintptr_t>(*buffer) << std::dec
323 << " size=" << *size;
324 }
325
326 // Callback which is invoked when a buffer containing activity records is
327 // available from CUPTI. Frees the buffer after reading activity records from
328 // it.
FreeCuptiActivityBuffer(CUcontext context,uint32_t stream_id,uint8_t * buffer,size_t size,size_t valid_size)329 void CUPTIAPI FreeCuptiActivityBuffer(CUcontext context, uint32_t stream_id,
330 uint8_t *buffer, size_t size,
331 size_t valid_size) {
332 VLOG(3) << "Freeing Cupti Buffer, buffer:" << std::hex
333 << reinterpret_cast<uintptr_t>(buffer) << std::dec
334 << " size: " << size << " valid_size: " << valid_size;
335
336 if (valid_size > 0) {
337 VLOG(3) << "Activity profile for stream " << stream_id;
338
339 CuptiTracer *cupti_tracer = CuptiTracer::GetCuptiTracerSingleton();
340 cupti_tracer->ProcessActivityBuffer(context, stream_id, buffer, valid_size)
341 .IgnoreError();
342 }
343 port::AlignedFree(buffer);
344 }
345
AddKernelEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)346 void AddKernelEventUponApiExit(CuptiTraceCollector *collector, uint32 device_id,
347 const CUpti_CallbackData *cbdata,
348 uint64 start_time, uint64 end_time) {
349 CuptiTracerEvent event{};
350 event.type = CuptiTracerEventType::Kernel;
351 event.source = CuptiTracerEventSource::DriverCallback;
352 event.name = cbdata->symbolName ? cbdata->symbolName : cbdata->functionName;
353 event.start_time_ns = start_time;
354 event.end_time_ns = end_time;
355 event.thread_id = Env::Default()->GetCurrentThreadId();
356 event.device_id = device_id;
357 event.context_id = cbdata->contextUid;
358 event.correlation_id = cbdata->correlationId;
359 VLOG(3) << "Cuda Kernel launch API exit. name=" << event.name;
360 collector->AddEvent(std::move(event));
361 }
362
363 // Performs the actual callback for both normal and P2P memcpy operations.
PopulateMemcpyCallbackEvent(CuptiTracerEventType type,const CUpti_CallbackData * cbdata,size_t num_bytes,uint32 src_device,uint32 dst_device,bool async,uint64 start_time,uint64 end_time)364 CuptiTracerEvent PopulateMemcpyCallbackEvent(
365 CuptiTracerEventType type, const CUpti_CallbackData *cbdata,
366 size_t num_bytes, uint32 src_device, uint32 dst_device, bool async,
367 uint64 start_time, uint64 end_time) {
368 CuptiTracerEvent event{};
369 event.type = type;
370 event.source = CuptiTracerEventSource::DriverCallback;
371 event.start_time_ns = start_time;
372 event.end_time_ns = end_time;
373 event.thread_id = Env::Default()->GetCurrentThreadId();
374 event.device_id = src_device;
375 event.context_id = cbdata->contextUid;
376 event.correlation_id = cbdata->correlationId;
377 event.memcpy_info.kind = CUPTI_ACTIVITY_MEMCPY_KIND_UNKNOWN;
378 event.memcpy_info.num_bytes = num_bytes;
379 event.memcpy_info.destination = dst_device;
380 event.memcpy_info.async = async;
381 return event;
382 }
383
AddNormalMemcpyEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)384 void AddNormalMemcpyEventUponApiExit(CuptiTraceCollector *collector,
385 uint32 device_id, CUpti_CallbackId cbid,
386 const CUpti_CallbackData *cbdata,
387 uint64 start_time, uint64 end_time) {
388 size_t num_bytes;
389 CuptiTracerEventType type;
390 bool async;
391 std::tie(num_bytes, type, async) =
392 DecodeDriverMemcpy(cbid, cbdata->functionParams);
393
394 VLOG(3) << "Cuda Memcpy API exit. sz=" << num_bytes;
395 CuptiTracerEvent event =
396 PopulateMemcpyCallbackEvent(type, cbdata, num_bytes, device_id, device_id,
397 async, start_time, end_time);
398 collector->AddEvent(std::move(event));
399 }
400
AddCuMemsetEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)401 void AddCuMemsetEventUponApiExit(CuptiTraceCollector *collector,
402 uint32 device_id, CUpti_CallbackId cbid,
403 const CUpti_CallbackData *cbdata,
404 uint64 start_time, uint64 end_time) {
405 // We are casting all variants of cuMemset to cuMemsetD8 for accessing the
406 // first member attribute, a CUdeviceptr.
407 const auto *params =
408 static_cast<const cuMemsetD8_v2_params *>(cbdata->functionParams);
409 size_t num_bytes;
410 bool async;
411 CuptiTracerEventType type;
412 std::tie(num_bytes, type, async) =
413 DecodeDriverMemset(cbid, cbdata->functionParams);
414
415 CuptiTracerEvent event{};
416 event.type = type;
417 event.source = CuptiTracerEventSource::DriverCallback;
418 event.start_time_ns = start_time;
419 event.end_time_ns = end_time;
420 event.thread_id = Env::Default()->GetCurrentThreadId();
421 event.device_id = device_id;
422 event.context_id = cbdata->contextUid;
423 event.correlation_id = cbdata->correlationId;
424 event.memset_info.num_bytes = num_bytes;
425 // memset_info.kind cannot be determined from API.
426 event.memset_info.async = async;
427 VLOG(3) << "Cuda Memset API exit."
428 << " dptr=" << reinterpret_cast<void *>(params->dstDevice)
429 << " sz=" << num_bytes;
430 collector->AddEvent(std::move(event));
431 }
432
AddP2PMemcpyEventUponApiExit(CuptiTraceCollector * collector,CuptiInterface * cupti_interface,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)433 void AddP2PMemcpyEventUponApiExit(CuptiTraceCollector *collector,
434 CuptiInterface *cupti_interface,
435 uint32 device_id, CUpti_CallbackId cbid,
436 const CUpti_CallbackData *cbdata,
437 uint64 start_time, uint64 end_time) {
438 size_t num_bytes;
439 CuptiTracerEventType type;
440 bool async;
441 std::tie(num_bytes, type, async) =
442 DecodeDriverMemcpy(cbid, cbdata->functionParams);
443
444 uint32 dst_device = -1, src_device = -1;
445 const auto *p2p_params =
446 static_cast<const cuMemcpyPeer_params *>(cbdata->functionParams);
447 cupti_interface->GetDeviceId(p2p_params->srcContext, &src_device);
448 cupti_interface->GetDeviceId(p2p_params->dstContext, &dst_device);
449 VLOG(3) << "Cuda P2P Memcpy API exit, src: " << src_device
450 << " dst: " << dst_device << " size:" << num_bytes;
451 CuptiTracerEvent event =
452 PopulateMemcpyCallbackEvent(type, cbdata, num_bytes, src_device,
453 dst_device, async, start_time, end_time);
454 collector->AddEvent(std::move(event));
455 }
456
AddCuMemAllocEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)457 void AddCuMemAllocEventUponApiExit(CuptiTraceCollector *collector,
458 uint32 device_id, CUpti_CallbackId cbid,
459 const CUpti_CallbackData *cbdata,
460 uint64 start_time, uint64 end_time) {
461 const auto *params =
462 static_cast<const cuMemAlloc_v2_params *>(cbdata->functionParams);
463 CuptiTracerEvent event{};
464 event.type = CuptiTracerEventType::MemoryAlloc;
465 event.source = CuptiTracerEventSource::DriverCallback;
466 event.name = cbdata->functionName;
467 event.start_time_ns = start_time;
468 event.end_time_ns = end_time;
469 event.thread_id = Env::Default()->GetCurrentThreadId();
470 event.device_id = device_id;
471 event.context_id = cbdata->contextUid;
472 event.correlation_id = cbdata->correlationId;
473 event.memalloc_info.num_bytes = params->bytesize;
474 VLOG(3) << "Cuda MemAlloc API exit."
475 << " dptr=" << reinterpret_cast<void *>(*params->dptr)
476 << " sz=" << params->bytesize;
477 collector->AddEvent(std::move(event));
478 }
479
AddCuMemAllocPitchEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)480 void AddCuMemAllocPitchEventUponApiExit(CuptiTraceCollector *collector,
481 uint32 device_id, CUpti_CallbackId cbid,
482 const CUpti_CallbackData *cbdata,
483 uint64 start_time, uint64 end_time) {
484 const auto *params =
485 static_cast<const cuMemAllocPitch_v2_params *>(cbdata->functionParams);
486 CuptiTracerEvent event{};
487 event.type = CuptiTracerEventType::MemoryAlloc;
488 event.source = CuptiTracerEventSource::DriverCallback;
489 event.name = cbdata->functionName;
490 event.start_time_ns = start_time;
491 event.end_time_ns = end_time;
492 event.thread_id = Env::Default()->GetCurrentThreadId();
493 event.device_id = device_id;
494 event.context_id = cbdata->contextUid;
495 event.correlation_id = cbdata->correlationId;
496 const size_t size_in_bytes = *params->pPitch * params->Height;
497 event.memalloc_info.num_bytes = size_in_bytes;
498 VLOG(3) << "Cuda MemAllocPitch API exit."
499 << " dptr=" << reinterpret_cast<void *>(*params->dptr)
500 << " sz=" << size_in_bytes;
501 collector->AddEvent(std::move(event));
502 }
503
AddCuMemFreeEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)504 void AddCuMemFreeEventUponApiExit(CuptiTraceCollector *collector,
505 uint32 device_id, CUpti_CallbackId cbid,
506 const CUpti_CallbackData *cbdata,
507 uint64 start_time, uint64 end_time) {
508 const auto *params =
509 static_cast<const cuMemFree_v2_params *>(cbdata->functionParams);
510 CuptiTracerEvent event{};
511 event.type = CuptiTracerEventType::MemoryFree;
512 event.source = CuptiTracerEventSource::DriverCallback;
513 event.name = cbdata->functionName;
514 event.start_time_ns = start_time;
515 event.end_time_ns = end_time;
516 event.thread_id = Env::Default()->GetCurrentThreadId();
517 event.device_id = device_id;
518 event.context_id = cbdata->contextUid;
519 event.correlation_id = cbdata->correlationId;
520 VLOG(3) << "Cuda MemFree API exit."
521 << " dptr=" << reinterpret_cast<void *>(params->dptr);
522 collector->AddEvent(std::move(event));
523 }
524
AddGenericEventUponApiExit(CuptiTraceCollector * collector,uint32 device_id,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,uint64 start_time,uint64 end_time)525 void AddGenericEventUponApiExit(CuptiTraceCollector *collector,
526 uint32 device_id, CUpti_CallbackId cbid,
527 const CUpti_CallbackData *cbdata,
528 uint64 start_time, uint64 end_time) {
529 CuptiTracerEvent event{};
530 event.type = CuptiTracerEventType::Generic;
531 event.source = CuptiTracerEventSource::DriverCallback;
532 event.name = cbdata->functionName;
533 event.start_time_ns = start_time;
534 event.end_time_ns = end_time;
535 event.thread_id = Env::Default()->GetCurrentThreadId();
536 event.device_id = device_id;
537 event.context_id = cbdata->contextUid;
538 event.correlation_id = cbdata->correlationId;
539 VLOG(3) << "Observed generic API exit."
540 << " name=" << cbdata->functionName;
541 collector->AddEvent(std::move(event));
542 }
543
AddKernelActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityKernel4 * kernel)544 void AddKernelActivityEvent(CuptiTraceCollector *collector,
545 const CUpti_ActivityKernel4 *kernel) {
546 CuptiTracerEvent event{};
547 event.type = CuptiTracerEventType::Kernel;
548 event.source = CuptiTracerEventSource::Activity;
549 event.name = kernel->name;
550 event.start_time_ns = kernel->start;
551 event.end_time_ns = kernel->end;
552 event.device_id = kernel->deviceId;
553 event.context_id = kernel->contextId;
554 event.stream_id = kernel->streamId;
555 event.correlation_id = kernel->correlationId;
556 AnnotationMap::AnnotationInfo info = collector->annotation_map()->LookUp(
557 event.device_id, event.correlation_id);
558 event.annotation = info.annotation;
559 event.nvtx_range = info.nvtx_range;
560 event.kernel_info.registers_per_thread = kernel->registersPerThread;
561 event.kernel_info.static_shared_memory_usage = kernel->staticSharedMemory;
562 event.kernel_info.dynamic_shared_memory_usage = kernel->dynamicSharedMemory;
563 event.kernel_info.block_x = kernel->blockX;
564 event.kernel_info.block_y = kernel->blockY;
565 event.kernel_info.block_z = kernel->blockZ;
566 event.kernel_info.grid_x = kernel->gridX;
567 event.kernel_info.grid_y = kernel->gridY;
568 event.kernel_info.grid_z = kernel->gridZ;
569 collector->AddEvent(std::move(event));
570 }
571
AddMemcpyActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityMemcpy * memcpy)572 void AddMemcpyActivityEvent(CuptiTraceCollector *collector,
573 const CUpti_ActivityMemcpy *memcpy) {
574 CuptiTracerEvent event{};
575 switch (memcpy->copyKind) {
576 case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
577 event.type = CuptiTracerEventType::MemcpyH2D;
578 event.name = "MemcpyH2D";
579 break;
580 case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
581 event.type = CuptiTracerEventType::MemcpyD2H;
582 event.name = "MemcpyD2H";
583 break;
584 case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
585 event.type = CuptiTracerEventType::MemcpyD2D;
586 event.name = "MemcpyD2D";
587 break;
588 case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP:
589 event.type = CuptiTracerEventType::MemcpyP2P;
590 event.name = "MemcpyP2P";
591 break;
592 default:
593 event.type = CuptiTracerEventType::MemcpyOther;
594 event.name = "MemcpyOther";
595 break;
596 }
597 event.source = CuptiTracerEventSource::Activity;
598 event.start_time_ns = memcpy->start;
599 event.end_time_ns = memcpy->end;
600 event.device_id = memcpy->deviceId;
601 event.context_id = memcpy->contextId;
602 event.stream_id = memcpy->streamId;
603 event.correlation_id = memcpy->correlationId;
604 AnnotationMap::AnnotationInfo info = collector->annotation_map()->LookUp(
605 event.device_id, event.correlation_id);
606 event.annotation = info.annotation;
607 event.memcpy_info.kind = memcpy->copyKind;
608 event.memcpy_info.num_bytes = memcpy->bytes;
609 event.memcpy_info.destination = memcpy->deviceId;
610 event.memcpy_info.async = memcpy->flags & CUPTI_ACTIVITY_FLAG_MEMCPY_ASYNC;
611 event.memcpy_info.src_mem_kind = memcpy->srcKind;
612 event.memcpy_info.dst_mem_kind = memcpy->dstKind;
613 collector->AddEvent(std::move(event));
614 }
615
616 // Invokes callback upon peer-2-peer memcpy between different GPU devices.
AddMemcpy2ActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityMemcpy2 * memcpy2)617 void AddMemcpy2ActivityEvent(CuptiTraceCollector *collector,
618 const CUpti_ActivityMemcpy2 *memcpy2) {
619 CuptiTracerEvent event{};
620 event.type = CuptiTracerEventType::MemcpyP2P;
621 event.name = "MemcpyP2P";
622 event.source = CuptiTracerEventSource::Activity;
623 event.start_time_ns = memcpy2->start;
624 event.end_time_ns = memcpy2->end;
625 event.device_id = memcpy2->srcDeviceId;
626 event.context_id = memcpy2->contextId;
627 event.stream_id = memcpy2->streamId;
628 event.correlation_id = memcpy2->correlationId;
629 AnnotationMap::AnnotationInfo info = collector->annotation_map()->LookUp(
630 event.device_id, event.correlation_id);
631 event.annotation = info.annotation;
632 event.memcpy_info.kind = CUPTI_ACTIVITY_MEMCPY_KIND_PTOP;
633 event.memcpy_info.num_bytes = memcpy2->bytes;
634 event.memcpy_info.destination = memcpy2->dstDeviceId;
635 event.memcpy_info.async = memcpy2->flags & CUPTI_ACTIVITY_FLAG_MEMCPY_ASYNC;
636 event.memcpy_info.src_mem_kind = memcpy2->srcKind;
637 event.memcpy_info.dst_mem_kind = memcpy2->dstKind;
638 collector->AddEvent(std::move(event));
639 }
640
// Converts a CUPTI overhead activity record into an Overhead event and hands
// it to `collector`. The event is named after the overhead kind and is
// attributed according to the kind of object the overhead relates to:
//   - thread / process: to the record's thread id (device stays 0);
//   - stream: to the record's stream id and device id;
//   - device / context: to the record's device id.
// Records with an unknown object kind are dropped silently; records with an
// unexpected object kind are dropped after logging an error.
void AddCuptiOverheadActivityEvent(CuptiTraceCollector *collector,
                                   const CUpti_ActivityOverhead *overhead) {
  CuptiTracerEvent event{};
  event.type = CuptiTracerEventType::Overhead;
  event.name = getActivityOverheadKindString(overhead->overheadKind);
  event.source = CuptiTracerEventSource::Activity;
  event.start_time_ns = overhead->start;
  event.end_time_ns = overhead->end;
  // If the overhead is not related to a device, we assign it to device 0.
  event.device_id = 0;
  // NOTE: no correlation id.
  switch (overhead->objectKind) {
    case CUPTI_ACTIVITY_OBJECT_UNKNOWN:
      // Such activities cannot be handled because the event must be
      // attributed to either a GPU stream or a CPU thread.
      return;

    case CUPTI_ACTIVITY_OBJECT_THREAD:
    case CUPTI_ACTIVITY_OBJECT_PROCESS:
      // Both kinds read the thread id from the process/thread view of the
      // object id.
      event.thread_id = overhead->objectId.pt.threadId;
      break;
    case CUPTI_ACTIVITY_OBJECT_STREAM:
      event.stream_id = overhead->objectId.dcs.streamId;
      // A stream also belongs to a device, so continue into the device case
      // to pick up the device id.
      TF_FALLTHROUGH_INTENDED;
    case CUPTI_ACTIVITY_OBJECT_DEVICE:
    case CUPTI_ACTIVITY_OBJECT_CONTEXT:
      event.device_id = overhead->objectId.dcs.deviceId;
      break;
    default:
      LOG(ERROR) << "Unexpected object kind: " << overhead->objectKind;
      return;
  }
  collector->AddEvent(std::move(event));
}
675
AddUnifiedMemoryActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityUnifiedMemoryCounter2 * record)676 void AddUnifiedMemoryActivityEvent(
677 CuptiTraceCollector *collector,
678 const CUpti_ActivityUnifiedMemoryCounter2 *record) {
679 VLOG(3) << "Cuda Unified Memory Activity, kind: " << record->counterKind
680 << " src: " << record->srcId << " dst: " << record->dstId;
681 CuptiTracerEvent event{};
682 event.type = CuptiTracerEventType::UnifiedMemory;
683 event.name = getActivityUnifiedMemoryKindString(record->counterKind);
684 event.source = CuptiTracerEventSource::Activity;
685 event.start_time_ns = record->start;
686 if (record->counterKind ==
687 CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT ||
688 record->counterKind ==
689 CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_THRASHING ||
690 record->counterKind ==
691 CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_REMOTE_MAP ||
692 record->end <= record->start) {
693 // If the end time is not valid, trim it so that it can be shown on the UI.
694 event.end_time_ns = record->start + 1;
695 } else {
696 event.end_time_ns = record->end;
697 }
698 event.device_id = record->srcId;
699 // NOTE: not context id and correlation id.
700
701 // For visualization purpose, we assign a pseudo stream id for each
702 // record->counterKind of unified memory related events.
703 constexpr int kPseudoStreamId = 0x10000000;
704 event.stream_id = kPseudoStreamId + record->counterKind;
705 event.memcpy_info.kind = CUPTI_ACTIVITY_MEMCPY_KIND_UNKNOWN;
706 // Check whether the activity is byte transfer.
707 if (record->counterKind ==
708 CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD ||
709 record->counterKind ==
710 CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH ||
711 record->counterKind ==
712 CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOD) {
713 event.memcpy_info.num_bytes = record->value;
714 } else {
715 event.memcpy_info.num_bytes = 0;
716 }
717 event.memcpy_info.destination = record->dstId;
718 event.memcpy_info.async = false;
719 collector->AddEvent(std::move(event));
720 }
721
AddMemoryActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityMemory * memory)722 void AddMemoryActivityEvent(CuptiTraceCollector *collector,
723 const CUpti_ActivityMemory *memory) {
724 CuptiTracerEvent event{};
725 event.name = absl::StrCat("Memory ", GetMemoryKindName(memory->memoryKind));
726 event.type = CuptiTracerEventType::MemoryResidency;
727 event.source = CuptiTracerEventSource::Activity;
728 event.start_time_ns = memory->start;
729 event.end_time_ns = std::max(memory->end, memory->start + 1);
730 event.device_id = memory->deviceId;
731 event.context_id = memory->contextId;
732 // Assign to default stream (0) so that event is included during Flush().
733 event.stream_id = 0;
734 event.memory_residency_info.num_bytes = memory->bytes;
735 event.memory_residency_info.kind = memory->memoryKind;
736 event.memory_residency_info.address = memory->address;
737 VLOG(5) << "Cuda activity " << event.name
738 << " addr: " << reinterpret_cast<void *>(memory->address)
739 << " bytes: " << memory->bytes;
740 collector->AddEvent(std::move(event));
741 }
742
AddMemsetActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivityMemset * memset)743 void AddMemsetActivityEvent(CuptiTraceCollector *collector,
744 const CUpti_ActivityMemset *memset) {
745 CuptiTracerEvent event{};
746 event.type = CuptiTracerEventType::Memset;
747 event.source = CuptiTracerEventSource::Activity;
748 event.name = absl::StrCat("Memset ", GetMemoryKindName(memset->memoryKind));
749 event.start_time_ns = memset->start;
750 event.end_time_ns = std::max(memset->end, memset->start + 1);
751 event.device_id = memset->deviceId;
752 event.correlation_id = memset->correlationId;
753 event.context_id = memset->contextId;
754 event.stream_id = memset->streamId;
755 event.memset_info.num_bytes = memset->bytes;
756 event.memset_info.kind = memset->memoryKind;
757 event.memset_info.async = (memset->flags & CUPTI_ACTIVITY_FLAG_MEMSET_ASYNC);
758 VLOG(5) << "Cuda activity " << event.name << " bytes: " << memset->bytes
759 << " async: " << event.memset_info.async;
760 collector->AddEvent(std::move(event));
761 }
762
AddSynchronizationActivityEvent(CuptiTraceCollector * collector,const CUpti_ActivitySynchronization * sync)763 void AddSynchronizationActivityEvent(
764 CuptiTraceCollector *collector, const CUpti_ActivitySynchronization *sync) {
765 CuptiTracerEvent event{};
766 event.type = CuptiTracerEventType::Generic;
767 event.source = CuptiTracerEventSource::Activity;
768 switch (sync->type) {
769 case CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_EVENT_SYNCHRONIZE:
770 event.name = "cuEventSynchronize";
771 break;
772 case CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_WAIT_EVENT:
773 event.name = "cuStreamWaitEvent";
774 break;
775 case CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_STREAM_SYNCHRONIZE:
776 event.name = "cuStreamSynchronize";
777 break;
778 case CUPTI_ACTIVITY_SYNCHRONIZATION_TYPE_CONTEXT_SYNCHRONIZE:
779 event.name = "cuCtxSynchronize";
780 break;
781 default:
782 event.name = "unknown synchronization event";
783 break;
784 }
785 event.start_time_ns = sync->start;
786 event.end_time_ns = std::max(sync->end, sync->start + 1);
787 event.correlation_id = sync->correlationId;
788 event.context_id = sync->contextId;
789 VLOG(5) << "Cuda activity " << event.name;
790 collector->AddEvent(std::move(event));
791 }
792
793 // This hook uses cupti activity api to measure device side activities.
794 class CuptiDriverApiHookWithActivityApi : public CuptiDriverApiHook {
795 public:
CuptiDriverApiHookWithActivityApi(const CuptiTracerOptions & option,CuptiInterface * cupti_interface,CuptiTraceCollector * collector)796 CuptiDriverApiHookWithActivityApi(const CuptiTracerOptions &option,
797 CuptiInterface *cupti_interface,
798 CuptiTraceCollector *collector)
799 : option_(option),
800 cupti_interface_(cupti_interface),
801 collector_(collector) {}
802
OnDriverApiEnter(int device_id,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)803 Status OnDriverApiEnter(int device_id, CUpti_CallbackDomain domain,
804 CUpti_CallbackId cbid,
805 const CUpti_CallbackData *cbdata) override {
806 // Stash away the current Cupti timestamp into cbdata.
807 *cbdata->correlationData =
808 option_.required_callback_api_events ? CuptiTracer::GetTimestamp() : 0;
809 return Status::OK();
810 }
OnDriverApiExit(int device_id,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)811 Status OnDriverApiExit(int device_id, CUpti_CallbackDomain domain,
812 CUpti_CallbackId cbid,
813 const CUpti_CallbackData *cbdata) override {
814 // If we are not collecting CPU events from Callback API, we can return now.
815 if (!option_.required_callback_api_events) {
816 return Status::OK();
817 }
818
819 // Grab timestamp for API exit. API entry timestamp saved in cbdata.
820 uint64 end_tsc = CuptiTracer::GetTimestamp();
821 uint64 start_tsc = *cbdata->correlationData;
822 TrackContext(cbid, cbdata->context);
823 return AddDriverApiCallbackEvent(collector_, cupti_interface_, device_id,
824 start_tsc, end_tsc, domain, cbid, cbdata);
825 }
SyncAndFlush()826 Status SyncAndFlush() override {
827 if (option_.sync_devices_before_stop) {
828 CuptiApiTracingDisabler disabler;
829 absl::MutexLock lock(&mutex_);
830 for (auto &ctx : contexts_) {
831 cuCtxPushCurrent(ctx);
832 cuCtxSynchronize(); // Ignore error here for best effort.
833 CUcontext current;
834 cuCtxPopCurrent(¤t);
835 }
836 }
837 return Status::OK();
838 }
839
840 private:
TrackContext(CUpti_CallbackId cbid,CUcontext ctx)841 void TrackContext(CUpti_CallbackId cbid, CUcontext ctx) {
842 if (!option_.sync_devices_before_stop) return;
843 if (ctx == NULL) return;
844 absl::MutexLock lock(&mutex_);
845 if (cbid == CUPTI_DRIVER_TRACE_CBID_cuCtxDestroy_v2 ||
846 cbid == CUPTI_DRIVER_TRACE_CBID_cuCtxDestroy) {
847 contexts_.erase(ctx);
848 } else {
849 contexts_.emplace(ctx);
850 }
851 }
852
853 const CuptiTracerOptions option_;
854 CuptiInterface *cupti_interface_;
855 CuptiTraceCollector *collector_;
856 absl::Mutex mutex_;
857 absl::flat_hash_set<CUcontext> contexts_ TF_GUARDED_BY(mutex_);
858
859 TF_DISALLOW_COPY_AND_ASSIGN(CuptiDriverApiHookWithActivityApi);
860 };
861
// Bookkeeping for one kernel launch recorded by CudaEventRecorder.
// CudaEventRecorder::StartKernel() aggregate-initializes the first four fields
// positionally, so the member order must not change.
struct KernelRecord {
  // Kernel name as passed to StartKernel().
  const char *kernel_name;
  // TODO(csigg): cuStreamGetCtx introduced in CUDA 9.2 would allow us to only
  // record the stream and infer the context during collection.
  CUcontext context;
  // Stream the launch was issued on; both events are recorded on it.
  CUstream stream;
  // Correlation id as passed to StartKernel().
  uint32 correlation_id;
  // Event created and recorded by StartKernel().
  CUevent start_event;
  // Event created and recorded by StopKernel().
  CUevent stop_event;
  // Launch dimensions and shared-memory size copied from the launch params.
  KernelDetails details;
  // CuptiTracer::GetTimestamp() taken in StartKernel().
  uint64 start_timestamp;
};
874
// Bookkeeping for one memcpy call recorded by CudaEventRecorder.
// CudaEventRecorder::StartMemcpy() aggregate-initializes the first six fields
// positionally, so the member order must not change.
struct MemcpyRecord {
  // Tracer event type (copy direction) as passed to StartMemcpy().
  CuptiTracerEventType type;
  // Number of bytes as passed to StartMemcpy().
  size_t size_bytes;
  // Context as passed to StartMemcpy().
  CUcontext context;
  // Stream both events are recorded on.
  CUstream stream;
  // Correlation id as passed to StartMemcpy().
  uint32 correlation_id;
  // Whether the caller reported the copy as asynchronous.
  bool async;
  // Event created and recorded by StartMemcpy().
  CUevent start_event;
  // Event created and recorded by StopMemcpy().
  CUevent stop_event;
  // CuptiTracer::GetTimestamp() taken in StartMemcpy().
  uint64 start_timestamp;
};
886
CreateAndRecordEvent(CUevent * event,CUstream stream)887 Status CreateAndRecordEvent(CUevent *event, CUstream stream) {
888 CuptiApiTracingDisabler disabler;
889 TF_RETURN_IF_ERROR(ToStatus(cuEventCreate(event, CU_EVENT_DEFAULT)));
890 return ToStatus(cuEventRecord(*event, stream));
891 }
892
#if CUDA_VERSION >= 10000
// Maintain and restore current thread's CUDA context.
// Note: cuStreamGetCtx only available after CUDA 9.2.
//
// The constructor looks up the context that owns `stream` and pushes it; the
// destructor pops it again, but only if the push succeeded. The object is
// non-copyable because a copy would pop the context a second time.
class ScopedCudaContext {
 public:
  explicit ScopedCudaContext(CUstream stream) : stream_(stream) {
    CuptiApiTracingDisabler disabler;  // don't trace cuda call in this func.
    CUcontext context;
    if (cuStreamGetCtx(stream, &context) != CUDA_SUCCESS) return;
    context_ = context;
    uint32 device_ordinal;
    if (cuptiGetDeviceId(context, &device_ordinal) != CUPTI_SUCCESS) return;
    device_ordinal_ = device_ordinal;
    context_pushed_ = cuCtxPushCurrent(context) == CUDA_SUCCESS;
  }
  ~ScopedCudaContext() {
    if (!context_pushed_) return;
    CuptiApiTracingDisabler disabler;  // don't trace cuda call in this func.
    CUcontext popped = nullptr;
    cuCtxPopCurrent(&popped);
  }

  ScopedCudaContext(const ScopedCudaContext &) = delete;
  ScopedCudaContext &operator=(const ScopedCudaContext &) = delete;

  // If successful, return the device ordinal of the relevant cuda stream.
  // Otherwise absl::nullopt;
  absl::optional<uint32> GetDeviceOrdinal() const { return device_ordinal_; }

  // If successful, return the cuda context of the relevant cuda stream.
  // Otherwise absl::nullopt;
  absl::optional<CUcontext> GetContext() const { return context_; }

 private:
  CUstream stream_;
  // Set once cuStreamGetCtx() succeeds.
  absl::optional<CUcontext> context_;
  // Set once cuptiGetDeviceId() succeeds.
  absl::optional<uint32> device_ordinal_;
  // True only if the constructor's cuCtxPushCurrent() succeeded.
  bool context_pushed_ = false;
};
#endif
929
930 // Stores a series of kernel and memcpy records.
931 class CudaEventRecorder {
932 public:
CudaEventRecorder(CuptiInterface * cupti_interface,CuptiTraceCollector * collector,int ordinal)933 CudaEventRecorder(CuptiInterface *cupti_interface,
934 CuptiTraceCollector *collector, int ordinal)
935 : cupti_interface_(cupti_interface),
936 collector_(collector),
937 ordinal_(ordinal) {
938 device_name_ = absl::StrCat("gpu ", ordinal); // default.
939 CUdevice device;
940 if (cuDeviceGet(&device, ordinal) == CUDA_SUCCESS) {
941 char name[100];
942 if (cuDeviceGetName(name, sizeof(name), device) == CUDA_SUCCESS) {
943 device_name_ = name;
944 }
945 }
946 }
947
948 // Registers the start of a kernel launch. The returned index should be passed
949 // to StopKernel() after the kernel launch has completed.
950 template <typename T>
StartKernel(const char * kernel_name,CUcontext context,uint32 correlation_id,const T * params)951 size_t StartKernel(const char *kernel_name, CUcontext context,
952 uint32 correlation_id, const T *params) {
953 CUstream stream = params->hStream;
954 KernelRecord record = {kernel_name, context, stream, correlation_id};
955 record.details.registers_per_thread = 0; // unknown.
956 record.details.static_shared_memory_usage = params->sharedMemBytes;
957 record.details.dynamic_shared_memory_usage = 0; // unknown
958 record.details.block_x = params->blockDimX;
959 record.details.block_y = params->blockDimY;
960 record.details.block_z = params->blockDimZ;
961 record.details.grid_x = params->gridDimX;
962 record.details.grid_y = params->gridDimY;
963 record.details.grid_z = params->gridDimZ;
964 record.start_timestamp = CuptiTracer::GetTimestamp();
965 LogIfError(CreateAndRecordEvent(&record.start_event, stream));
966 absl::MutexLock lock(&mutex_);
967 if (stopped_) return -1;
968 kernel_records_.push_back(record);
969 return kernel_records_.size() - 1;
970 }
StopKernel(size_t index)971 uint64 StopKernel(size_t index) {
972 absl::MutexLock lock(&mutex_);
973 if (index >= kernel_records_.size()) return 0;
974 auto &record = kernel_records_[index];
975 LogIfError(CreateAndRecordEvent(&record.stop_event, record.stream));
976 return record.start_timestamp;
977 }
978
979 // Registers the start of a copy operation. The returned index should be
980 // passed to StopMemcpy() after the memcpy has completed.
StartMemcpy(CuptiTracerEventType type,size_t size_bytes,CUcontext context,CUstream stream,uint32 correlation_id,bool async)981 size_t StartMemcpy(CuptiTracerEventType type, size_t size_bytes,
982 CUcontext context, CUstream stream, uint32 correlation_id,
983 bool async) {
984 MemcpyRecord record = {type, size_bytes, context,
985 stream, correlation_id, async};
986 record.start_timestamp = CuptiTracer::GetTimestamp();
987 LogIfError(CreateAndRecordEvent(&record.start_event, stream));
988 absl::MutexLock lock(&mutex_);
989 if (stopped_) return -1;
990 memcpy_records_.push_back(record);
991 return memcpy_records_.size() - 1;
992 }
StopMemcpy(size_t index)993 uint64 StopMemcpy(size_t index) {
994 absl::MutexLock lock(&mutex_);
995 if (index >= memcpy_records_.size()) return 0;
996 auto &record = memcpy_records_[index];
997 LogIfError(CreateAndRecordEvent(&record.stop_event, record.stream));
998 return record.start_timestamp;
999 }
1000
Stop()1001 Status Stop() {
1002 {
1003 absl::MutexLock lock(&mutex_);
1004 stopped_ = true;
1005 LOG(INFO) << "Collecting " << kernel_records_.size()
1006 << " kernel records, " << memcpy_records_.size()
1007 << " memcpy records.";
1008
1009 // Gather all profiled streams and contexts.
1010 for (const auto &record : kernel_records_) {
1011 TF_RETURN_IF_ERROR(
1012 AddStreamInfo(record.context, record.stream, "Kernel"));
1013 }
1014 for (const auto &record : memcpy_records_) {
1015 TF_RETURN_IF_ERROR(AddStreamInfo(record.context, record.stream,
1016 GetTraceEventTypeName(record.type)));
1017 }
1018 }
1019
1020 // Synchronize all contexts, record end events, synchronize again.
1021 // This scheme is an unreliable measure to associate a event with the wall
1022 // time. There are chances that other threads might enque kernels which
1023 // delay the second synchronization.
1024 TF_RETURN_IF_ERROR(Synchronize());
1025 for (auto &pair : context_infos_) {
1026 TF_RETURN_IF_ERROR(ToStatus(cuCtxSetCurrent(pair.first)));
1027 TF_RETURN_IF_ERROR(CreateAndRecordEvent(&pair.second.end_event, nullptr));
1028 }
1029
1030 TF_RETURN_IF_ERROR(Synchronize());
1031 end_walltime_us_ = Env::Default()->NowMicros();
1032 return Status::OK();
1033 }
1034
Flush(AnnotationMap * annotation_map)1035 Status Flush(AnnotationMap *annotation_map) {
1036 auto kernel_records = ConsumeKernelRecords();
1037 auto memcpy_records = ConsumeMemcpyRecords();
1038 for (const auto &record : kernel_records) {
1039 TF_RETURN_IF_ERROR(SaveRecord(record, annotation_map));
1040 }
1041 for (const auto &record : memcpy_records) {
1042 TF_RETURN_IF_ERROR(SaveRecord(record, annotation_map));
1043 }
1044 return Status::OK();
1045 }
1046
ConsumeKernelRecords()1047 std::vector<KernelRecord> ConsumeKernelRecords() {
1048 absl::MutexLock lock(&mutex_);
1049 return std::move(kernel_records_);
1050 }
ConsumeMemcpyRecords()1051 std::vector<MemcpyRecord> ConsumeMemcpyRecords() {
1052 absl::MutexLock lock(&mutex_);
1053 return std::move(memcpy_records_);
1054 }
1055
1056 private:
1057 struct ContextInfo {
1058 uint32 context_id = 0;
1059 int num_streams = 0;
1060 CUevent end_event;
1061 };
1062
1063 struct StreamInfo {
1064 uint32 stream_id = 0;
1065 std::string name;
1066 int index; // 0 is reserved for null stream.
1067 const ContextInfo *ctx_info;
1068 };
1069
1070 // Synchronizes all contexts.
Synchronize() const1071 Status Synchronize() const {
1072 CuptiApiTracingDisabler disabler;
1073 for (const auto &pair : context_infos_) {
1074 TF_RETURN_IF_ERROR(ToStatus(cuCtxSetCurrent(pair.first)));
1075 TF_RETURN_IF_ERROR(ToStatus(cuCtxSynchronize()));
1076 }
1077 return Status::OK();
1078 }
1079
1080 // Returns element from context_infos_, adding it if not yet present.
GetContextInfo(CUcontext context,ContextInfo ** ctx_info_ptr)1081 Status GetContextInfo(CUcontext context, ContextInfo **ctx_info_ptr) {
1082 auto it = context_infos_.find(context);
1083
1084 if (it == context_infos_.end()) {
1085 uint32 context_id = 0;
1086 RETURN_IF_CUPTI_ERROR(
1087 cupti_interface_->GetContextId(context, &context_id));
1088 ContextInfo ctx_info = {context_id};
1089 it = context_infos_.emplace(context, ctx_info).first;
1090 }
1091
1092 *ctx_info_ptr = &it->second;
1093 return Status::OK();
1094 }
1095
1096 // Adds element to stream_infos_ if not yet present. If present, clear name
1097 // if it doesn't match parameter.
AddStreamInfo(CUcontext context,CUstream stream,absl::string_view name)1098 Status AddStreamInfo(CUcontext context, CUstream stream,
1099 absl::string_view name) {
1100 StreamKey key(context, stream);
1101 auto it = stream_infos_.find(key);
1102 if (it != stream_infos_.end()) {
1103 if (it->second.name != name) {
1104 it->second.name.clear(); // Stream with inconsistent names, clear it.
1105 }
1106 return Status::OK();
1107 }
1108
1109 ContextInfo *ctx_info;
1110 TF_RETURN_IF_ERROR(GetContextInfo(context, &ctx_info));
1111 int index = stream ? ++ctx_info->num_streams : 0;
1112 uint32 stream_id = 0;
1113 #if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
1114 RETURN_IF_CUPTI_ERROR(
1115 cupti_interface_->GetStreamIdEx(context, stream, 1, &stream_id));
1116 #else
1117 RETURN_IF_CUPTI_ERROR(
1118 cupti_interface_->GetStreamIdEx(context, stream, 0, &stream_id));
1119 #endif
1120
1121 StreamInfo stream_info = {stream_id, static_cast<std::string>(name), index,
1122 ctx_info};
1123 stream_infos_.emplace(key, stream_info);
1124 return Status::OK();
1125 }
1126
1127 // Returns time in microseconds between events recorded on the GPU.
GetElapsedTimeUs(CUevent start,CUevent stop)1128 static uint64_t GetElapsedTimeUs(CUevent start, CUevent stop) {
1129 CuptiApiTracingDisabler disabler;
1130 float elapsed_ms = 0.0f;
1131 LogIfError(ToStatus(cuEventElapsedTime(&elapsed_ms, start, stop)));
1132 return static_cast<uint64>(
1133 std::llroundf(1000 * std::max(elapsed_ms, 0.0f)));
1134 }
1135
SaveRecord(const KernelRecord & record,AnnotationMap * annotation_map) const1136 Status SaveRecord(const KernelRecord &record,
1137 AnnotationMap *annotation_map) const {
1138 if (!record.start_event || !record.stop_event) {
1139 return Status::OK();
1140 }
1141 const auto &stream_info =
1142 stream_infos_.at(StreamKey(record.context, record.stream));
1143 auto start_us =
1144 GetElapsedTimeUs(record.start_event, stream_info.ctx_info->end_event);
1145 auto elapsed_us = GetElapsedTimeUs(record.start_event, record.stop_event);
1146
1147 std::string annotation;
1148
1149 CuptiTracerEvent event{};
1150 event.type = CuptiTracerEventType::Kernel;
1151 event.source = CuptiTracerEventSource::Activity; // on gpu device.
1152 event.name = record.kernel_name;
1153 event.start_time_ns = (end_walltime_us_ - start_us) * 1000;
1154 event.end_time_ns = event.start_time_ns + elapsed_us * 1000;
1155 event.device_id = ordinal_;
1156 event.context_id = stream_info.ctx_info->context_id;
1157 event.stream_id = stream_info.stream_id;
1158 event.correlation_id = record.correlation_id;
1159 AnnotationMap::AnnotationInfo info = collector_->annotation_map()->LookUp(
1160 event.device_id, event.correlation_id);
1161 event.annotation = info.annotation;
1162 event.kernel_info = record.details;
1163 collector_->AddEvent(std::move(event));
1164 return Status::OK();
1165 }
1166
SaveRecord(const MemcpyRecord & record,AnnotationMap * annotation_map) const1167 Status SaveRecord(const MemcpyRecord &record,
1168 AnnotationMap *annotation_map) const {
1169 if (!record.start_event || !record.stop_event) {
1170 return Status::OK();
1171 }
1172 const auto &stream_info =
1173 stream_infos_.at(StreamKey(record.context, record.stream));
1174 auto start_us =
1175 GetElapsedTimeUs(record.start_event, stream_info.ctx_info->end_event);
1176 auto elapsed_us = GetElapsedTimeUs(record.start_event, record.stop_event);
1177
1178 CuptiTracerEvent event{};
1179 event.type = record.type;
1180 event.name = GetTraceEventTypeName(event.type);
1181 event.source = CuptiTracerEventSource::Activity;
1182 event.start_time_ns = (end_walltime_us_ - start_us) * 1000;
1183 event.end_time_ns = event.start_time_ns + elapsed_us * 1000;
1184 event.device_id = ordinal_;
1185 event.context_id = stream_info.ctx_info->context_id;
1186 event.stream_id = stream_info.stream_id;
1187 event.correlation_id = record.correlation_id;
1188 AnnotationMap::AnnotationInfo info = collector_->annotation_map()->LookUp(
1189 event.device_id, event.correlation_id);
1190 event.annotation = info.annotation;
1191 event.memcpy_info.num_bytes = record.size_bytes;
1192 // TODO: support MemcpyD2D where destination != source;
1193 event.memcpy_info.destination = ordinal_;
1194 event.memcpy_info.async = record.async;
1195 // TODO: set src_mem_kind and dst_mem_kind.
1196 collector_->AddEvent(std::move(event));
1197 return Status::OK();
1198 }
1199
1200 absl::Mutex mutex_;
1201 bool stopped_ TF_GUARDED_BY(mutex_) = false;
1202 std::vector<KernelRecord> kernel_records_ TF_GUARDED_BY(mutex_);
1203 std::vector<MemcpyRecord> memcpy_records_ TF_GUARDED_BY(mutex_);
1204
1205 CuptiInterface *cupti_interface_;
1206 CuptiTraceCollector *collector_;
1207 const int ordinal_;
1208 std::string device_name_;
1209 uint64 end_walltime_us_;
1210 // Include context in key to distinguish null streams.
1211 using StreamKey = std::pair<CUcontext, CUstream>;
1212
1213 absl::node_hash_map<CUcontext, ContextInfo> context_infos_;
1214 absl::flat_hash_map<StreamKey, StreamInfo> stream_infos_;
1215 };
1216
1217 // This hook uses cuda events to measure device side activities.
1218 class CuptiDriverApiHookWithCudaEvent : public CuptiDriverApiHook {
1219 public:
CuptiDriverApiHookWithCudaEvent(const CuptiTracerOptions & option,CuptiInterface * cupti_interface,CuptiTraceCollector * collector)1220 CuptiDriverApiHookWithCudaEvent(const CuptiTracerOptions &option,
1221 CuptiInterface *cupti_interface,
1222 CuptiTraceCollector *collector)
1223 : option_(option),
1224 cupti_interface_(cupti_interface),
1225 collector_(collector) {
1226 int num_gpus = CuptiTracer::NumGpus();
1227 cuda_event_recorders_.reserve(num_gpus);
1228 for (int i = 0; i < num_gpus; ++i) {
1229 cuda_event_recorders_.emplace_back(
1230 absl::make_unique<CudaEventRecorder>(cupti_interface, collector, i));
1231 }
1232 }
~CuptiDriverApiHookWithCudaEvent()1233 ~CuptiDriverApiHookWithCudaEvent() {
1234 for (auto *callback_context : callback_contexts_) delete callback_context;
1235 }
1236
OnDriverApiEnter(int device_id,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)1237 Status OnDriverApiEnter(int device_id, CUpti_CallbackDomain domain,
1238 CUpti_CallbackId cbid,
1239 const CUpti_CallbackData *cbdata) override {
1240 auto *recorder = cuda_event_recorders_[device_id].get();
1241 switch (cbid) {
1242 case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel: {
1243 DCHECK_NE(cbdata->symbolName, nullptr);
1244 const auto *params =
1245 static_cast<const cuLaunchKernel_params *>(cbdata->functionParams);
1246 *cbdata->correlationData = recorder->StartKernel<cuLaunchKernel_params>(
1247 cbdata->symbolName, cbdata->context, cbdata->correlationId, params);
1248 break;
1249 }
1250 case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel: {
1251 DCHECK_NE(cbdata->symbolName, nullptr);
1252 const auto *params =
1253 static_cast<const cuLaunchCooperativeKernel_params *>(
1254 cbdata->functionParams);
1255 *cbdata->correlationData =
1256 recorder->StartKernel<cuLaunchCooperativeKernel_params>(
1257 cbdata->symbolName, cbdata->context, cbdata->correlationId,
1258 params);
1259 break;
1260 }
1261 case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice: {
1262 #if CUDA_VERSION >= 10000
1263 const auto *params =
1264 static_cast<const cuLaunchCooperativeKernelMultiDevice_params *>(
1265 cbdata->functionParams);
1266 std::vector<uint32> record_indices;
1267 record_indices.reserve(params->numDevices);
1268 *cbdata->correlationData = -1; // Invalid value.
1269 const auto &annotation = AnnotationStack::Get();
1270 for (int i = 0; i < params->numDevices; ++i) {
1271 CUstream stream = params->launchParamsList[i].hStream;
1272 ScopedCudaContext scoped_cuda_context(stream);
1273 auto dev_id = scoped_cuda_context.GetDeviceOrdinal();
1274 auto context = scoped_cuda_context.GetContext();
1275 if (!dev_id) return errors::Internal("Invalid CUDA stream");
1276 // Because annotation are per device, therefore we need to populate
1277 // annotation for each device involved.
1278 collector_->annotation_map()->Add(*dev_id, cbdata->correlationId,
1279 annotation, "");
1280 record_indices.push_back(
1281 cuda_event_recorders_[*dev_id]->StartKernel<CUDA_LAUNCH_PARAMS>(
1282 "CooperativeKernelMultiDevice", *context,
1283 cbdata->correlationId, &(params->launchParamsList[i])));
1284 }
1285 auto *callback_context =
1286 new CuptiApiCallbackContext(std::move(record_indices));
1287 callback_contexts_.insert(callback_context);
1288 *cbdata->correlationData = reinterpret_cast<uint64>(callback_context);
1289 #else
1290 VLOG(1) << "Unhandled cuLaunchCooperativeKernelMultiDevice.";
1291 #endif
1292 } break;
1293 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy: {
1294 const auto *params =
1295 static_cast<const cuMemcpy_params *>(cbdata->functionParams);
1296 StartMemcpy<cuMemcpy_params>(GetMemcpyType(params->src, params->dst),
1297 cbdata, recorder);
1298 break;
1299 }
1300 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync: {
1301 const auto *params =
1302 static_cast<const cuMemcpyAsync_params *>(cbdata->functionParams);
1303 StartMemcpyAsync<cuMemcpyAsync_params>(
1304 GetMemcpyType(params->src, params->dst), cbdata, recorder);
1305 break;
1306 }
1307 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2:
1308 StartMemcpy<cuMemcpyHtoD_v2_params>(CuptiTracerEventType::MemcpyH2D,
1309 cbdata, recorder);
1310 break;
1311 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2:
1312 StartMemcpyAsync<cuMemcpyHtoDAsync_v2_params>(
1313 CuptiTracerEventType::MemcpyH2D, cbdata, recorder);
1314 break;
1315 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2:
1316 StartMemcpy<cuMemcpyDtoH_v2_params>(CuptiTracerEventType::MemcpyD2H,
1317 cbdata, recorder);
1318 break;
1319 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2:
1320 StartMemcpyAsync<cuMemcpyDtoHAsync_v2_params>(
1321 CuptiTracerEventType::MemcpyD2H, cbdata, recorder);
1322 break;
1323 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2:
1324 StartMemcpy<cuMemcpyDtoD_v2_params>(CuptiTracerEventType::MemcpyD2D,
1325 cbdata, recorder);
1326 break;
1327 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2:
1328 StartMemcpyAsync<cuMemcpyDtoDAsync_v2_params>(
1329 CuptiTracerEventType::MemcpyD2D, cbdata, recorder);
1330 break;
1331 default:
1332 VLOG(1) << "Unexpected callback id: " << cbid;
1333 break;
1334 }
1335 return Status::OK();
1336 }
1337
OnDriverApiExit(int device_id,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)1338 Status OnDriverApiExit(int device_id, CUpti_CallbackDomain domain,
1339 CUpti_CallbackId cbid,
1340 const CUpti_CallbackData *cbdata) override {
1341 auto *recorder = cuda_event_recorders_[device_id].get();
1342 if (*cbdata->correlationData == static_cast<size_t>(-1))
1343 return Status::OK();
1344 uint64 start_tsc = 0;
1345 switch (cbid) {
1346 case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel:
1347 case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel:
1348 start_tsc = recorder->StopKernel(*cbdata->correlationData);
1349 break;
1350 case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice: {
1351 #if CUDA_VERSION >= 10000
1352 auto *callback_context = reinterpret_cast<CuptiApiCallbackContext *>(
1353 *cbdata->correlationData);
1354 callback_contexts_.erase(callback_context);
1355 auto record_indices = std::move(callback_context->record_indices);
1356 delete callback_context;
1357 const auto *params =
1358 static_cast<const cuLaunchCooperativeKernelMultiDevice_params *>(
1359 cbdata->functionParams);
1360 if (record_indices.size() != params->numDevices)
1361 return errors::Internal("Invalid correlation data");
1362 for (int i = 0; i < params->numDevices; ++i) {
1363 CUstream stream = params->launchParamsList[i].hStream;
1364 ScopedCudaContext scoped_cuda_context(stream);
1365 auto dev_id = scoped_cuda_context.GetDeviceOrdinal();
1366 if (!dev_id) return errors::Internal("Invalid CUDA stream");
1367 start_tsc =
1368 cuda_event_recorders_[*dev_id]->StopKernel(record_indices[i]);
1369 }
1370 #endif
1371 } break;
1372 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy:
1373 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync:
1374 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2:
1375 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2:
1376 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2:
1377 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2:
1378 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2:
1379 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2:
1380 start_tsc = recorder->StopMemcpy(*cbdata->correlationData);
1381 break;
1382 default:
1383 VLOG(1) << "Unexpected callback id: " << cbid;
1384 // TODO: figure out how to get start timestamp in this case.
1385 return Status::OK();
1386 }
1387 // If we are not collecting CPU events from Callback API, we can return now.
1388 if (!option_.required_callback_api_events) {
1389 return Status::OK();
1390 }
1391
1392 // Grab timestamp for API exit. API entry timestamp saved in cbdata.
1393 uint64 end_tsc = CuptiTracer::GetTimestamp();
1394 return AddDriverApiCallbackEvent(collector_, cupti_interface_, device_id,
1395 start_tsc, end_tsc, domain, cbid, cbdata);
1396 }
SyncAndFlush()1397 Status SyncAndFlush() override {
1398 for (auto &recorder : cuda_event_recorders_) {
1399 TF_RETURN_IF_ERROR(recorder->Stop());
1400 }
1401 for (auto &recorder : cuda_event_recorders_) {
1402 TF_RETURN_IF_ERROR(recorder->Flush(collector_->annotation_map()));
1403 }
1404 return Status::OK();
1405 }
1406
1407 private:
1408 template <typename T>
StartMemcpy(CuptiTracerEventType type,const CUpti_CallbackData * cbdata,CudaEventRecorder * recorder)1409 static void StartMemcpy(CuptiTracerEventType type,
1410 const CUpti_CallbackData *cbdata,
1411 CudaEventRecorder *recorder) {
1412 const auto *params = static_cast<const T *>(cbdata->functionParams);
1413 *cbdata->correlationData =
1414 recorder->StartMemcpy(type, params->ByteCount, cbdata->context, nullptr,
1415 cbdata->correlationId, /*async*/ false);
1416 }
1417
1418 template <typename T>
StartMemcpyAsync(CuptiTracerEventType type,const CUpti_CallbackData * cbdata,CudaEventRecorder * recorder)1419 static void StartMemcpyAsync(CuptiTracerEventType type,
1420 const CUpti_CallbackData *cbdata,
1421 CudaEventRecorder *recorder) {
1422 const auto *params = static_cast<const T *>(cbdata->functionParams);
1423 *cbdata->correlationData = recorder->StartMemcpy(
1424 type, params->ByteCount, cbdata->context, params->hStream,
1425 cbdata->correlationId, /*async*/ true);
1426 }
1427
GetMemoryType(CUdeviceptr ptr)1428 static CUmemorytype GetMemoryType(CUdeviceptr ptr) {
1429 CuptiApiTracingDisabler disabler;
1430 CUmemorytype mem_type = CU_MEMORYTYPE_HOST;
1431 auto status =
1432 cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptr);
1433 if (status == CUDA_ERROR_INVALID_VALUE) {
1434 // Pointer not registered with CUDA, must be host memory.
1435 return CU_MEMORYTYPE_HOST;
1436 }
1437 LogIfError(ToStatus(status));
1438 return mem_type;
1439 }
1440
GetMemcpyType(CUdeviceptr src,CUdeviceptr dst)1441 static CuptiTracerEventType GetMemcpyType(CUdeviceptr src, CUdeviceptr dst) {
1442 CUmemorytype src_type = GetMemoryType(src);
1443 CUmemorytype dst_type = GetMemoryType(dst);
1444 // TODO: handle CU_MEMORYTYPE_ARRAY case
1445 if (src_type == CU_MEMORYTYPE_HOST && dst_type == CU_MEMORYTYPE_DEVICE) {
1446 return CuptiTracerEventType::MemcpyH2D;
1447 } else if (src_type == CU_MEMORYTYPE_DEVICE &&
1448 dst_type == CU_MEMORYTYPE_HOST) {
1449 return CuptiTracerEventType::MemcpyD2H;
1450 } else if (src_type == CU_MEMORYTYPE_DEVICE &&
1451 dst_type == CU_MEMORYTYPE_DEVICE) {
1452 return CuptiTracerEventType::MemcpyD2D;
1453 }
1454 return CuptiTracerEventType::MemcpyOther;
1455 }
1456
1457 // Each cuLaunchCooperativeKernelMultiDevice will need to add an entry in
1458 // each corresponding device, therefore we need to keep records of all
1459 // the record indices in each device's record array.
1460 // We allocate such data structure during API entry and free during API exit.
1461 // However there is no guarantee that we receive such callbacks in pairs, we
1462 // maintain a on-going API calls to make sure no memory leaks.
1463 struct CuptiApiCallbackContext {
CuptiApiCallbackContexttensorflow::profiler::__anond046b8460111::CuptiDriverApiHookWithCudaEvent::CuptiApiCallbackContext1464 explicit CuptiApiCallbackContext(std::vector<uint32> &&r)
1465 : record_indices(std::move(r)) {}
1466 std::vector<uint32> record_indices;
1467 };
1468
1469 const CuptiTracerOptions option_;
1470 CuptiInterface *cupti_interface_;
1471 CuptiTraceCollector *collector_;
1472 absl::node_hash_set<CuptiApiCallbackContext *> callback_contexts_;
1473 std::vector<std::unique_ptr<CudaEventRecorder>> cuda_event_recorders_;
1474 TF_DISALLOW_COPY_AND_ASSIGN(CuptiDriverApiHookWithCudaEvent);
1475 };
1476
ErrorWithHostname(absl::string_view error_message)1477 /*static*/ std::string ErrorWithHostname(absl::string_view error_message) {
1478 return absl::StrCat(port::Hostname(), ": ", error_message);
1479 }
1480
1481 } // namespace
1482
AddDriverApiCallbackEvent(CuptiTraceCollector * collector,CuptiInterface * cupti_interface,int device_id,uint64 start_tsc,uint64 end_tsc,CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)1483 /*static*/ Status CuptiDriverApiHook::AddDriverApiCallbackEvent(
1484 CuptiTraceCollector *collector, CuptiInterface *cupti_interface,
1485 int device_id, uint64 start_tsc, uint64 end_tsc,
1486 CUpti_CallbackDomain domain, CUpti_CallbackId cbid,
1487 const CUpti_CallbackData *cbdata) {
1488 switch (cbid) {
1489 case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel:
1490 case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel:
1491 case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice:
1492 AddKernelEventUponApiExit(collector, device_id, cbdata, start_tsc,
1493 end_tsc);
1494 break;
1495 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy:
1496 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync:
1497 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2:
1498 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2:
1499 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2:
1500 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2:
1501 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2:
1502 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2:
1503 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2:
1504 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2:
1505 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2:
1506 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2:
1507 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2:
1508 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2:
1509 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2:
1510 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2:
1511 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2:
1512 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2:
1513 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2:
1514 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2:
1515 AddNormalMemcpyEventUponApiExit(collector, device_id, cbid, cbdata,
1516 start_tsc, end_tsc);
1517 break;
1518 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer:
1519 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync:
1520 AddP2PMemcpyEventUponApiExit(collector, cupti_interface, device_id, cbid,
1521 cbdata, start_tsc, end_tsc);
1522 break;
1523 case CUPTI_DRIVER_TRACE_CBID_cuMemAlloc_v2:
1524 AddCuMemAllocEventUponApiExit(collector, device_id, cbid, cbdata,
1525 start_tsc, end_tsc);
1526 break;
1527 case CUPTI_DRIVER_TRACE_CBID_cuMemAllocPitch_v2:
1528 AddCuMemAllocPitchEventUponApiExit(collector, device_id, cbid, cbdata,
1529 start_tsc, end_tsc);
1530 break;
1531 case CUPTI_DRIVER_TRACE_CBID_cuMemFree_v2:
1532 AddCuMemFreeEventUponApiExit(collector, device_id, cbid, cbdata,
1533 start_tsc, end_tsc);
1534 break;
1535 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD8_v2:
1536 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD16_v2:
1537 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD32_v2:
1538 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8_v2:
1539 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16_v2:
1540 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32_v2:
1541 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD8Async:
1542 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD16Async:
1543 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD32Async:
1544 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D8Async:
1545 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D16Async:
1546 case CUPTI_DRIVER_TRACE_CBID_cuMemsetD2D32Async:
1547 AddCuMemsetEventUponApiExit(collector, device_id, cbid, cbdata, start_tsc,
1548 end_tsc);
1549 break;
1550 default:
1551 AddGenericEventUponApiExit(collector, device_id, cbid, cbdata, start_tsc,
1552 end_tsc);
1553 break;
1554 }
1555 return Status::OK();
1556 }
1557
GetTraceEventTypeName(const CuptiTracerEventType & type)1558 const char *GetTraceEventTypeName(const CuptiTracerEventType &type) {
1559 // Do not use a default so that this gives a build error when
1560 // CuptiTracerEventType is extended but this is not.
1561 switch (type) {
1562 case CuptiTracerEventType::MemcpyH2D:
1563 return "MemcpyH2D";
1564 case CuptiTracerEventType::MemcpyD2H:
1565 return "MemcpyD2H";
1566 case CuptiTracerEventType::MemcpyD2D:
1567 return "MemcpyD2D";
1568 case CuptiTracerEventType::MemcpyP2P:
1569 return "MemcpyP2P";
1570 case CuptiTracerEventType::MemcpyOther:
1571 return "MemcpyOther";
1572 case CuptiTracerEventType::Kernel:
1573 return "Compute";
1574 case CuptiTracerEventType::MemoryAlloc:
1575 return "MemoryAlloc";
1576 case CuptiTracerEventType::MemoryFree:
1577 return "MemoryFree";
1578 case CuptiTracerEventType::Memset:
1579 return "Memset";
1580 case CuptiTracerEventType::Overhead:
1581 return "Overhead";
1582 case CuptiTracerEventType::UnifiedMemory:
1583 return "UnifiedMemory";
1584 case CuptiTracerEventType::Generic:
1585 return "Generic";
1586 case CuptiTracerEventType::MemoryResidency:
1587 return "MemoryResidency";
1588 case CuptiTracerEventType::Unsupported:
1589 return "";
1590 }
1591 }
1592
GetCuptiTracerSingleton()1593 /* static */ CuptiTracer *CuptiTracer::GetCuptiTracerSingleton() {
1594 static auto *singleton = new CuptiTracer(GetCuptiInterface());
1595 return singleton;
1596 }
1597
IsAvailable() const1598 bool CuptiTracer::IsAvailable() const {
1599 return NumGpus() && !activity_tracing_enabled_ && !api_tracing_enabled_;
1600 }
1601
NumGpus()1602 int CuptiTracer::NumGpus() {
1603 static int num_gpus = []() -> int {
1604 if (cuInit(0) != CUDA_SUCCESS) {
1605 return 0;
1606 }
1607 int gpu_count;
1608 if (cuDeviceGetCount(&gpu_count) != CUDA_SUCCESS) {
1609 return 0;
1610 }
1611 LOG(INFO) << "Profiler found " << gpu_count << " GPUs";
1612 return gpu_count;
1613 }();
1614 return num_gpus;
1615 }
1616
Enable(const CuptiTracerOptions & option,CuptiTraceCollector * collector)1617 void CuptiTracer::Enable(const CuptiTracerOptions &option,
1618 CuptiTraceCollector *collector) {
1619 option_ = option;
1620 collector_ = collector;
1621 if (option_->enable_event_based_activity) {
1622 option_->enable_activity_api = false;
1623 cupti_driver_api_hook_.reset(new CuptiDriverApiHookWithCudaEvent(
1624 option, cupti_interface_, collector));
1625 } else {
1626 cupti_driver_api_hook_.reset(new CuptiDriverApiHookWithActivityApi(
1627 option, cupti_interface_, collector));
1628 }
1629
1630 Status status = EnableApiTracing();
1631 need_root_access_ |= status.code() == error::PERMISSION_DENIED;
1632 if (!status.ok()) return;
1633
1634 if (option_->enable_activity_api) {
1635 EnableActivityTracing().IgnoreError();
1636 }
1637 }
1638
Disable()1639 void CuptiTracer::Disable() {
1640 DisableApiTracing().IgnoreError();
1641 if (option_->enable_activity_api) {
1642 DisableActivityTracing().IgnoreError();
1643 }
1644 cupti_interface_->CleanUp();
1645 Finalize().IgnoreError();
1646 cupti_driver_api_hook_->SyncAndFlush().IgnoreError();
1647 collector_->Flush();
1648 collector_ = nullptr;
1649 option_.reset();
1650 cupti_driver_api_hook_.reset();
1651 }
1652
EnableApiTracing()1653 Status CuptiTracer::EnableApiTracing() {
1654 if (api_tracing_enabled_) return Status::OK();
1655
1656 VLOG(1) << "Enable subscriber";
1657 // Subscribe can return CUPTI_ERROR_MAX_LIMIT_REACHED.
1658 // The application which calls CUPTI APIs cannot be used with Nvidia tools
1659 // like nvprof, Nvidia Visual Profiler, Nsight Compute, Nsight Systems.
1660 RETURN_IF_CUPTI_ERROR(cupti_interface_->Subscribe(
1661 &subscriber_, (CUpti_CallbackFunc)ApiCallback, this));
1662 api_tracing_enabled_ = true;
1663
1664 if (!option_->cbids_selected.empty()) {
1665 for (auto cbid : option_->cbids_selected) {
1666 RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableCallback(
1667 1 /* ENABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API, cbid));
1668 }
1669 } else { // select all callback ids.
1670 RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
1671 1 /* ENABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API));
1672 }
1673
1674 if (option_->enable_nvtx_tracking) {
1675 RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
1676 1 /* ENABLE */, subscriber_, CUPTI_CB_DOMAIN_NVTX));
1677 }
1678 return Status::OK();
1679 }
1680
DisableApiTracing()1681 Status CuptiTracer::DisableApiTracing() {
1682 if (!api_tracing_enabled_) return Status::OK();
1683
1684 api_tracing_enabled_ = false;
1685
1686 if (!option_->cbids_selected.empty()) {
1687 for (auto cbid : option_->cbids_selected) {
1688 RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableCallback(
1689 0 /* DISABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API, cbid));
1690 }
1691 } else {
1692 RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
1693 0 /* DISABLE */, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API));
1694 }
1695
1696 if (option_->enable_nvtx_tracking) {
1697 RETURN_IF_CUPTI_ERROR(cupti_interface_->EnableDomain(
1698 0 /* DISABLE */, subscriber_, CUPTI_CB_DOMAIN_NVTX));
1699 }
1700
1701 VLOG(1) << "Disable subscriber";
1702 RETURN_IF_CUPTI_ERROR(cupti_interface_->Unsubscribe(subscriber_));
1703 return Status::OK();
1704 }
1705
EnableActivityTracing()1706 Status CuptiTracer::EnableActivityTracing() {
1707 if (!option_->activities_selected.empty()) {
1708 // Initialize callback functions for Cupti Activity API.
1709 VLOG(1) << "Registering CUPTI activity callbacks";
1710 RETURN_IF_CUPTI_ERROR(cupti_interface_->ActivityRegisterCallbacks(
1711 AllocCuptiActivityBuffer, FreeCuptiActivityBuffer));
1712
1713 VLOG(1) << "Enabling activity tracing for "
1714 << option_->activities_selected.size() << " activities";
1715 for (auto activity : option_->activities_selected) {
1716 VLOG(1) << "Enabling activity tracing for: " << activity;
1717 if (activity == CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER) {
1718 ConfigureActivityUnifiedMemoryCounter(true);
1719 }
1720 RETURN_IF_CUPTI_ERROR(cupti_interface_->ActivityEnable(activity));
1721 }
1722 }
1723 activity_tracing_enabled_ = true;
1724 return Status::OK();
1725 }
1726
DisableActivityTracing()1727 Status CuptiTracer::DisableActivityTracing() {
1728 if (activity_tracing_enabled_) {
1729 VLOG(1) << "Disabling activity tracing for "
1730 << option_->activities_selected.size() << " activities";
1731 for (auto activity : option_->activities_selected) {
1732 VLOG(1) << "Disabling activity tracing for: " << activity;
1733 if (activity == CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER) {
1734 ConfigureActivityUnifiedMemoryCounter(false);
1735 }
1736 RETURN_IF_CUPTI_ERROR(cupti_interface_->ActivityDisable(activity));
1737 }
1738 option_->activities_selected.clear();
1739
1740 VLOG(1) << "Flushing CUPTI activity buffer";
1741 RETURN_IF_CUPTI_ERROR(
1742 cupti_interface_->ActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED));
1743 LOG(INFO) << "CUPTI activity buffer flushed";
1744 }
1745 activity_tracing_enabled_ = false;
1746 return Status::OK();
1747 }
1748
Finalize()1749 Status CuptiTracer::Finalize() {
1750 if (option_->cupti_finalize) {
1751 VLOG(1) << "CuptiFinalize";
1752 RETURN_IF_CUPTI_ERROR(cupti_interface_->Finalize());
1753 }
1754 return Status::OK();
1755 }
1756
GetTimestamp()1757 /*static*/ uint64 CuptiTracer::GetTimestamp() {
1758 uint64_t tsc;
1759 CuptiInterface *cupti_interface = GetCuptiInterface();
1760 if (cupti_interface && cupti_interface->GetTimestamp(&tsc) == CUPTI_SUCCESS) {
1761 return tsc;
1762 }
1763 // Return 0 on error. If an activity timestamp is 0, the activity will be
1764 // dropped during time normalization.
1765 return 0;
1766 }
1767
HandleNVTXCallback(CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)1768 Status CuptiTracer::HandleNVTXCallback(CUpti_CallbackId cbid,
1769 const CUpti_CallbackData *cbdata) {
1770 const CUpti_NvtxData *pdata =
1771 reinterpret_cast<const CUpti_NvtxData *>(cbdata);
1772 if (cbid == CUPTI_CBID_NVTX_nvtxDomainRangePushEx) {
1773 const nvtxDomainRangePushEx_params *params =
1774 reinterpret_cast<const nvtxDomainRangePushEx_params *>(
1775 pdata->functionParams);
1776 // TODO(profiler): The messageType is actually NVTX_MESSAGE_TYPE_REGISTERED
1777 // (which is 3), However it seems to me that we can not get the registered
1778 // string from nvtxDomainRegisterStringA_params. If we reinterpret the
1779 // payload as ascii, it happen to work.
1780 NVTXRangeTracker::EnterRange(params->core.eventAttrib->message.ascii);
1781 } else if (cbid == CUPTI_CBID_NVTX_nvtxDomainRangePop) {
1782 NVTXRangeTracker::ExitRange();
1783 }
1784 return Status::OK();
1785 }
1786
HandleCallback(CUpti_CallbackDomain domain,CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata)1787 Status CuptiTracer::HandleCallback(CUpti_CallbackDomain domain,
1788 CUpti_CallbackId cbid,
1789 const CUpti_CallbackData *cbdata) {
1790 if (!api_tracing_enabled_) return Status::OK(); // already unsubscribed.
1791 if (!cupti_driver_api_hook_) return Status::OK(); // already unsubscribed.
1792 if (domain == CUPTI_CB_DOMAIN_NVTX) return HandleNVTXCallback(cbid, cbdata);
1793 if (domain != CUPTI_CB_DOMAIN_DRIVER_API) return Status::OK();
1794 if (internalCuCall) return Status::OK();
1795
1796 if (cbdata->context == nullptr) {
1797 // API callback is called before any CUDA context is created.
1798 // This is expected to be rare, and we ignore this case.
1799 VLOG(3) << "API callback received before creation of CUDA context\n";
1800 return errors::Internal("cutpi callback without context");
1801 }
1802
1803 // Grab a correct device ID.
1804 uint32 device_id = -1;
1805 RETURN_IF_CUPTI_ERROR(
1806 cupti_interface_->GetDeviceId(cbdata->context, &device_id));
1807 if (device_id >= num_gpus_) {
1808 return errors::Internal("Invalid device id:", device_id);
1809 }
1810
1811 if (cbdata->callbackSite == CUPTI_API_ENTER) {
1812 TF_RETURN_IF_ERROR(cupti_driver_api_hook_->OnDriverApiEnter(
1813 device_id, domain, cbid, cbdata));
1814 } else if (cbdata->callbackSite == CUPTI_API_EXIT) {
1815 // Set up the map from correlation id to annotation string.
1816 const auto &annotation = AnnotationStack::Get();
1817 if (!annotation.empty()) {
1818 if (cbid ==
1819 CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice) {
1820 // Kernels are launched on different devices by this API call, therefore
1821 // we need to populate per device annotation map respectively.
1822 for (int i = 0; i < num_gpus_; ++i) {
1823 collector_->annotation_map()->Add(i, cbdata->correlationId,
1824 annotation, "");
1825 }
1826 } else {
1827 absl::string_view nvtx_range = NVTXRangeTracker::CurrentRange();
1828 collector_->annotation_map()->Add(device_id, cbdata->correlationId,
1829 annotation, nvtx_range);
1830 }
1831 }
1832
1833 TF_RETURN_IF_ERROR(cupti_driver_api_hook_->OnDriverApiExit(
1834 device_id, domain, cbid, cbdata));
1835 }
1836 return Status::OK();
1837 }
1838
ConfigureActivityUnifiedMemoryCounter(bool enable)1839 void CuptiTracer::ConfigureActivityUnifiedMemoryCounter(bool enable) {
1840 CUpti_ActivityUnifiedMemoryCounterConfig config[2];
1841 // By experiments, currently only measurements from these two activities are
1842 // trustworthy. Others like GPU page fault may be problematic.
1843 config[0].kind =
1844 CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD;
1845 config[1].kind =
1846 CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH;
1847
1848 for (size_t i = 0; i < 2; i++) {
1849 config[i].enable = enable;
1850 }
1851
1852 CUptiResult res;
1853
1854 res = cupti_interface_->ActivityConfigureUnifiedMemoryCounter(config, 2);
1855 if (res == CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED) {
1856 LOG(ERROR) << "Unified memory is not supported on the "
1857 "underlying platform.\n";
1858 } else if (res == CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_DEVICE) {
1859 LOG(ERROR) << "Unified memory is not supported on the device.\n";
1860 } else if (res == CUPTI_ERROR_UM_PROFILING_NOT_SUPPORTED_ON_NON_P2P_DEVICES) {
1861 LOG(ERROR) << "Unified memory is not supported on the "
1862 "non-P2P multi-gpu setup.\n";
1863 } else if (res != CUPTI_SUCCESS) {
1864 const char *errstr = "";
1865 cuptiGetResultString(res, &errstr);
1866 LOG(ERROR) << "Error while enabling unified memory profiling: " << errstr;
1867 } else {
1868 VLOG(1) << "Configuring Unified memory profiling: " << res;
1869 }
1870 }
1871
ProcessActivityBuffer(CUcontext context,uint32_t stream_id,uint8_t * buffer,size_t size)1872 Status CuptiTracer::ProcessActivityBuffer(CUcontext context, uint32_t stream_id,
1873 uint8_t *buffer, size_t size) {
1874 if (!activity_tracing_enabled_) {
1875 LOG(WARNING) << "CUPTI activity buffer is freed after flush.";
1876 return Status::OK();
1877 }
1878 if (cupti_interface_->Disabled()) return errors::Internal("Disabled.");
1879
1880 CUpti_Activity *record = nullptr;
1881 while (true) {
1882 CUptiResult status =
1883 cupti_interface_->ActivityGetNextRecord(buffer, size, &record);
1884 if (status == CUPTI_SUCCESS) {
1885 switch (record->kind) {
1886 case CUPTI_ACTIVITY_KIND_KERNEL: // sequential
1887 case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL:
1888 AddKernelActivityEvent(
1889 collector_, reinterpret_cast<CUpti_ActivityKernel4 *>(record));
1890 break;
1891 case CUPTI_ACTIVITY_KIND_MEMCPY:
1892 AddMemcpyActivityEvent(
1893 collector_, reinterpret_cast<CUpti_ActivityMemcpy *>(record));
1894 break;
1895 case CUPTI_ACTIVITY_KIND_MEMCPY2:
1896 AddMemcpy2ActivityEvent(
1897 collector_, reinterpret_cast<CUpti_ActivityMemcpy2 *>(record));
1898 break;
1899 case CUPTI_ACTIVITY_KIND_OVERHEAD:
1900 AddCuptiOverheadActivityEvent(
1901 collector_, reinterpret_cast<CUpti_ActivityOverhead *>(record));
1902 break;
1903 case CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER:
1904 AddUnifiedMemoryActivityEvent(
1905 collector_,
1906 reinterpret_cast<CUpti_ActivityUnifiedMemoryCounter2 *>(record));
1907 break;
1908 case CUPTI_ACTIVITY_KIND_MEMORY: {
1909 AddMemoryActivityEvent(
1910 collector_, reinterpret_cast<CUpti_ActivityMemory *>(record));
1911 } break;
1912 case CUPTI_ACTIVITY_KIND_MEMSET:
1913 AddMemsetActivityEvent(
1914 collector_, reinterpret_cast<CUpti_ActivityMemset *>(record));
1915 break;
1916 case CUPTI_ACTIVITY_KIND_SYNCHRONIZATION:
1917 AddSynchronizationActivityEvent(
1918 collector_,
1919 reinterpret_cast<CUpti_ActivitySynchronization *>(record));
1920 break;
1921 default:
1922 VLOG(3) << "Activity type " << record->kind << " is not supported.";
1923 break;
1924 }
1925 } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
1926 break;
1927 } else {
1928 return errors::Internal("Parse cupti activity buffer error.");
1929 }
1930 }
1931
1932 // Report dropped records.
1933 size_t dropped;
1934 RETURN_IF_CUPTI_ERROR(cupti_interface_->ActivityGetNumDroppedRecords(
1935 context, stream_id, &dropped));
1936 if (dropped != 0) {
1937 uint32 device_id = -1;
1938 RETURN_IF_CUPTI_ERROR(cupti_interface_->GetDeviceId(context, &device_id));
1939 collector_->OnEventsDropped("cupti activity buffer full", dropped);
1940 }
1941 return Status::OK();
1942 }
1943
ErrorIfAny()1944 /*static*/ std::string CuptiTracer::ErrorIfAny() {
1945 if (CuptiTracer::NumGpus() == 0) {
1946 return ErrorWithHostname("No GPU detected.");
1947 } else if (CuptiTracer::GetCuptiTracerSingleton()->NeedRootAccess()) {
1948 return ErrorWithHostname(
1949 "Insufficient privilege to run libcupti (you need root permission).");
1950 } else if (CuptiTracer::GetTimestamp() == 0) {
1951 return ErrorWithHostname(
1952 "Failed to load libcupti (is it installed and accessible?)");
1953 }
1954 return "";
1955 }
1956
1957 } // namespace profiler
1958 } // namespace tensorflow
1959