1 /**
2 * Copyright 2020-2021 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "profiler/device/gpu/gpu_profiling.h"
18
19 #include <cxxabi.h>
20 #include <chrono>
21 #include <cmath>
22 #include <ctime>
23 #include "profiler/device/gpu/cupti_interface.h"
24 #include "profiler/device/gpu/gpu_data_saver.h"
25 #include "pybind_api/api_register.h"
26 #include "utils/log_adapter.h"
27 #include "utils/utils.h"
28 #include "utils/profile.h"
29 #include "utils/ms_context.h"
30
31 namespace mindspore {
32 namespace profiler {
33 namespace gpu {
34 const size_t BUF_SIZE = 32 * 1024;
35 const size_t ALIGN_SIZE = 8;
36 #define CHECK_CUPTI_RET_WITH_ERROR(expression, message) \
37 if ((expression) != CUPTI_SUCCESS) { \
38 const char *errstr; \
39 CuptiGetResultString(expression, &errstr); \
40 MS_LOG(ERROR) << "CUPTI Error:" << errstr << " function:" << (message) \
41 << ". You may not have access to the NVIDIA GPU performance counters on " \
42 << "the target device. Please use the root account to run profiling or " \
43 << "configure permissions. If there is still the problem, please refer to the" \
44 << " GPU performance tuning document on the official website of mindinsight."; \
45 }
46
47 #define CHECK_CUPTI_RET_WITH_EXCEPT(expression, message) \
48 if ((expression) != CUPTI_SUCCESS) { \
49 const char *errstr; \
50 CuptiGetResultString(expression, &errstr); \
51 MS_LOG(EXCEPTION) << "CUPTI Error:" << errstr << " function:" << (message); \
52 }
53 #define CHECK_CUDA_RET_WITH_ERROR(expression, message) \
54 do { \
55 cudaError_t status = (expression); \
56 if (status != cudaSuccess) { \
57 MS_LOG(ERROR) << "CUDA Error: " << (message) << " | Error Number: " << status << " " \
58 << cudaGetErrorString(status); \
59 } \
60 } while (0)
61 #define PROFILER_ERROR_IF_NULLPTR(ptr) \
62 do { \
63 if ((ptr) == nullptr) { \
64 MS_LOG(ERROR) << ": The pointer[" << #ptr << "] is null."; \
65 return; \
66 } \
67 } while (0)
68
69 std::shared_ptr<GPUProfiler> GPUProfiler::profiler_inst_ = std::make_shared<GPUProfiler>();
70
GetThreadID()71 int32_t GetThreadID() {
72 uint32_t thread_id = static_cast<uint32_t>(pthread_self());
73 return thread_id;
74 }
75
GetStreamID(const CUcontext context,const void * stream)76 uint32_t GetStreamID(const CUcontext context, const void *stream) {
77 uint32_t stream_id = 0;
78 if (stream != nullptr) {
79 CHECK_CUPTI_RET_WITH_ERROR(CuptiGetStreamId(context, (CUstream)stream, &stream_id), "CuptiGetStreamId");
80 if (CuptiGetStreamId(context, (CUstream)stream, &stream_id) != CUPTI_SUCCESS) {
81 MS_LOG(ERROR) << "Training process unexpectedly stopped, profiling data cannot be write to file"
82 << "To obtain the profiling data, do not interrupt the training process.";
83 }
84 }
85 return stream_id;
86 }
87
GetCUPTITimeStamp()88 uint64_t GetCUPTITimeStamp() {
89 uint64_t time_stamp = 0l;
90 CHECK_CUPTI_RET_WITH_ERROR(CuptiGetTimestamp(&time_stamp), "CuptiGetTimestamp");
91 return time_stamp;
92 }
93
GetHostTimeStamp()94 uint64_t GetHostTimeStamp() {
95 auto cur_sys_clock = std::chrono::system_clock::now();
96 uint64_t cur_time_stamp =
97 std::chrono::duration_cast<std::chrono::nanoseconds>(cur_sys_clock.time_since_epoch()).count();
98 return cur_time_stamp;
99 }
100
GetKernelFunc(const char * name)101 std::string GetKernelFunc(const char *name) {
102 char *demangledName = abi::__cxa_demangle(name, nullptr, nullptr, nullptr);
103 if (demangledName != nullptr) {
104 return demangledName;
105 } else {
106 return name;
107 }
108 }
109
IsMemcpyAsyncEvent(CUpti_CallbackId cb_id)110 bool IsMemcpyAsyncEvent(CUpti_CallbackId cb_id) {
111 switch (cb_id) {
112 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync:
113 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2:
114 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2:
115 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2:
116 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2:
117 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2:
118 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2:
119 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2:
120 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync:
121 return true;
122 default:
123 return false;
124 }
125 return false;
126 }
127
IsMemcpySyncEvent(CUpti_CallbackId cb_id)128 bool IsMemcpySyncEvent(CUpti_CallbackId cb_id) {
129 switch (cb_id) {
130 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy:
131 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2:
132 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2:
133 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2:
134 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2:
135 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2:
136 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2:
137 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2:
138 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2:
139 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2:
140 case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2:
141 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2:
142 case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer:
143 return true;
144 default:
145 return false;
146 }
147 return false;
148 }
149
CUPTIApiExit(const std::shared_ptr<GPUProfiler> & gpu_profiler_inst,CUpti_CallbackId cb_id,const CUpti_CallbackData * cb_data)150 void CUPTIApiExit(const std::shared_ptr<GPUProfiler> &gpu_profiler_inst, CUpti_CallbackId cb_id,
151 const CUpti_CallbackData *cb_data) {
152 uint64_t start_timestamp = *cb_data->correlationData;
153 uint64_t end_timestamp = GetCUPTITimeStamp();
154 switch (cb_id) {
155 case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel:
156 case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel:
157 case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice:
158 gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "cuLaunchKernel", start_timestamp, end_timestamp);
159 break;
160 case CUPTI_DRIVER_TRACE_CBID_cuMemAlloc:
161 case CUPTI_DRIVER_TRACE_CBID_cuMemAlloc_v2:
162 gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "cuMemAlloc", start_timestamp, end_timestamp);
163 break;
164 case CUPTI_DRIVER_TRACE_CBID_cuEventCreate:
165 case CUPTI_DRIVER_TRACE_CBID_cuEventDestroy_v2:
166 case CUPTI_DRIVER_TRACE_CBID_cuEventRecord:
167 case CUPTI_DRIVER_TRACE_CBID_cuEventSynchronize:
168 case CUPTI_DRIVER_TRACE_CBID_cuEventElapsedTime:
169 // In some cases, the callback of cuctxsetcurrent is only exist
170 // without entry, so this callback is ignored
171 case CUPTI_DRIVER_TRACE_CBID_cuCtxSetCurrent:
172 break;
173 default:
174 gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "others_api", start_timestamp, end_timestamp);
175 break;
176 }
177 if (IsMemcpyAsyncEvent(cb_id) || IsMemcpySyncEvent(cb_id)) {
178 gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "cuMemcpy", start_timestamp, end_timestamp);
179 }
180 }
181
CUPTICallBackFunc(void * user_data,CUpti_CallbackDomain domain,CUpti_CallbackId cb_id,const CUpti_CallbackData * cb_data)182 void CUPTICallBackFunc(void *user_data, CUpti_CallbackDomain domain, CUpti_CallbackId cb_id,
183 const CUpti_CallbackData *cb_data) {
184 if (domain != CUPTI_CB_DOMAIN_DRIVER_API) {
185 return;
186 }
187 auto gpu_profiler_inst = GPUProfiler::GetInstance();
188 PROFILER_ERROR_IF_NULLPTR(gpu_profiler_inst);
189 if (!gpu_profiler_inst->GetEnableFlag()) {
190 return;
191 }
192
193 PROFILER_ERROR_IF_NULLPTR(cb_data);
194 if (cb_data->context == nullptr) {
195 MS_LOG(DEBUG) << "Callback data context is null , correlation Id:" << cb_data->correlationId
196 << " callback id:" << cb_id;
197 return;
198 }
199
200 if (cb_data->callbackSite == CUPTI_API_ENTER) {
201 *cb_data->correlationData = GetCUPTITimeStamp();
202 } else if (cb_data->callbackSite == CUPTI_API_EXIT) {
203 CUPTIApiExit(gpu_profiler_inst, cb_id, cb_data);
204 }
205 }
206
GetKernelFuncName(std::string kernel_name)207 std::string GetKernelFuncName(std::string kernel_name) {
208 // remove the return type name (void) in kernel_name.
209 std::string search_pattern("void ");
210 auto func_name_begin_iter = kernel_name.find(search_pattern);
211 if (func_name_begin_iter == kernel_name.npos) {
212 func_name_begin_iter = 0;
213 } else {
214 func_name_begin_iter += search_pattern.length();
215 }
216 return kernel_name.substr(func_name_begin_iter);
217 }
218
GetInstance()219 std::shared_ptr<GPUProfiler> &GPUProfiler::GetInstance() {
220 MS_EXCEPTION_IF_NULL(profiler_inst_);
221 return profiler_inst_;
222 }
223
SyncEnable(const bool enable_flag)224 void GPUProfiler::SyncEnable(const bool enable_flag) {
225 MS_LOG(INFO) << "GPU Profiler synchronous enable flag:" << enable_flag;
226 sync_enable_flag_ = enable_flag;
227 }
228
StepProfilingEnable(const bool enable_flag)229 void GPUProfiler::StepProfilingEnable(const bool enable_flag) {
230 MS_LOG(INFO) << "GPU Profiler enable flag:" << enable_flag;
231 CHECK_CUPTI_RET_WITH_ERROR(CuptiActivityFlushAll(0), "CuptiActivityFlushAll");
232 enable_flag_ = enable_flag;
233 }
234
FixOpNameByCorrelationId(Event * event)235 void GPUProfiler::FixOpNameByCorrelationId(Event *event) {
236 PROFILER_ERROR_IF_NULLPTR(event);
237 if (event->api_type != CUPTIApiType::kActivity) {
238 return;
239 }
240 auto iter = op_name_map_.find(event->correlation_id);
241 if (iter != op_name_map_.end()) {
242 event->op_name = std::move(iter->second);
243 }
244 }
245
AddEvent(Event && event)246 void GPUProfiler::AddEvent(Event &&event) {
247 // protect callback concurrency for driver api and activity
248 std::unique_lock<std::mutex> lock(event_mutex_);
249 switch (event.api_type) {
250 case CUPTIApiType::kCallback: {
251 if (cupti_callback_events_count_ < max_cupti_callback_events_) {
252 events_.emplace_back(std::move(event));
253 cupti_callback_events_count_++;
254 } else {
255 cupti_callback_events_drop_count_++;
256 }
257 break;
258 }
259 case CUPTIApiType::kActivity: {
260 if (cupti_activity_events_count_ < max_cupti_activity_events_) {
261 events_.emplace_back(std::move(event));
262 cupti_activity_events_count_++;
263 } else {
264 cupti_activity_events_drop_count_++;
265 }
266 break;
267 }
268 default:
269 break;
270 }
271 }
272
EventLog(const Event & event)273 void GPUProfiler::EventLog(const Event &event) {
274 MS_LOG(DEBUG) << "GPUProfiler"
275 << ",\"kernel_name:" << event.kernel_name << "\",kernel_type:" << event.kernel_type
276 << ",api_type:" << static_cast<int>(event.api_type) << ",start_time_stamp:" << event.start_time_stamp
277 << ",end_time_stamp:" << event.end_time_stamp << ",cost:,"
278 << (event.end_time_stamp - event.start_time_stamp) / kTimeUnit << ",op_name:" << event.op_name
279 << ",device_id:" << event.device_id << ",correlation_id:" << event.correlation_id
280 << ",thread_id:" << event.thread_id << ",context_id:" << event.context_id
281 << ",stream_id:" << event.stream_id << ",cb_id:" << event.cb_id;
282 }
283
ProcessEvents()284 void GPUProfiler::ProcessEvents() {
285 for (Event &event : events_) {
286 if (event.op_name.empty()) {
287 FixOpNameByCorrelationId(&event);
288 }
289
290 EventLog(event);
291
292 if (event.op_name.empty() || event.cb_id == CUPTI_DRIVER_TRACE_CBID_cuStreamSynchronize) {
293 continue;
294 }
295
296 auto iter = op_info_map_.find(event.op_name);
297 if (iter != op_info_map_.end()) {
298 switch (event.api_type) {
299 case CUPTIApiType::kCallback: {
300 iter->second.op_kernel_api_count += 1;
301 // The time unit from ns to us
302 iter->second.cupti_api_call_time += (event.end_time_stamp - event.start_time_stamp) / kTimeUnit;
303 break;
304 }
305 case CUPTIApiType::kActivity: {
306 iter->second.op_kernel_count += 1;
307 // The time unit from ns to us
308 iter->second.cupti_activity_time += (event.end_time_stamp - event.start_time_stamp) / kTimeUnit;
309 break;
310 }
311 default:
312 break;
313 }
314 }
315 }
316 }
317
OpsParser()318 void GPUProfiler::OpsParser() {
319 MS_LOG(INFO) << "Count the number of events size:" << events_.size()
320 << " callback api:" << cupti_callback_events_count_ << " activity:" << cupti_activity_events_count_;
321
322 if (cupti_activity_events_drop_count_ > 0 || cupti_callback_events_drop_count_ > 0) {
323 MS_LOG(WARNING)
324 << "The total number of events exceeded the profiler's processing capacity, some events were discarded."
325 << " activity api events:" << cupti_activity_events_drop_count_
326 << " callback api events:" << cupti_callback_events_drop_count_;
327 }
328
329 if (events_.size() == 0) {
330 return;
331 }
332
333 ProcessEvents();
334 MS_LOG(DEBUG) << "GPU_profiler, op_name, op_count , kernel_count, kernel_api_count,|"
335 ",cupti_activity_total_time, cupti_api_call_total_time, op_host_cost_total_time,|"
336 ",cupti_activity_average_time,cupti_api_call_average_time, op_host_cost_average_time"
337 << std::endl;
338
339 std::vector<std::pair<std::string, OpInfo>> order_vec(op_info_map_.begin(), op_info_map_.end());
340
341 auto cmp_func = [](const std::pair<std::string, OpInfo> &a, const std::pair<std::string, OpInfo> &b) {
342 return a.second.cupti_activity_time > b.second.cupti_activity_time;
343 };
344 std::sort(order_vec.begin(), order_vec.end(), cmp_func);
345
346 for (auto iter = order_vec.begin(); iter != order_vec.end(); iter++) {
347 if (iter->second.op_count == 0) {
348 MS_LOG(ERROR) << "The num of operations can not be 0.";
349 return;
350 }
351 MS_LOG(DEBUG) << "GPU_profiler"
352 << "," << iter->first << "," << iter->second.op_count << "," << iter->second.op_kernel_count << ","
353 << iter->second.op_kernel_api_count << ","
354 << "|," << iter->second.cupti_activity_time << "," << iter->second.cupti_api_call_time << ","
355 << iter->second.op_host_cost_time << ","
356 << "|," << round(iter->second.cupti_activity_time / iter->second.op_count) << ","
357 << round(iter->second.cupti_api_call_time / iter->second.op_count) << ","
358 << round(iter->second.op_host_cost_time / iter->second.op_count) << std::endl;
359 }
360 }
361
EventHandleProcess(CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,const std::string & typestring,uint64_t startTimestamp,uint64_t endTimestamp)362 void GPUProfiler::EventHandleProcess(CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata,
363 const std::string &typestring, uint64_t startTimestamp, uint64_t endTimestamp) {
364 Event event;
365 uint32_t device_id = -1;
366 CuptiGetDeviceId(cbdata->context, &device_id);
367 event.kernel_name = cbdata->symbolName ? GetKernelFunc(cbdata->symbolName) : cbdata->functionName;
368 event.kernel_name = GetKernelFuncName(event.kernel_name);
369 event.kernel_type = typestring;
370 event.api_type = CUPTIApiType::kCallback;
371 event.start_time_stamp = startTimestamp;
372 event.end_time_stamp = endTimestamp;
373 event.op_name = op_name_;
374 event.device_id = device_id;
375 event.correlation_id = cbdata->correlationId;
376 event.thread_id = GetThreadID();
377 event.context_id = cbdata->contextUid;
378 event.stream_id = GetStreamID(cbdata->context, stream_);
379 event.cb_id = cbid;
380 op_name_map_[event.correlation_id] = event.op_name;
381 AddEvent(std::move(event));
382 }
383
384 void CUPTIAPI ActivityAllocBuffer(uint8_t **buffer, size_t *size, size_t *maxNumRecords);
385
386 void CUPTIAPI ActivityProcessBuffer(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize);
387
Init(const std::string & profileDataPath="")388 void GPUProfiler::Init(const std::string &profileDataPath = "") {
389 MS_LOG(INFO) << "Initialize GPU Profiling";
390 if (subscriber_ != nullptr) {
391 StopCUPTI();
392 MS_LOG(EXCEPTION)
393 << "Repeated initialization, Please check whether you have created the Profiler object multiple times";
394 }
395 CHECK_CUPTI_RET_WITH_EXCEPT(CuptiSubscribe(&subscriber_, (CUpti_CallbackFunc)CUPTICallBackFunc, this),
396 "CuptiSubscribe");
397 CHECK_CUPTI_RET_WITH_EXCEPT(CuptiEnableDomain(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API), "CuptiEnableDomain");
398
399 activities_enable_.emplace_back(CUPTI_ACTIVITY_KIND_MEMCPY);
400 activities_enable_.emplace_back(CUPTI_ACTIVITY_KIND_MEMCPY2);
401 activities_enable_.emplace_back(CUPTI_ACTIVITY_KIND_KERNEL);
402
403 for (std::vector<CUpti_ActivityKind>::iterator it = activities_enable_.begin(); it != activities_enable_.end();
404 ++it) {
405 CHECK_CUPTI_RET_WITH_EXCEPT(CuptiActivityEnable(*it), "CuptiActivityEnable");
406 }
407
408 CHECK_CUPTI_RET_WITH_EXCEPT(CuptiActivityRegisterCallbacks(ActivityAllocBuffer, ActivityProcessBuffer),
409 "CuptiActivityRegisterCallbacks");
410
411 base_time_.gpu_start_time = GetCUPTITimeStamp();
412 base_time_.host_start_time = GetHostTimeStamp();
413 base_time_.host_start_monotonic_raw_time = GetHostMonoTimeStamp();
414
415 profile_data_path_ = profileDataPath;
416 MS_LOG(INFO) << "GPU start time(ns):" << base_time_.gpu_start_time
417 << " Host start time(ns):" << base_time_.host_start_time << " profile data path: " << profile_data_path_;
418 }
419
SetRunTimeData(const std::string & op_name,void * stream)420 void GPUProfiler::SetRunTimeData(const std::string &op_name, void *stream) {
421 auto iter = op_info_map_.find(op_name);
422 if (iter != op_info_map_.end()) {
423 iter->second.op_count += 1;
424 } else {
425 OpInfo op_info;
426 op_info.op_name = op_name;
427 op_info.stream = stream;
428 op_info.op_count = 1;
429 op_info_map_[op_name] = op_info;
430 }
431 op_name_ = op_name;
432 stream_ = stream;
433 }
434
OpDataProducerBegin(const std::string op_name,void * stream)435 void GPUProfiler::OpDataProducerBegin(const std::string op_name, void *stream) {
436 if (sync_enable_flag_) {
437 CHECK_CUDA_RET_WITH_ERROR(cudaEventCreate(&op_event_start_), "cudaEventCreate op event start failed");
438 CHECK_CUDA_RET_WITH_ERROR(cudaEventCreate(&op_event_stop_), "cudaEventCreate op event stop failed");
439 CHECK_CUDA_RET_WITH_ERROR(cudaEventRecord(op_event_start_, (CUstream)stream_),
440 "cudaEventRecord op event start failed");
441 op_host_time_start_ = GetHostTimeStamp();
442 op_cupti_time_start_ = GetCUPTITimeStamp();
443 } else {
444 op_host_time_start_ = GetHostTimeStamp();
445 op_cupti_time_start_ = GetCUPTITimeStamp();
446 }
447 SetRunTimeData(op_name, stream);
448
449 if (MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
450 RecordOneStepStartEndInfo(op_name);
451 }
452 }
453
SingleOpLaunchTimeProcess(float op_time_elapsed)454 void GPUProfiler::SingleOpLaunchTimeProcess(float op_time_elapsed) {
455 auto launch_end_time = GetTime();
456 double launch_start_time = launch_end_time - op_time_elapsed / kTimeUnit / kTimeUnit;
457 SetSingleOpLaunchTime(std::make_pair(launch_start_time, launch_end_time));
458 }
459
OpDataProducerEnd()460 void GPUProfiler::OpDataProducerEnd() {
461 float op_time_elapsed = 0;
462 if (sync_enable_flag_) {
463 CHECK_CUDA_RET_WITH_ERROR(cudaEventRecord(op_event_stop_, (CUstream)stream_),
464 "cudaEventRecord op event stop failed");
465 CHECK_CUDA_RET_WITH_ERROR(cudaEventSynchronize(op_event_start_), "cudaEventSynchronize op event start failed");
466 CHECK_CUDA_RET_WITH_ERROR(cudaEventSynchronize(op_event_stop_), "cudaEventSynchronize op event stop failed");
467 CHECK_CUDA_RET_WITH_ERROR(cudaEventElapsedTime(&op_time_elapsed, op_event_start_, op_event_stop_),
468 "cudaEventElapsedTime failed");
469 CHECK_CUDA_RET_WITH_ERROR(cudaEventDestroy(op_event_start_), "cudaEventDestroy op event start failed");
470 CHECK_CUDA_RET_WITH_ERROR(cudaEventDestroy(op_event_stop_), "cudaEventDestroy op event stop failed");
471 op_time_elapsed = op_time_elapsed * kTimeUnit;
472 op_host_time_stop_ = GetHostTimeStamp();
473 SingleOpLaunchTimeProcess(op_time_elapsed);
474 } else {
475 op_host_time_stop_ = GetHostTimeStamp();
476 op_time_elapsed = (op_host_time_stop_ - op_host_time_start_) / kTimeUnit;
477 SingleOpLaunchTimeProcess(op_time_elapsed);
478 }
479 MS_LOG(DEBUG) << "Host Time Elapsed(us)," << op_name_ << "," << op_time_elapsed;
480 Profiler::SetRunTimeData(op_name_, op_time_elapsed);
481 Profiler::SetRunTimeData(op_name_, op_cupti_time_start_, op_time_elapsed);
482 }
483
StopCUPTI()484 void GPUProfiler::StopCUPTI() {
485 if (subscriber_ != nullptr) {
486 CHECK_CUPTI_RET_WITH_ERROR(CuptiUnsubscribe(subscriber_), "CuptiUnsubscribe");
487 CHECK_CUPTI_RET_WITH_ERROR(CuptiActivityFlushAll(0), "CuptiActivityFlushAll");
488 for (std::vector<CUpti_ActivityKind>::iterator it = activities_enable_.begin(); it != activities_enable_.end();
489 ++it) {
490 CHECK_CUPTI_RET_WITH_ERROR(CuptiActivityDisable(*it), "CuptiActivityDisable");
491 }
492 subscriber_ = nullptr;
493 }
494 }
495
Stop()496 void GPUProfiler::Stop() {
497 MS_LOG(INFO) << "Stop GPU Profiling";
498 StopCUPTI();
499 OpsParser();
500 SaveProfileData();
501 ClearInst();
502 }
503
SaveExtraProfileData()504 void GPUProfiler::SaveExtraProfileData() {
505 for (auto op : profiling_op_) {
506 op.second->SaveProfilingData();
507 }
508 MS_LOG(INFO) << "Save extra profiling data end.";
509 }
510
SaveProfileData()511 void GPUProfiler::SaveProfileData() {
512 if (profile_data_path_.empty()) {
513 MS_LOG(WARNING) << "Profile data path is empty, skip save profile data.";
514 } else {
515 GpuDataSaver dataSaver(step_trace_op_name_, all_step_start_end_info_);
516 dataSaver.ParseOpInfo(op_info_map_);
517 dataSaver.ParseEvent(events_);
518 dataSaver.WriteFile(profile_data_path_, base_time_);
519 SaveExtraProfileData();
520 }
521 }
522
ClearInst()523 void GPUProfiler::ClearInst() {
524 op_info_map_.clear();
525 op_name_map_.clear();
526 events_.clear();
527 activities_enable_.clear();
528 enable_flag_ = false;
529 sync_enable_flag_ = true;
530 cupti_callback_events_count_ = 0l;
531 cupti_callback_events_drop_count_ = 0l;
532 cupti_activity_events_count_ = 0l;
533 cupti_activity_events_drop_count_ = 0l;
534 }
535
ActivityAllocBuffer(uint8_t ** buffer,size_t * size,size_t * maxNumRecords)536 void CUPTIAPI ActivityAllocBuffer(uint8_t **buffer, size_t *size, size_t *maxNumRecords) {
537 auto gpu_profiler_inst = GPUProfiler::GetInstance();
538 if (gpu_profiler_inst == nullptr) {
539 MS_LOG(ERROR) << "GPU profiler instance is nullptr";
540 return;
541 }
542 gpu_profiler_inst->AllocBuffer(buffer, size, maxNumRecords);
543 }
544
ActivityProcessBuffer(CUcontext ctx,uint32_t streamId,uint8_t * buffer,size_t size,size_t validSize)545 void CUPTIAPI ActivityProcessBuffer(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize) {
546 PROFILER_ERROR_IF_NULLPTR(buffer);
547 auto gpu_profiler_inst = GPUProfiler::GetInstance();
548 if (gpu_profiler_inst == nullptr) {
549 MS_LOG(ERROR) << "GPU profiler instance is nullptr";
550 return;
551 }
552 gpu_profiler_inst->ProcessBuffer(ctx, streamId, buffer, size, validSize);
553 }
554
ProcessActivityMemcpyRecord(Event * profilingData,CUpti_Activity * record,CUpti_ActivityMemcpy * memcpy)555 void ProcessActivityMemcpyRecord(Event *profilingData, CUpti_Activity *record, CUpti_ActivityMemcpy *memcpy) {
556 switch (memcpy->copyKind) {
557 case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
558 profilingData->activity_type = ActivityType::kMemcpyH2D;
559 profilingData->kernel_name = "MemcpyH2D";
560 break;
561 case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
562 profilingData->activity_type = ActivityType::kMemcpyD2H;
563 profilingData->kernel_name = "MemcpyD2H";
564 break;
565 case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA:
566 profilingData->activity_type = ActivityType::kMemcpyH2A;
567 profilingData->kernel_name = "MemcpyH2A";
568 break;
569 case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH:
570 profilingData->activity_type = ActivityType::kMemcpyA2H;
571 profilingData->kernel_name = "MemcpyA2H";
572 break;
573 case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD:
574 profilingData->activity_type = ActivityType::kMemcpyA2D;
575 profilingData->kernel_name = "MemcpyA2D";
576 break;
577 case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA:
578 profilingData->activity_type = ActivityType::kMemcpyD2A;
579 profilingData->kernel_name = "MemcpyD2A";
580 break;
581 case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
582 profilingData->activity_type = ActivityType::kMemcpyD2D;
583 profilingData->kernel_name = "MemcpyD2D";
584 break;
585 case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH:
586 profilingData->activity_type = ActivityType::kMemcpyH2H;
587 profilingData->kernel_name = "MemcpyH2H";
588 break;
589 case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP:
590 profilingData->activity_type = ActivityType::kMemcpyP2P;
591 profilingData->kernel_name = "MemcpyP2P";
592 break;
593 default:
594 profilingData->activity_type = ActivityType::kMemcpyUnknown;
595 profilingData->kernel_name = "MemcpyUnknown";
596 break;
597 }
598 }
599
HandleActivityMemcpyRecord(Event * profilingData,CUpti_Activity * record)600 void HandleActivityMemcpyRecord(Event *profilingData, CUpti_Activity *record) {
601 CUpti_ActivityMemcpy *memcpy = reinterpret_cast<CUpti_ActivityMemcpy *>(record);
602 ProcessActivityMemcpyRecord(profilingData, record, memcpy);
603
604 profilingData->kernel_type = "cuMemcpy";
605 profilingData->api_type = CUPTIApiType::kActivity;
606 profilingData->start_time_stamp = memcpy->start;
607 profilingData->end_time_stamp = memcpy->end;
608 profilingData->device_id = memcpy->deviceId;
609 profilingData->context_id = memcpy->contextId;
610 profilingData->stream_id = memcpy->streamId;
611 profilingData->correlation_id = memcpy->correlationId;
612 profilingData->memcpy_info.bytes = memcpy->bytes;
613 profilingData->memcpy_info.src_kind = memcpy->srcKind;
614 profilingData->memcpy_info.dst_kind = memcpy->dstKind;
615 }
616
HandleActivityMemcpy2Record(Event * profilingData,CUpti_Activity * record)617 void HandleActivityMemcpy2Record(Event *profilingData, CUpti_Activity *record) {
618 CUpti_ActivityMemcpy2 *memcpyP2P = reinterpret_cast<CUpti_ActivityMemcpy2 *>(record);
619 profilingData->activity_type = ActivityType::kMemcpyP2P;
620 profilingData->kernel_name = "MemcpyP2P";
621 profilingData->kernel_type = "cuMemcpy";
622 profilingData->api_type = CUPTIApiType::kActivity;
623 profilingData->start_time_stamp = memcpyP2P->start;
624 profilingData->end_time_stamp = memcpyP2P->end;
625 profilingData->device_id = memcpyP2P->deviceId;
626 profilingData->context_id = memcpyP2P->contextId;
627 profilingData->stream_id = memcpyP2P->streamId;
628 profilingData->correlation_id = memcpyP2P->correlationId;
629 profilingData->memcpy_info.bytes = memcpyP2P->bytes;
630 profilingData->memcpy_info.src_kind = memcpyP2P->srcKind;
631 profilingData->memcpy_info.dst_kind = memcpyP2P->dstKind;
632 }
633
HandleActivityMemsetRecord(Event * profilingData,CUpti_Activity * record)634 void HandleActivityMemsetRecord(Event *profilingData, CUpti_Activity *record) {
635 CUpti_ActivityMemset *memset = reinterpret_cast<CUpti_ActivityMemset *>(record);
636 profilingData->activity_type = ActivityType::kMemset;
637 profilingData->kernel_name = "MemorySet";
638 profilingData->api_type = CUPTIApiType::kActivity;
639 profilingData->start_time_stamp = memset->start;
640 profilingData->end_time_stamp = memset->end;
641 profilingData->device_id = memset->deviceId;
642 profilingData->context_id = memset->contextId;
643 profilingData->stream_id = memset->streamId;
644 profilingData->correlation_id = memset->correlationId;
645 profilingData->memcpy_info.bytes = memset->bytes;
646 }
647
HandleActivityKernelRecord(Event * profilingData,CUpti_Activity * record)648 void HandleActivityKernelRecord(Event *profilingData, CUpti_Activity *record) {
649 CUpti_ActivityKernel4 *kernel = reinterpret_cast<CUpti_ActivityKernel4 *>(record);
650 profilingData->activity_type = ActivityType::kKernel;
651 profilingData->api_type = CUPTIApiType::kActivity;
652 profilingData->kernel_name = GetKernelFunc(kernel->name);
653 profilingData->kernel_name = GetKernelFuncName(profilingData->kernel_name);
654 profilingData->kernel_type = "cuLaunchKernel";
655 profilingData->start_time_stamp = kernel->start;
656 profilingData->end_time_stamp = kernel->end;
657 profilingData->device_id = kernel->deviceId;
658 profilingData->context_id = kernel->contextId;
659 profilingData->stream_id = kernel->streamId;
660 profilingData->correlation_id = kernel->correlationId;
661 profilingData->kernel_info.registers_per_thread = kernel->registersPerThread;
662 profilingData->kernel_info.static_shared_memory = kernel->staticSharedMemory;
663 profilingData->kernel_info.dynamic_shared_memory = kernel->dynamicSharedMemory;
664 profilingData->kernel_info.block_x = kernel->blockX;
665 profilingData->kernel_info.block_y = kernel->blockY;
666 profilingData->kernel_info.block_z = kernel->blockZ;
667 profilingData->kernel_info.grid_x = kernel->gridX;
668 profilingData->kernel_info.grid_y = kernel->gridY;
669 profilingData->kernel_info.grid_z = kernel->gridZ;
670 }
671
HandleActivityRecord(CUpti_Activity * record)672 void GPUProfiler::HandleActivityRecord(CUpti_Activity *record) {
673 PROFILER_ERROR_IF_NULLPTR(record);
674 Event profilingData;
675 profilingData.cb_id = 0;
676 switch (record->kind) {
677 case CUPTI_ACTIVITY_KIND_MEMCPY: {
678 HandleActivityMemcpyRecord(&profilingData, record);
679 break;
680 }
681 case CUPTI_ACTIVITY_KIND_MEMCPY2: {
682 HandleActivityMemcpy2Record(&profilingData, record);
683 break;
684 }
685 case CUPTI_ACTIVITY_KIND_MEMSET: {
686 HandleActivityMemsetRecord(&profilingData, record);
687 break;
688 }
689 case CUPTI_ACTIVITY_KIND_KERNEL:
690 case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: {
691 HandleActivityKernelRecord(&profilingData, record);
692 break;
693 }
694 default:
695 MS_LOG(WARNING) << "Unknown activity type!";
696 return;
697 }
698
699 AddEvent(std::move(profilingData));
700 }
701
SetStepTraceOpName(ProfilingTraceInfo trace_op_name)702 void GPUProfiler::SetStepTraceOpName(ProfilingTraceInfo trace_op_name) { step_trace_op_name_ = trace_op_name; }
703
RegisterProfilingOp(std::shared_ptr<ProfilingOp> node)704 void GPUProfiler::RegisterProfilingOp(std::shared_ptr<ProfilingOp> node) {
705 PROFILER_ERROR_IF_NULLPTR(node);
706 if (profiling_op_.find(node->Name()) != profiling_op_.end()) {
707 return;
708 }
709 node->Init();
710 profiling_op_[node->Name()] = node;
711 }
712
AllocBuffer(uint8_t ** buffer,size_t * size,size_t * maxNumRecords)713 void CUPTIAPI GPUProfiler::AllocBuffer(uint8_t **buffer, size_t *size, size_t *maxNumRecords) {
714 PROFILER_ERROR_IF_NULLPTR(size);
715 PROFILER_ERROR_IF_NULLPTR(maxNumRecords);
716 int stat = posix_memalign(reinterpret_cast<void **>(buffer), ALIGN_SIZE, BUF_SIZE);
717 if (stat) {
718 MS_LOG(ERROR) << "Out of memory, activity buffer alloc failed.";
719 return;
720 }
721 MS_LOG(DEBUG) << "Alloc activity buffer, buffer size: " << BUF_SIZE;
722 *size = BUF_SIZE;
723 *maxNumRecords = 0;
724 }
725
ProcessBuffer(CUcontext ctx,uint32_t streamId,uint8_t * buffer,size_t size,size_t validSize)726 void CUPTIAPI GPUProfiler::ProcessBuffer(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size,
727 size_t validSize) {
728 if (!enable_flag_) {
729 MS_LOG(DEBUG) << "Profiler is not enable, skip to process activity record.";
730 free(buffer);
731 return;
732 }
733 CUptiResult status;
734 CUpti_Activity *record = NULL;
735
736 MS_LOG(DEBUG) << "Process activity buffer, valid size:" << validSize << ",Stream ID:" << streamId;
737 if (validSize > 0) {
738 do {
739 status = CuptiActivityGetNextRecord(buffer, validSize, &record);
740 if (status == CUPTI_SUCCESS) {
741 HandleActivityRecord(record);
742 } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
743 break;
744 } else {
745 CHECK_CUPTI_RET_WITH_ERROR(status, "CuptiActivityGetNextRecord");
746 }
747 } while (1);
748
749 // report any records dropped from the queue
750 size_t dropped;
751 CHECK_CUPTI_RET_WITH_ERROR(CuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped),
752 "CuptiActivityGetNumDroppedRecords");
753 if (dropped != 0) {
754 MS_LOG(INFO) << "Dropped " << (unsigned int)dropped << " activity records\n";
755 }
756 }
757
758 free(buffer);
759 }
760
__anonf90a38340202(const py::module *m) 761 REGISTER_PYBIND_DEFINE(GPUProfiler_, ([](const py::module *m) {
762 (void)py::class_<GPUProfiler, std::shared_ptr<GPUProfiler>>(*m, "GPUProfiler")
763 .def_static("get_instance", &GPUProfiler::GetInstance, "GPUProfiler get_instance.")
764 .def("init", &GPUProfiler::Init, py::arg("profile_data_path"), "init")
765 .def("stop", &GPUProfiler::Stop, "stop")
766 .def("step_profiling_enable", &GPUProfiler::StepProfilingEnable, py::arg("enable_flag"),
767 "enable or disable step profiling")
768 .def("sync_enable", &GPUProfiler::SyncEnable, py::arg("enable_flag"),
769 "enable or disable synchronization profiling");
770 }));
771 } // namespace gpu
772 } // namespace profiler
773 } // namespace mindspore
774