• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020-2021 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "profiler/device/gpu/gpu_profiling.h"
18 
19 #include <cxxabi.h>
20 #include <chrono>
21 #include <cmath>
22 #include <ctime>
23 #include "profiler/device/gpu/cupti_interface.h"
24 #include "profiler/device/gpu/gpu_data_saver.h"
25 #include "pybind_api/api_register.h"
26 #include "utils/log_adapter.h"
27 #include "utils/utils.h"
28 #include "utils/profile.h"
29 #include "utils/ms_context.h"
30 
31 namespace mindspore {
32 namespace profiler {
33 namespace gpu {
34 const size_t BUF_SIZE = 32 * 1024;
35 const size_t ALIGN_SIZE = 8;
36 #define CHECK_CUPTI_RET_WITH_ERROR(expression, message)                                          \
37   if ((expression) != CUPTI_SUCCESS) {                                                           \
38     const char *errstr;                                                                          \
39     CuptiGetResultString(expression, &errstr);                                                   \
40     MS_LOG(ERROR) << "CUPTI Error:" << errstr << " function:" << (message)                       \
41                   << ". You may not have access to the NVIDIA GPU performance counters on "      \
42                   << "the target device. Please use the root account to run profiling or "       \
43                   << "configure permissions. If there is still the problem, please refer to the" \
44                   << " GPU performance tuning document on the official website of mindinsight."; \
45   }
46 
47 #define CHECK_CUPTI_RET_WITH_EXCEPT(expression, message)                        \
48   if ((expression) != CUPTI_SUCCESS) {                                          \
49     const char *errstr;                                                         \
50     CuptiGetResultString(expression, &errstr);                                  \
51     MS_LOG(EXCEPTION) << "CUPTI Error:" << errstr << " function:" << (message); \
52   }
53 #define CHECK_CUDA_RET_WITH_ERROR(expression, message)                                     \
54   do {                                                                                     \
55     cudaError_t status = (expression);                                                     \
56     if (status != cudaSuccess) {                                                           \
57       MS_LOG(ERROR) << "CUDA Error: " << (message) << " | Error Number: " << status << " " \
58                     << cudaGetErrorString(status);                                         \
59     }                                                                                      \
60   } while (0)
61 #define PROFILER_ERROR_IF_NULLPTR(ptr)                           \
62   do {                                                           \
63     if ((ptr) == nullptr) {                                      \
64       MS_LOG(ERROR) << ": The pointer[" << #ptr << "] is null."; \
65       return;                                                    \
66     }                                                            \
67   } while (0)
68 
69 std::shared_ptr<GPUProfiler> GPUProfiler::profiler_inst_ = std::make_shared<GPUProfiler>();
70 
GetThreadID()71 int32_t GetThreadID() {
72   uint32_t thread_id = static_cast<uint32_t>(pthread_self());
73   return thread_id;
74 }
75 
GetStreamID(const CUcontext context,const void * stream)76 uint32_t GetStreamID(const CUcontext context, const void *stream) {
77   uint32_t stream_id = 0;
78   if (stream != nullptr) {
79     CHECK_CUPTI_RET_WITH_ERROR(CuptiGetStreamId(context, (CUstream)stream, &stream_id), "CuptiGetStreamId");
80     if (CuptiGetStreamId(context, (CUstream)stream, &stream_id) != CUPTI_SUCCESS) {
81       MS_LOG(ERROR) << "Training process unexpectedly stopped, profiling data cannot be write to file"
82                     << "To obtain the profiling data, do not interrupt the training process.";
83     }
84   }
85   return stream_id;
86 }
87 
GetCUPTITimeStamp()88 uint64_t GetCUPTITimeStamp() {
89   uint64_t time_stamp = 0l;
90   CHECK_CUPTI_RET_WITH_ERROR(CuptiGetTimestamp(&time_stamp), "CuptiGetTimestamp");
91   return time_stamp;
92 }
93 
GetHostTimeStamp()94 uint64_t GetHostTimeStamp() {
95   auto cur_sys_clock = std::chrono::system_clock::now();
96   uint64_t cur_time_stamp =
97     std::chrono::duration_cast<std::chrono::nanoseconds>(cur_sys_clock.time_since_epoch()).count();
98   return cur_time_stamp;
99 }
100 
GetKernelFunc(const char * name)101 std::string GetKernelFunc(const char *name) {
102   char *demangledName = abi::__cxa_demangle(name, nullptr, nullptr, nullptr);
103   if (demangledName != nullptr) {
104     return demangledName;
105   } else {
106     return name;
107   }
108 }
109 
IsMemcpyAsyncEvent(CUpti_CallbackId cb_id)110 bool IsMemcpyAsyncEvent(CUpti_CallbackId cb_id) {
111   switch (cb_id) {
112     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAsync:
113     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoDAsync_v2:
114     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoHAsync_v2:
115     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoDAsync_v2:
116     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoHAsync_v2:
117     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DAsync_v2:
118     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3DAsync_v2:
119     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoAAsync_v2:
120     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeerAsync:
121       return true;
122     default:
123       return false;
124   }
125   return false;
126 }
127 
IsMemcpySyncEvent(CUpti_CallbackId cb_id)128 bool IsMemcpySyncEvent(CUpti_CallbackId cb_id) {
129   switch (cb_id) {
130     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy:
131     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoD_v2:
132     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoH_v2:
133     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoD_v2:
134     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoH_v2:
135     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoD_v2:
136     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyDtoA_v2:
137     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyAtoA_v2:
138     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2D_v2:
139     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy2DUnaligned_v2:
140     case CUPTI_DRIVER_TRACE_CBID_cuMemcpy3D_v2:
141     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyHtoA_v2:
142     case CUPTI_DRIVER_TRACE_CBID_cuMemcpyPeer:
143       return true;
144     default:
145       return false;
146   }
147   return false;
148 }
149 
CUPTIApiExit(const std::shared_ptr<GPUProfiler> & gpu_profiler_inst,CUpti_CallbackId cb_id,const CUpti_CallbackData * cb_data)150 void CUPTIApiExit(const std::shared_ptr<GPUProfiler> &gpu_profiler_inst, CUpti_CallbackId cb_id,
151                   const CUpti_CallbackData *cb_data) {
152   uint64_t start_timestamp = *cb_data->correlationData;
153   uint64_t end_timestamp = GetCUPTITimeStamp();
154   switch (cb_id) {
155     case CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel:
156     case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernel:
157     case CUPTI_DRIVER_TRACE_CBID_cuLaunchCooperativeKernelMultiDevice:
158       gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "cuLaunchKernel", start_timestamp, end_timestamp);
159       break;
160     case CUPTI_DRIVER_TRACE_CBID_cuMemAlloc:
161     case CUPTI_DRIVER_TRACE_CBID_cuMemAlloc_v2:
162       gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "cuMemAlloc", start_timestamp, end_timestamp);
163       break;
164     case CUPTI_DRIVER_TRACE_CBID_cuEventCreate:
165     case CUPTI_DRIVER_TRACE_CBID_cuEventDestroy_v2:
166     case CUPTI_DRIVER_TRACE_CBID_cuEventRecord:
167     case CUPTI_DRIVER_TRACE_CBID_cuEventSynchronize:
168     case CUPTI_DRIVER_TRACE_CBID_cuEventElapsedTime:
169     // In some cases, the callback of cuctxsetcurrent is only exist
170     // without entry, so this callback is ignored
171     case CUPTI_DRIVER_TRACE_CBID_cuCtxSetCurrent:
172       break;
173     default:
174       gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "others_api", start_timestamp, end_timestamp);
175       break;
176   }
177   if (IsMemcpyAsyncEvent(cb_id) || IsMemcpySyncEvent(cb_id)) {
178     gpu_profiler_inst->EventHandleProcess(cb_id, cb_data, "cuMemcpy", start_timestamp, end_timestamp);
179   }
180 }
181 
CUPTICallBackFunc(void * user_data,CUpti_CallbackDomain domain,CUpti_CallbackId cb_id,const CUpti_CallbackData * cb_data)182 void CUPTICallBackFunc(void *user_data, CUpti_CallbackDomain domain, CUpti_CallbackId cb_id,
183                        const CUpti_CallbackData *cb_data) {
184   if (domain != CUPTI_CB_DOMAIN_DRIVER_API) {
185     return;
186   }
187   auto gpu_profiler_inst = GPUProfiler::GetInstance();
188   PROFILER_ERROR_IF_NULLPTR(gpu_profiler_inst);
189   if (!gpu_profiler_inst->GetEnableFlag()) {
190     return;
191   }
192 
193   PROFILER_ERROR_IF_NULLPTR(cb_data);
194   if (cb_data->context == nullptr) {
195     MS_LOG(DEBUG) << "Callback data context is null , correlation Id:" << cb_data->correlationId
196                   << " callback id:" << cb_id;
197     return;
198   }
199 
200   if (cb_data->callbackSite == CUPTI_API_ENTER) {
201     *cb_data->correlationData = GetCUPTITimeStamp();
202   } else if (cb_data->callbackSite == CUPTI_API_EXIT) {
203     CUPTIApiExit(gpu_profiler_inst, cb_id, cb_data);
204   }
205 }
206 
GetKernelFuncName(std::string kernel_name)207 std::string GetKernelFuncName(std::string kernel_name) {
208   // remove the return type name (void) in kernel_name.
209   std::string search_pattern("void ");
210   auto func_name_begin_iter = kernel_name.find(search_pattern);
211   if (func_name_begin_iter == kernel_name.npos) {
212     func_name_begin_iter = 0;
213   } else {
214     func_name_begin_iter += search_pattern.length();
215   }
216   return kernel_name.substr(func_name_begin_iter);
217 }
218 
GetInstance()219 std::shared_ptr<GPUProfiler> &GPUProfiler::GetInstance() {
220   MS_EXCEPTION_IF_NULL(profiler_inst_);
221   return profiler_inst_;
222 }
223 
SyncEnable(const bool enable_flag)224 void GPUProfiler::SyncEnable(const bool enable_flag) {
225   MS_LOG(INFO) << "GPU Profiler synchronous enable flag:" << enable_flag;
226   sync_enable_flag_ = enable_flag;
227 }
228 
StepProfilingEnable(const bool enable_flag)229 void GPUProfiler::StepProfilingEnable(const bool enable_flag) {
230   MS_LOG(INFO) << "GPU Profiler enable flag:" << enable_flag;
231   CHECK_CUPTI_RET_WITH_ERROR(CuptiActivityFlushAll(0), "CuptiActivityFlushAll");
232   enable_flag_ = enable_flag;
233 }
234 
FixOpNameByCorrelationId(Event * event)235 void GPUProfiler::FixOpNameByCorrelationId(Event *event) {
236   PROFILER_ERROR_IF_NULLPTR(event);
237   if (event->api_type != CUPTIApiType::kActivity) {
238     return;
239   }
240   auto iter = op_name_map_.find(event->correlation_id);
241   if (iter != op_name_map_.end()) {
242     event->op_name = std::move(iter->second);
243   }
244 }
245 
AddEvent(Event && event)246 void GPUProfiler::AddEvent(Event &&event) {
247   // protect callback concurrency for driver api and activity
248   std::unique_lock<std::mutex> lock(event_mutex_);
249   switch (event.api_type) {
250     case CUPTIApiType::kCallback: {
251       if (cupti_callback_events_count_ < max_cupti_callback_events_) {
252         events_.emplace_back(std::move(event));
253         cupti_callback_events_count_++;
254       } else {
255         cupti_callback_events_drop_count_++;
256       }
257       break;
258     }
259     case CUPTIApiType::kActivity: {
260       if (cupti_activity_events_count_ < max_cupti_activity_events_) {
261         events_.emplace_back(std::move(event));
262         cupti_activity_events_count_++;
263       } else {
264         cupti_activity_events_drop_count_++;
265       }
266       break;
267     }
268     default:
269       break;
270   }
271 }
272 
EventLog(const Event & event)273 void GPUProfiler::EventLog(const Event &event) {
274   MS_LOG(DEBUG) << "GPUProfiler"
275                 << ",\"kernel_name:" << event.kernel_name << "\",kernel_type:" << event.kernel_type
276                 << ",api_type:" << static_cast<int>(event.api_type) << ",start_time_stamp:" << event.start_time_stamp
277                 << ",end_time_stamp:" << event.end_time_stamp << ",cost:,"
278                 << (event.end_time_stamp - event.start_time_stamp) / kTimeUnit << ",op_name:" << event.op_name
279                 << ",device_id:" << event.device_id << ",correlation_id:" << event.correlation_id
280                 << ",thread_id:" << event.thread_id << ",context_id:" << event.context_id
281                 << ",stream_id:" << event.stream_id << ",cb_id:" << event.cb_id;
282 }
283 
ProcessEvents()284 void GPUProfiler::ProcessEvents() {
285   for (Event &event : events_) {
286     if (event.op_name.empty()) {
287       FixOpNameByCorrelationId(&event);
288     }
289 
290     EventLog(event);
291 
292     if (event.op_name.empty() || event.cb_id == CUPTI_DRIVER_TRACE_CBID_cuStreamSynchronize) {
293       continue;
294     }
295 
296     auto iter = op_info_map_.find(event.op_name);
297     if (iter != op_info_map_.end()) {
298       switch (event.api_type) {
299         case CUPTIApiType::kCallback: {
300           iter->second.op_kernel_api_count += 1;
301           // The time unit from ns to us
302           iter->second.cupti_api_call_time += (event.end_time_stamp - event.start_time_stamp) / kTimeUnit;
303           break;
304         }
305         case CUPTIApiType::kActivity: {
306           iter->second.op_kernel_count += 1;
307           // The time unit from ns to us
308           iter->second.cupti_activity_time += (event.end_time_stamp - event.start_time_stamp) / kTimeUnit;
309           break;
310         }
311         default:
312           break;
313       }
314     }
315   }
316 }
317 
OpsParser()318 void GPUProfiler::OpsParser() {
319   MS_LOG(INFO) << "Count the number of events size:" << events_.size()
320                << " callback api:" << cupti_callback_events_count_ << " activity:" << cupti_activity_events_count_;
321 
322   if (cupti_activity_events_drop_count_ > 0 || cupti_callback_events_drop_count_ > 0) {
323     MS_LOG(WARNING)
324       << "The total number of events exceeded the profiler's processing capacity, some events were discarded."
325       << " activity api events:" << cupti_activity_events_drop_count_
326       << " callback api events:" << cupti_callback_events_drop_count_;
327   }
328 
329   if (events_.size() == 0) {
330     return;
331   }
332 
333   ProcessEvents();
334   MS_LOG(DEBUG) << "GPU_profiler, op_name, op_count , kernel_count, kernel_api_count,|"
335                    ",cupti_activity_total_time, cupti_api_call_total_time, op_host_cost_total_time,|"
336                    ",cupti_activity_average_time,cupti_api_call_average_time, op_host_cost_average_time"
337                 << std::endl;
338 
339   std::vector<std::pair<std::string, OpInfo>> order_vec(op_info_map_.begin(), op_info_map_.end());
340 
341   auto cmp_func = [](const std::pair<std::string, OpInfo> &a, const std::pair<std::string, OpInfo> &b) {
342     return a.second.cupti_activity_time > b.second.cupti_activity_time;
343   };
344   std::sort(order_vec.begin(), order_vec.end(), cmp_func);
345 
346   for (auto iter = order_vec.begin(); iter != order_vec.end(); iter++) {
347     if (iter->second.op_count == 0) {
348       MS_LOG(ERROR) << "The num of operations can not be 0.";
349       return;
350     }
351     MS_LOG(DEBUG) << "GPU_profiler"
352                   << "," << iter->first << "," << iter->second.op_count << "," << iter->second.op_kernel_count << ","
353                   << iter->second.op_kernel_api_count << ","
354                   << "|," << iter->second.cupti_activity_time << "," << iter->second.cupti_api_call_time << ","
355                   << iter->second.op_host_cost_time << ","
356                   << "|," << round(iter->second.cupti_activity_time / iter->second.op_count) << ","
357                   << round(iter->second.cupti_api_call_time / iter->second.op_count) << ","
358                   << round(iter->second.op_host_cost_time / iter->second.op_count) << std::endl;
359   }
360 }
361 
EventHandleProcess(CUpti_CallbackId cbid,const CUpti_CallbackData * cbdata,const std::string & typestring,uint64_t startTimestamp,uint64_t endTimestamp)362 void GPUProfiler::EventHandleProcess(CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata,
363                                      const std::string &typestring, uint64_t startTimestamp, uint64_t endTimestamp) {
364   Event event;
365   uint32_t device_id = -1;
366   CuptiGetDeviceId(cbdata->context, &device_id);
367   event.kernel_name = cbdata->symbolName ? GetKernelFunc(cbdata->symbolName) : cbdata->functionName;
368   event.kernel_name = GetKernelFuncName(event.kernel_name);
369   event.kernel_type = typestring;
370   event.api_type = CUPTIApiType::kCallback;
371   event.start_time_stamp = startTimestamp;
372   event.end_time_stamp = endTimestamp;
373   event.op_name = op_name_;
374   event.device_id = device_id;
375   event.correlation_id = cbdata->correlationId;
376   event.thread_id = GetThreadID();
377   event.context_id = cbdata->contextUid;
378   event.stream_id = GetStreamID(cbdata->context, stream_);
379   event.cb_id = cbid;
380   op_name_map_[event.correlation_id] = event.op_name;
381   AddEvent(std::move(event));
382 }
383 
384 void CUPTIAPI ActivityAllocBuffer(uint8_t **buffer, size_t *size, size_t *maxNumRecords);
385 
386 void CUPTIAPI ActivityProcessBuffer(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize);
387 
Init(const std::string & profileDataPath="")388 void GPUProfiler::Init(const std::string &profileDataPath = "") {
389   MS_LOG(INFO) << "Initialize GPU Profiling";
390   if (subscriber_ != nullptr) {
391     StopCUPTI();
392     MS_LOG(EXCEPTION)
393       << "Repeated initialization, Please check whether you have created the Profiler object multiple times";
394   }
395   CHECK_CUPTI_RET_WITH_EXCEPT(CuptiSubscribe(&subscriber_, (CUpti_CallbackFunc)CUPTICallBackFunc, this),
396                               "CuptiSubscribe");
397   CHECK_CUPTI_RET_WITH_EXCEPT(CuptiEnableDomain(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API), "CuptiEnableDomain");
398 
399   activities_enable_.emplace_back(CUPTI_ACTIVITY_KIND_MEMCPY);
400   activities_enable_.emplace_back(CUPTI_ACTIVITY_KIND_MEMCPY2);
401   activities_enable_.emplace_back(CUPTI_ACTIVITY_KIND_KERNEL);
402 
403   for (std::vector<CUpti_ActivityKind>::iterator it = activities_enable_.begin(); it != activities_enable_.end();
404        ++it) {
405     CHECK_CUPTI_RET_WITH_EXCEPT(CuptiActivityEnable(*it), "CuptiActivityEnable");
406   }
407 
408   CHECK_CUPTI_RET_WITH_EXCEPT(CuptiActivityRegisterCallbacks(ActivityAllocBuffer, ActivityProcessBuffer),
409                               "CuptiActivityRegisterCallbacks");
410 
411   base_time_.gpu_start_time = GetCUPTITimeStamp();
412   base_time_.host_start_time = GetHostTimeStamp();
413   base_time_.host_start_monotonic_raw_time = GetHostMonoTimeStamp();
414 
415   profile_data_path_ = profileDataPath;
416   MS_LOG(INFO) << "GPU start time(ns):" << base_time_.gpu_start_time
417                << " Host start time(ns):" << base_time_.host_start_time << " profile data path: " << profile_data_path_;
418 }
419 
SetRunTimeData(const std::string & op_name,void * stream)420 void GPUProfiler::SetRunTimeData(const std::string &op_name, void *stream) {
421   auto iter = op_info_map_.find(op_name);
422   if (iter != op_info_map_.end()) {
423     iter->second.op_count += 1;
424   } else {
425     OpInfo op_info;
426     op_info.op_name = op_name;
427     op_info.stream = stream;
428     op_info.op_count = 1;
429     op_info_map_[op_name] = op_info;
430   }
431   op_name_ = op_name;
432   stream_ = stream;
433 }
434 
OpDataProducerBegin(const std::string op_name,void * stream)435 void GPUProfiler::OpDataProducerBegin(const std::string op_name, void *stream) {
436   if (sync_enable_flag_) {
437     CHECK_CUDA_RET_WITH_ERROR(cudaEventCreate(&op_event_start_), "cudaEventCreate  op event start failed");
438     CHECK_CUDA_RET_WITH_ERROR(cudaEventCreate(&op_event_stop_), "cudaEventCreate op event stop failed");
439     CHECK_CUDA_RET_WITH_ERROR(cudaEventRecord(op_event_start_, (CUstream)stream_),
440                               "cudaEventRecord op event start failed");
441     op_host_time_start_ = GetHostTimeStamp();
442     op_cupti_time_start_ = GetCUPTITimeStamp();
443   } else {
444     op_host_time_start_ = GetHostTimeStamp();
445     op_cupti_time_start_ = GetCUPTITimeStamp();
446   }
447   SetRunTimeData(op_name, stream);
448 
449   if (MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
450     RecordOneStepStartEndInfo(op_name);
451   }
452 }
453 
SingleOpLaunchTimeProcess(float op_time_elapsed)454 void GPUProfiler::SingleOpLaunchTimeProcess(float op_time_elapsed) {
455   auto launch_end_time = GetTime();
456   double launch_start_time = launch_end_time - op_time_elapsed / kTimeUnit / kTimeUnit;
457   SetSingleOpLaunchTime(std::make_pair(launch_start_time, launch_end_time));
458 }
459 
OpDataProducerEnd()460 void GPUProfiler::OpDataProducerEnd() {
461   float op_time_elapsed = 0;
462   if (sync_enable_flag_) {
463     CHECK_CUDA_RET_WITH_ERROR(cudaEventRecord(op_event_stop_, (CUstream)stream_),
464                               "cudaEventRecord op event stop failed");
465     CHECK_CUDA_RET_WITH_ERROR(cudaEventSynchronize(op_event_start_), "cudaEventSynchronize op event start failed");
466     CHECK_CUDA_RET_WITH_ERROR(cudaEventSynchronize(op_event_stop_), "cudaEventSynchronize op event stop failed");
467     CHECK_CUDA_RET_WITH_ERROR(cudaEventElapsedTime(&op_time_elapsed, op_event_start_, op_event_stop_),
468                               "cudaEventElapsedTime failed");
469     CHECK_CUDA_RET_WITH_ERROR(cudaEventDestroy(op_event_start_), "cudaEventDestroy  op event start failed");
470     CHECK_CUDA_RET_WITH_ERROR(cudaEventDestroy(op_event_stop_), "cudaEventDestroy  op event stop failed");
471     op_time_elapsed = op_time_elapsed * kTimeUnit;
472     op_host_time_stop_ = GetHostTimeStamp();
473     SingleOpLaunchTimeProcess(op_time_elapsed);
474   } else {
475     op_host_time_stop_ = GetHostTimeStamp();
476     op_time_elapsed = (op_host_time_stop_ - op_host_time_start_) / kTimeUnit;
477     SingleOpLaunchTimeProcess(op_time_elapsed);
478   }
479   MS_LOG(DEBUG) << "Host Time Elapsed(us)," << op_name_ << "," << op_time_elapsed;
480   Profiler::SetRunTimeData(op_name_, op_time_elapsed);
481   Profiler::SetRunTimeData(op_name_, op_cupti_time_start_, op_time_elapsed);
482 }
483 
StopCUPTI()484 void GPUProfiler::StopCUPTI() {
485   if (subscriber_ != nullptr) {
486     CHECK_CUPTI_RET_WITH_ERROR(CuptiUnsubscribe(subscriber_), "CuptiUnsubscribe");
487     CHECK_CUPTI_RET_WITH_ERROR(CuptiActivityFlushAll(0), "CuptiActivityFlushAll");
488     for (std::vector<CUpti_ActivityKind>::iterator it = activities_enable_.begin(); it != activities_enable_.end();
489          ++it) {
490       CHECK_CUPTI_RET_WITH_ERROR(CuptiActivityDisable(*it), "CuptiActivityDisable");
491     }
492     subscriber_ = nullptr;
493   }
494 }
495 
Stop()496 void GPUProfiler::Stop() {
497   MS_LOG(INFO) << "Stop GPU Profiling";
498   StopCUPTI();
499   OpsParser();
500   SaveProfileData();
501   ClearInst();
502 }
503 
SaveExtraProfileData()504 void GPUProfiler::SaveExtraProfileData() {
505   for (auto op : profiling_op_) {
506     op.second->SaveProfilingData();
507   }
508   MS_LOG(INFO) << "Save extra profiling data end.";
509 }
510 
SaveProfileData()511 void GPUProfiler::SaveProfileData() {
512   if (profile_data_path_.empty()) {
513     MS_LOG(WARNING) << "Profile data path is empty, skip save profile data.";
514   } else {
515     GpuDataSaver dataSaver(step_trace_op_name_, all_step_start_end_info_);
516     dataSaver.ParseOpInfo(op_info_map_);
517     dataSaver.ParseEvent(events_);
518     dataSaver.WriteFile(profile_data_path_, base_time_);
519     SaveExtraProfileData();
520   }
521 }
522 
ClearInst()523 void GPUProfiler::ClearInst() {
524   op_info_map_.clear();
525   op_name_map_.clear();
526   events_.clear();
527   activities_enable_.clear();
528   enable_flag_ = false;
529   sync_enable_flag_ = true;
530   cupti_callback_events_count_ = 0l;
531   cupti_callback_events_drop_count_ = 0l;
532   cupti_activity_events_count_ = 0l;
533   cupti_activity_events_drop_count_ = 0l;
534 }
535 
ActivityAllocBuffer(uint8_t ** buffer,size_t * size,size_t * maxNumRecords)536 void CUPTIAPI ActivityAllocBuffer(uint8_t **buffer, size_t *size, size_t *maxNumRecords) {
537   auto gpu_profiler_inst = GPUProfiler::GetInstance();
538   if (gpu_profiler_inst == nullptr) {
539     MS_LOG(ERROR) << "GPU profiler instance is nullptr";
540     return;
541   }
542   gpu_profiler_inst->AllocBuffer(buffer, size, maxNumRecords);
543 }
544 
ActivityProcessBuffer(CUcontext ctx,uint32_t streamId,uint8_t * buffer,size_t size,size_t validSize)545 void CUPTIAPI ActivityProcessBuffer(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize) {
546   PROFILER_ERROR_IF_NULLPTR(buffer);
547   auto gpu_profiler_inst = GPUProfiler::GetInstance();
548   if (gpu_profiler_inst == nullptr) {
549     MS_LOG(ERROR) << "GPU profiler instance is nullptr";
550     return;
551   }
552   gpu_profiler_inst->ProcessBuffer(ctx, streamId, buffer, size, validSize);
553 }
554 
ProcessActivityMemcpyRecord(Event * profilingData,CUpti_Activity * record,CUpti_ActivityMemcpy * memcpy)555 void ProcessActivityMemcpyRecord(Event *profilingData, CUpti_Activity *record, CUpti_ActivityMemcpy *memcpy) {
556   switch (memcpy->copyKind) {
557     case CUPTI_ACTIVITY_MEMCPY_KIND_HTOD:
558       profilingData->activity_type = ActivityType::kMemcpyH2D;
559       profilingData->kernel_name = "MemcpyH2D";
560       break;
561     case CUPTI_ACTIVITY_MEMCPY_KIND_DTOH:
562       profilingData->activity_type = ActivityType::kMemcpyD2H;
563       profilingData->kernel_name = "MemcpyD2H";
564       break;
565     case CUPTI_ACTIVITY_MEMCPY_KIND_HTOA:
566       profilingData->activity_type = ActivityType::kMemcpyH2A;
567       profilingData->kernel_name = "MemcpyH2A";
568       break;
569     case CUPTI_ACTIVITY_MEMCPY_KIND_ATOH:
570       profilingData->activity_type = ActivityType::kMemcpyA2H;
571       profilingData->kernel_name = "MemcpyA2H";
572       break;
573     case CUPTI_ACTIVITY_MEMCPY_KIND_ATOD:
574       profilingData->activity_type = ActivityType::kMemcpyA2D;
575       profilingData->kernel_name = "MemcpyA2D";
576       break;
577     case CUPTI_ACTIVITY_MEMCPY_KIND_DTOA:
578       profilingData->activity_type = ActivityType::kMemcpyD2A;
579       profilingData->kernel_name = "MemcpyD2A";
580       break;
581     case CUPTI_ACTIVITY_MEMCPY_KIND_DTOD:
582       profilingData->activity_type = ActivityType::kMemcpyD2D;
583       profilingData->kernel_name = "MemcpyD2D";
584       break;
585     case CUPTI_ACTIVITY_MEMCPY_KIND_HTOH:
586       profilingData->activity_type = ActivityType::kMemcpyH2H;
587       profilingData->kernel_name = "MemcpyH2H";
588       break;
589     case CUPTI_ACTIVITY_MEMCPY_KIND_PTOP:
590       profilingData->activity_type = ActivityType::kMemcpyP2P;
591       profilingData->kernel_name = "MemcpyP2P";
592       break;
593     default:
594       profilingData->activity_type = ActivityType::kMemcpyUnknown;
595       profilingData->kernel_name = "MemcpyUnknown";
596       break;
597   }
598 }
599 
HandleActivityMemcpyRecord(Event * profilingData,CUpti_Activity * record)600 void HandleActivityMemcpyRecord(Event *profilingData, CUpti_Activity *record) {
601   CUpti_ActivityMemcpy *memcpy = reinterpret_cast<CUpti_ActivityMemcpy *>(record);
602   ProcessActivityMemcpyRecord(profilingData, record, memcpy);
603 
604   profilingData->kernel_type = "cuMemcpy";
605   profilingData->api_type = CUPTIApiType::kActivity;
606   profilingData->start_time_stamp = memcpy->start;
607   profilingData->end_time_stamp = memcpy->end;
608   profilingData->device_id = memcpy->deviceId;
609   profilingData->context_id = memcpy->contextId;
610   profilingData->stream_id = memcpy->streamId;
611   profilingData->correlation_id = memcpy->correlationId;
612   profilingData->memcpy_info.bytes = memcpy->bytes;
613   profilingData->memcpy_info.src_kind = memcpy->srcKind;
614   profilingData->memcpy_info.dst_kind = memcpy->dstKind;
615 }
616 
HandleActivityMemcpy2Record(Event * profilingData,CUpti_Activity * record)617 void HandleActivityMemcpy2Record(Event *profilingData, CUpti_Activity *record) {
618   CUpti_ActivityMemcpy2 *memcpyP2P = reinterpret_cast<CUpti_ActivityMemcpy2 *>(record);
619   profilingData->activity_type = ActivityType::kMemcpyP2P;
620   profilingData->kernel_name = "MemcpyP2P";
621   profilingData->kernel_type = "cuMemcpy";
622   profilingData->api_type = CUPTIApiType::kActivity;
623   profilingData->start_time_stamp = memcpyP2P->start;
624   profilingData->end_time_stamp = memcpyP2P->end;
625   profilingData->device_id = memcpyP2P->deviceId;
626   profilingData->context_id = memcpyP2P->contextId;
627   profilingData->stream_id = memcpyP2P->streamId;
628   profilingData->correlation_id = memcpyP2P->correlationId;
629   profilingData->memcpy_info.bytes = memcpyP2P->bytes;
630   profilingData->memcpy_info.src_kind = memcpyP2P->srcKind;
631   profilingData->memcpy_info.dst_kind = memcpyP2P->dstKind;
632 }
633 
HandleActivityMemsetRecord(Event * profilingData,CUpti_Activity * record)634 void HandleActivityMemsetRecord(Event *profilingData, CUpti_Activity *record) {
635   CUpti_ActivityMemset *memset = reinterpret_cast<CUpti_ActivityMemset *>(record);
636   profilingData->activity_type = ActivityType::kMemset;
637   profilingData->kernel_name = "MemorySet";
638   profilingData->api_type = CUPTIApiType::kActivity;
639   profilingData->start_time_stamp = memset->start;
640   profilingData->end_time_stamp = memset->end;
641   profilingData->device_id = memset->deviceId;
642   profilingData->context_id = memset->contextId;
643   profilingData->stream_id = memset->streamId;
644   profilingData->correlation_id = memset->correlationId;
645   profilingData->memcpy_info.bytes = memset->bytes;
646 }
647 
HandleActivityKernelRecord(Event * profilingData,CUpti_Activity * record)648 void HandleActivityKernelRecord(Event *profilingData, CUpti_Activity *record) {
649   CUpti_ActivityKernel4 *kernel = reinterpret_cast<CUpti_ActivityKernel4 *>(record);
650   profilingData->activity_type = ActivityType::kKernel;
651   profilingData->api_type = CUPTIApiType::kActivity;
652   profilingData->kernel_name = GetKernelFunc(kernel->name);
653   profilingData->kernel_name = GetKernelFuncName(profilingData->kernel_name);
654   profilingData->kernel_type = "cuLaunchKernel";
655   profilingData->start_time_stamp = kernel->start;
656   profilingData->end_time_stamp = kernel->end;
657   profilingData->device_id = kernel->deviceId;
658   profilingData->context_id = kernel->contextId;
659   profilingData->stream_id = kernel->streamId;
660   profilingData->correlation_id = kernel->correlationId;
661   profilingData->kernel_info.registers_per_thread = kernel->registersPerThread;
662   profilingData->kernel_info.static_shared_memory = kernel->staticSharedMemory;
663   profilingData->kernel_info.dynamic_shared_memory = kernel->dynamicSharedMemory;
664   profilingData->kernel_info.block_x = kernel->blockX;
665   profilingData->kernel_info.block_y = kernel->blockY;
666   profilingData->kernel_info.block_z = kernel->blockZ;
667   profilingData->kernel_info.grid_x = kernel->gridX;
668   profilingData->kernel_info.grid_y = kernel->gridY;
669   profilingData->kernel_info.grid_z = kernel->gridZ;
670 }
671 
HandleActivityRecord(CUpti_Activity * record)672 void GPUProfiler::HandleActivityRecord(CUpti_Activity *record) {
673   PROFILER_ERROR_IF_NULLPTR(record);
674   Event profilingData;
675   profilingData.cb_id = 0;
676   switch (record->kind) {
677     case CUPTI_ACTIVITY_KIND_MEMCPY: {
678       HandleActivityMemcpyRecord(&profilingData, record);
679       break;
680     }
681     case CUPTI_ACTIVITY_KIND_MEMCPY2: {
682       HandleActivityMemcpy2Record(&profilingData, record);
683       break;
684     }
685     case CUPTI_ACTIVITY_KIND_MEMSET: {
686       HandleActivityMemsetRecord(&profilingData, record);
687       break;
688     }
689     case CUPTI_ACTIVITY_KIND_KERNEL:
690     case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: {
691       HandleActivityKernelRecord(&profilingData, record);
692       break;
693     }
694     default:
695       MS_LOG(WARNING) << "Unknown activity type!";
696       return;
697   }
698 
699   AddEvent(std::move(profilingData));
700 }
701 
SetStepTraceOpName(ProfilingTraceInfo trace_op_name)702 void GPUProfiler::SetStepTraceOpName(ProfilingTraceInfo trace_op_name) { step_trace_op_name_ = trace_op_name; }
703 
RegisterProfilingOp(std::shared_ptr<ProfilingOp> node)704 void GPUProfiler::RegisterProfilingOp(std::shared_ptr<ProfilingOp> node) {
705   PROFILER_ERROR_IF_NULLPTR(node);
706   if (profiling_op_.find(node->Name()) != profiling_op_.end()) {
707     return;
708   }
709   node->Init();
710   profiling_op_[node->Name()] = node;
711 }
712 
AllocBuffer(uint8_t ** buffer,size_t * size,size_t * maxNumRecords)713 void CUPTIAPI GPUProfiler::AllocBuffer(uint8_t **buffer, size_t *size, size_t *maxNumRecords) {
714   PROFILER_ERROR_IF_NULLPTR(size);
715   PROFILER_ERROR_IF_NULLPTR(maxNumRecords);
716   int stat = posix_memalign(reinterpret_cast<void **>(buffer), ALIGN_SIZE, BUF_SIZE);
717   if (stat) {
718     MS_LOG(ERROR) << "Out of memory, activity buffer alloc failed.";
719     return;
720   }
721   MS_LOG(DEBUG) << "Alloc activity buffer, buffer size: " << BUF_SIZE;
722   *size = BUF_SIZE;
723   *maxNumRecords = 0;
724 }
725 
ProcessBuffer(CUcontext ctx,uint32_t streamId,uint8_t * buffer,size_t size,size_t validSize)726 void CUPTIAPI GPUProfiler::ProcessBuffer(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size,
727                                          size_t validSize) {
728   if (!enable_flag_) {
729     MS_LOG(DEBUG) << "Profiler is not enable, skip to process activity record.";
730     free(buffer);
731     return;
732   }
733   CUptiResult status;
734   CUpti_Activity *record = NULL;
735 
736   MS_LOG(DEBUG) << "Process activity buffer, valid size:" << validSize << ",Stream ID:" << streamId;
737   if (validSize > 0) {
738     do {
739       status = CuptiActivityGetNextRecord(buffer, validSize, &record);
740       if (status == CUPTI_SUCCESS) {
741         HandleActivityRecord(record);
742       } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
743         break;
744       } else {
745         CHECK_CUPTI_RET_WITH_ERROR(status, "CuptiActivityGetNextRecord");
746       }
747     } while (1);
748 
749     // report any records dropped from the queue
750     size_t dropped;
751     CHECK_CUPTI_RET_WITH_ERROR(CuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped),
752                                "CuptiActivityGetNumDroppedRecords");
753     if (dropped != 0) {
754       MS_LOG(INFO) << "Dropped " << (unsigned int)dropped << " activity records\n";
755     }
756   }
757 
758   free(buffer);
759 }
760 
__anonf90a38340202(const py::module *m) 761 REGISTER_PYBIND_DEFINE(GPUProfiler_, ([](const py::module *m) {
762                          (void)py::class_<GPUProfiler, std::shared_ptr<GPUProfiler>>(*m, "GPUProfiler")
763                            .def_static("get_instance", &GPUProfiler::GetInstance, "GPUProfiler get_instance.")
764                            .def("init", &GPUProfiler::Init, py::arg("profile_data_path"), "init")
765                            .def("stop", &GPUProfiler::Stop, "stop")
766                            .def("step_profiling_enable", &GPUProfiler::StepProfilingEnable, py::arg("enable_flag"),
767                                 "enable or disable step profiling")
768                            .def("sync_enable", &GPUProfiler::SyncEnable, py::arg("enable_flag"),
769                                 "enable or disable synchronization profiling");
770                        }));
771 }  // namespace gpu
772 }  // namespace profiler
773 }  // namespace mindspore
774