• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2019-2021 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "runtime/device/ascend/profiling/profiling_manager.h"
18 #include <cstdlib>
19 #include <vector>
20 #include "securec/include/securec.h"
21 #include "./prof_mgr_core.h"
22 #include "utils/log_adapter.h"
23 #include "utils/ms_context.h"
24 #include "utils/ms_utils.h"
25 #include "utils/convert_utils.h"
26 #include "runtime/base.h"
27 #include <nlohmann/json.hpp>
28 
29 namespace {
30 constexpr Status PROF_SUCCESS = 0;
31 constexpr Status PROF_FAILED = 0xFFFFFFFF;
32 }  // namespace
33 
34 namespace mindspore {
35 namespace device {
36 namespace ascend {
GetInstance()37 ProfilingManager &ProfilingManager::GetInstance() {
38   static ProfilingManager inst{};
39   return inst;
40 }
41 
ProfilingManager()42 ProfilingManager::ProfilingManager()
43     : device_id_(0), prof_cb_({0}), hccl_enabled_bef_profiling_enabled_(false), has_started_(false) {}
44 
GetJobId() const45 uint64_t ProfilingManager::GetJobId() const { return 0; }
46 
GetProfilingModule()47 uint64_t GetProfilingModule() {
48   return PROF_MODEL_EXECUTE_MASK | PROF_RUNTIME_API_MASK | PROF_RUNTIME_TRACE_MASK | PROF_SCHEDULE_TIMELINE_MASK |
49          PROF_SCHEDULE_TRACE_MASK | PROF_TASK_TIME_MASK | PROF_SUBTASK_TIME_MASK | PROF_AICPU_TRACE_MASK |
50          PROF_AICORE_METRICS_MASK | PROF_AIVECTORCORE_METRICS_MASK | PROF_MODEL_LOAD_MASK;
51 }
52 
PluginInit() const53 Status ProfilingManager::PluginInit() const {
54   if (prof_cb_.msprofReporterCallback == nullptr) {
55     MS_LOG(ERROR) << "MsprofReporterCallback callback is nullptr.";
56     return PROF_FAILED;
57   }
58   int32_t ret = prof_cb_.msprofReporterCallback(static_cast<uint32_t>(MsprofReporterModuleId::MSPROF_MODULE_FRAMEWORK),
59                                                 static_cast<uint32_t>(MsprofReporterCallbackType::MSPROF_REPORTER_INIT),
60                                                 nullptr, 0);
61   if (ret != UintToInt(PROF_SUCCESS)) {
62     MS_LOG(ERROR) << "MsprofReporter init failed, ret: " << ret;
63     return PROF_FAILED;
64   }
65   return PROF_SUCCESS;
66 }
67 
PluginUnInit() const68 void ProfilingManager::PluginUnInit() const {
69   if (prof_cb_.msprofReporterCallback == nullptr) {
70     MS_LOG(ERROR) << "MsprofReporterCallback callback is nullptr.";
71     return;
72   }
73   int32_t cb_ret = prof_cb_.msprofReporterCallback(
74     static_cast<uint32_t>(MsprofReporterModuleId::MSPROF_MODULE_FRAMEWORK),
75     static_cast<uint32_t>(MsprofReporterCallbackType::MSPROF_REPORTER_UNINIT), nullptr, 0);
76   if (cb_ret != 0) {
77     MS_LOG(WARNING) << "profiling plugin uninit failed, ret:%d" << cb_ret;
78   }
79 }
80 
GetProfConf(const NotNull<MsprofGeOptions * > prof)81 Status ProfilingManager::GetProfConf(const NotNull<MsprofGeOptions *> prof) {
82   string job_id = std::to_string(GetJobId());
83   if (memcpy_s(prof->jobId, sizeof(prof->jobId), job_id.c_str(), strlen(job_id.c_str())) != EOK) {
84     MS_LOG(ERROR) << "Copy job_id failed.";
85     return PROF_FAILED;
86   }
87 
88   auto profiler_manager = profiler::ProfilerManager::GetInstance();
89   if (profiler_manager == nullptr) {
90     MS_LOG(ERROR) << "Profiler manager instance is nullptr.";
91     return PROF_FAILED;
92   }
93   const string prof_options_str = profiler_manager->GetProfilingOptions();
94 
95   const nlohmann::json options_all = nlohmann::json::parse(prof_options_str);
96   nlohmann::json options_for_cann;
97   options_for_cann["output"] = options_all["output"];
98   options_for_cann["fp_point"] = options_all["fp_point"];
99   options_for_cann["bp_point"] = options_all["bp_point"];
100   options_for_cann["training_trace"] = options_all["training_trace"];
101   options_for_cann["task_trace"] = options_all["task_trace"];
102   options_for_cann["aic_metrics"] = options_all["aic_metrics"];
103   options_for_cann["aicpu"] = options_all["aicpu"];
104 
105   const string options_for_cann_str = options_for_cann.dump();
106   if (memcpy_s(prof->options, MSPROF_OPTIONS_DEF_LEN_MAX, options_for_cann_str.c_str(), options_for_cann_str.size()) !=
107       EOK) {
108     MS_LOG(ERROR) << "Copy profiling_options failed";
109     return PROF_FAILED;
110   }
111   return PROF_SUCCESS;
112 }
113 
StartupProfiling(uint32_t device_id)114 bool ProfilingManager::StartupProfiling(uint32_t device_id) {
115   if (has_started_) {
116     return true;
117   }
118 
119   auto is_profiling = IsProfiling();
120   if (!is_profiling) {
121     int32_t cb_ret = MsprofInit(0XFF, nullptr, 0);
122     if (cb_ret != UintToInt(PROF_SUCCESS)) {
123       MS_LOG(ERROR) << "Call msprofCtrlCallback failed, ret: " << cb_ret;
124       return false;
125     }
126     MS_LOG(INFO) << "No need profiling. please export PROFILING_MODE and in train mode.";
127     return true;
128   }
129 
130   if (hccl_enabled_bef_profiling_enabled_) {
131     MS_LOG(ERROR)
132       << "Please check the Profiler object initialized before mindspore.context.set_auto_parallel_context() "
133          "and mindspore.communication.management.init(). Profiler should be initialized before these code.";
134     return false;
135   }
136 
137   device_id_ = device_id;
138 
139   struct MsprofGeOptions prof_conf = {0};
140   if (GetProfConf(NOT_NULL(&prof_conf)) != PROF_SUCCESS) {
141     MS_LOG(ERROR) << "Get prof conf failed.";
142     return false;
143   }
144 
145   if (!ProfStartUp(NOT_NULL(&prof_conf))) {
146     MS_LOG(ERROR) << "ProfMgrStartUp failed.";
147     return false;
148   }
149 
150   has_started_ = true;
151 
152   return true;
153 }
154 
ProfStartUp(const NotNull<MsprofGeOptions * > prof_conf) const155 bool ProfilingManager::ProfStartUp(const NotNull<MsprofGeOptions *> prof_conf) const {
156   MS_LOG(INFO) << "Prof start up. ";
157 
158   bool ret = ProfRegisterCtrlCallback();
159   if (ret == false) {
160     return ret;
161   }
162 
163   // call profiling start up api
164   int32_t cb_ret = MsprofInit(static_cast<uint32_t>(MsprofCtrlCallbackType::MSPROF_CTRL_INIT_GE_OPTIONS),
165                               static_cast<void *>(prof_conf.get()), sizeof(MsprofGeOptions));
166   if (cb_ret != UintToInt(PROF_SUCCESS)) {
167     MS_LOG(ERROR) << "Call msprofCtrlCallback failed, ret: " << cb_ret;
168     return false;
169   }
170 
171   MS_LOG(INFO) << "Start up profiling success.";
172   return true;
173 }
174 
ProfRegisterCtrlCallback() const175 bool ProfilingManager::ProfRegisterCtrlCallback() const {
176   rtError_t rt_ret = rtProfRegisterCtrlCallback(GE, CtrlCallbackHandle);
177   if (rt_ret != RT_ERROR_NONE) {
178     MS_LOG(ERROR) << "Call rtProfRegisterCtrlCallback failed.";
179     return false;
180   }
181 
182   return true;
183 }
184 
CtrlCallbackHandle(uint32_t rt_type,void * data,uint32_t)185 rtError_t CtrlCallbackHandle(uint32_t rt_type, void *data, uint32_t /* len */) {
186   if (rt_type == RT_PROF_CTRL_REPORTER) {
187     ProfilingManager::GetInstance().SetMsprofReporterCallback(reinterpret_cast<MsprofReporterCallback>(data));
188     MS_LOG(INFO) << "Set MsprofReporterCallback success.";
189   } else if (rt_type == RT_PROF_CTRL_SWITCH) {
190     Status ret = ProfCtrlSwitchHandle(data);
191     if (ret != PROF_SUCCESS) {
192       MS_LOG(ERROR) << "Start runtime profiler failed.";
193     }
194   }
195 
196   return RT_ERROR_NONE;
197 }
198 
StopProfiling() const199 bool ProfilingManager::StopProfiling() const {
200   MS_LOG(INFO) << "StopProfiling";
201   if (!IsProfiling()) {
202     MS_LOG(INFO) << "No need profiling. please export PROFILING_MODE and in train mode.";
203     return true;
204   }
205 
206   // plugin unregister
207   PluginUnInit();
208 
209   // stop profiling
210   int32_t cb_ret = MsprofFinalize();
211   if (cb_ret != 0) {
212     MS_LOG(WARNING) << "Call MsprofFinalize failed, ret: " << cb_ret;
213     return false;
214   }
215   return true;
216 }
217 
CallMsprofReport(const NotNull<ReporterData * > reporter_data) const218 Status ProfilingManager::CallMsprofReport(const NotNull<ReporterData *> reporter_data) const {
219   if (prof_cb_.msprofReporterCallback == nullptr) {
220     MS_LOG(ERROR) << "MsprofReporterCallback callback is nullptr.";
221     return PROF_FAILED;
222   }
223   int32_t ret =
224     prof_cb_.msprofReporterCallback(static_cast<int32_t>(MsprofReporterModuleId::MSPROF_MODULE_FRAMEWORK),
225                                     static_cast<int32_t>(MsprofReporterCallbackType::MSPROF_REPORTER_REPORT),
226                                     static_cast<void *>(reporter_data.get()), sizeof(ReporterData));
227   if (ret != UintToInt(PROF_SUCCESS)) {
228     MS_LOG(ERROR) << "Call MsprofReporterCallback failed. ret: " << ret;
229     return PROF_FAILED;
230   }
231   return PROF_SUCCESS;
232 }
233 
ProfCtrlSwitchHandle(void * data)234 Status ProfCtrlSwitchHandle(void *data) {
235   if (data == nullptr) {
236     MS_LOG(ERROR) << "Ctrl switch handl data is nullptr.";
237     return PROF_FAILED;
238   }
239 
240   rtProfCommandHandle_t *prof_config_param = reinterpret_cast<rtProfCommandHandle_t *>(data);
241   auto type = static_cast<ProfCommandHandleType>(prof_config_param->type);
242   return ProfCommandHandle(type);
243 }
244 
ProfCommandHandle(ProfCommandHandleType type)245 Status ProfCommandHandle(ProfCommandHandleType type) {
246   MS_LOG(INFO) << "ProfCommandHandle start, type:" << type;
247   if (type == kProfCommandhandleInit) {
248     auto cb_ret = ProfilingManager::GetInstance().PluginInit();
249     if (cb_ret != PROF_SUCCESS) {
250       MS_LOG(ERROR) << "Profiling plugin int failed.";
251       return PROF_FAILED;
252     }
253   }
254 
255   return PROF_SUCCESS;
256 }
257 }  // namespace ascend
258 }  // namespace device
259 }  // namespace mindspore
260