1 /**
2 * Copyright 2019-2021 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "runtime/device/ascend/profiling/profiling_manager.h"
18 #include <cstdlib>
19 #include <vector>
20 #include "securec/include/securec.h"
21 #include "./prof_mgr_core.h"
22 #include "utils/log_adapter.h"
23 #include "utils/ms_context.h"
24 #include "utils/ms_utils.h"
25 #include "utils/convert_utils.h"
26 #include "runtime/base.h"
27 #include <nlohmann/json.hpp>
28
29 namespace {
30 constexpr Status PROF_SUCCESS = 0;
31 constexpr Status PROF_FAILED = 0xFFFFFFFF;
32 } // namespace
33
34 namespace mindspore {
35 namespace device {
36 namespace ascend {
GetInstance()37 ProfilingManager &ProfilingManager::GetInstance() {
38 static ProfilingManager inst{};
39 return inst;
40 }
41
ProfilingManager()42 ProfilingManager::ProfilingManager()
43 : device_id_(0), prof_cb_({0}), hccl_enabled_bef_profiling_enabled_(false), has_started_(false) {}
44
GetJobId() const45 uint64_t ProfilingManager::GetJobId() const { return 0; }
46
GetProfilingModule()47 uint64_t GetProfilingModule() {
48 return PROF_MODEL_EXECUTE_MASK | PROF_RUNTIME_API_MASK | PROF_RUNTIME_TRACE_MASK | PROF_SCHEDULE_TIMELINE_MASK |
49 PROF_SCHEDULE_TRACE_MASK | PROF_TASK_TIME_MASK | PROF_SUBTASK_TIME_MASK | PROF_AICPU_TRACE_MASK |
50 PROF_AICORE_METRICS_MASK | PROF_AIVECTORCORE_METRICS_MASK | PROF_MODEL_LOAD_MASK;
51 }
52
PluginInit() const53 Status ProfilingManager::PluginInit() const {
54 if (prof_cb_.msprofReporterCallback == nullptr) {
55 MS_LOG(ERROR) << "MsprofReporterCallback callback is nullptr.";
56 return PROF_FAILED;
57 }
58 int32_t ret = prof_cb_.msprofReporterCallback(static_cast<uint32_t>(MsprofReporterModuleId::MSPROF_MODULE_FRAMEWORK),
59 static_cast<uint32_t>(MsprofReporterCallbackType::MSPROF_REPORTER_INIT),
60 nullptr, 0);
61 if (ret != UintToInt(PROF_SUCCESS)) {
62 MS_LOG(ERROR) << "MsprofReporter init failed, ret: " << ret;
63 return PROF_FAILED;
64 }
65 return PROF_SUCCESS;
66 }
67
PluginUnInit() const68 void ProfilingManager::PluginUnInit() const {
69 if (prof_cb_.msprofReporterCallback == nullptr) {
70 MS_LOG(ERROR) << "MsprofReporterCallback callback is nullptr.";
71 return;
72 }
73 int32_t cb_ret = prof_cb_.msprofReporterCallback(
74 static_cast<uint32_t>(MsprofReporterModuleId::MSPROF_MODULE_FRAMEWORK),
75 static_cast<uint32_t>(MsprofReporterCallbackType::MSPROF_REPORTER_UNINIT), nullptr, 0);
76 if (cb_ret != 0) {
77 MS_LOG(WARNING) << "profiling plugin uninit failed, ret:%d" << cb_ret;
78 }
79 }
80
GetProfConf(const NotNull<MsprofGeOptions * > prof)81 Status ProfilingManager::GetProfConf(const NotNull<MsprofGeOptions *> prof) {
82 string job_id = std::to_string(GetJobId());
83 if (memcpy_s(prof->jobId, sizeof(prof->jobId), job_id.c_str(), strlen(job_id.c_str())) != EOK) {
84 MS_LOG(ERROR) << "Copy job_id failed.";
85 return PROF_FAILED;
86 }
87
88 auto profiler_manager = profiler::ProfilerManager::GetInstance();
89 if (profiler_manager == nullptr) {
90 MS_LOG(ERROR) << "Profiler manager instance is nullptr.";
91 return PROF_FAILED;
92 }
93 const string prof_options_str = profiler_manager->GetProfilingOptions();
94
95 const nlohmann::json options_all = nlohmann::json::parse(prof_options_str);
96 nlohmann::json options_for_cann;
97 options_for_cann["output"] = options_all["output"];
98 options_for_cann["fp_point"] = options_all["fp_point"];
99 options_for_cann["bp_point"] = options_all["bp_point"];
100 options_for_cann["training_trace"] = options_all["training_trace"];
101 options_for_cann["task_trace"] = options_all["task_trace"];
102 options_for_cann["aic_metrics"] = options_all["aic_metrics"];
103 options_for_cann["aicpu"] = options_all["aicpu"];
104
105 const string options_for_cann_str = options_for_cann.dump();
106 if (memcpy_s(prof->options, MSPROF_OPTIONS_DEF_LEN_MAX, options_for_cann_str.c_str(), options_for_cann_str.size()) !=
107 EOK) {
108 MS_LOG(ERROR) << "Copy profiling_options failed";
109 return PROF_FAILED;
110 }
111 return PROF_SUCCESS;
112 }
113
StartupProfiling(uint32_t device_id)114 bool ProfilingManager::StartupProfiling(uint32_t device_id) {
115 if (has_started_) {
116 return true;
117 }
118
119 auto is_profiling = IsProfiling();
120 if (!is_profiling) {
121 int32_t cb_ret = MsprofInit(0XFF, nullptr, 0);
122 if (cb_ret != UintToInt(PROF_SUCCESS)) {
123 MS_LOG(ERROR) << "Call msprofCtrlCallback failed, ret: " << cb_ret;
124 return false;
125 }
126 MS_LOG(INFO) << "No need profiling. please export PROFILING_MODE and in train mode.";
127 return true;
128 }
129
130 if (hccl_enabled_bef_profiling_enabled_) {
131 MS_LOG(ERROR)
132 << "Please check the Profiler object initialized before mindspore.context.set_auto_parallel_context() "
133 "and mindspore.communication.management.init(). Profiler should be initialized before these code.";
134 return false;
135 }
136
137 device_id_ = device_id;
138
139 struct MsprofGeOptions prof_conf = {0};
140 if (GetProfConf(NOT_NULL(&prof_conf)) != PROF_SUCCESS) {
141 MS_LOG(ERROR) << "Get prof conf failed.";
142 return false;
143 }
144
145 if (!ProfStartUp(NOT_NULL(&prof_conf))) {
146 MS_LOG(ERROR) << "ProfMgrStartUp failed.";
147 return false;
148 }
149
150 has_started_ = true;
151
152 return true;
153 }
154
ProfStartUp(const NotNull<MsprofGeOptions * > prof_conf) const155 bool ProfilingManager::ProfStartUp(const NotNull<MsprofGeOptions *> prof_conf) const {
156 MS_LOG(INFO) << "Prof start up. ";
157
158 bool ret = ProfRegisterCtrlCallback();
159 if (ret == false) {
160 return ret;
161 }
162
163 // call profiling start up api
164 int32_t cb_ret = MsprofInit(static_cast<uint32_t>(MsprofCtrlCallbackType::MSPROF_CTRL_INIT_GE_OPTIONS),
165 static_cast<void *>(prof_conf.get()), sizeof(MsprofGeOptions));
166 if (cb_ret != UintToInt(PROF_SUCCESS)) {
167 MS_LOG(ERROR) << "Call msprofCtrlCallback failed, ret: " << cb_ret;
168 return false;
169 }
170
171 MS_LOG(INFO) << "Start up profiling success.";
172 return true;
173 }
174
ProfRegisterCtrlCallback() const175 bool ProfilingManager::ProfRegisterCtrlCallback() const {
176 rtError_t rt_ret = rtProfRegisterCtrlCallback(GE, CtrlCallbackHandle);
177 if (rt_ret != RT_ERROR_NONE) {
178 MS_LOG(ERROR) << "Call rtProfRegisterCtrlCallback failed.";
179 return false;
180 }
181
182 return true;
183 }
184
CtrlCallbackHandle(uint32_t rt_type,void * data,uint32_t)185 rtError_t CtrlCallbackHandle(uint32_t rt_type, void *data, uint32_t /* len */) {
186 if (rt_type == RT_PROF_CTRL_REPORTER) {
187 ProfilingManager::GetInstance().SetMsprofReporterCallback(reinterpret_cast<MsprofReporterCallback>(data));
188 MS_LOG(INFO) << "Set MsprofReporterCallback success.";
189 } else if (rt_type == RT_PROF_CTRL_SWITCH) {
190 Status ret = ProfCtrlSwitchHandle(data);
191 if (ret != PROF_SUCCESS) {
192 MS_LOG(ERROR) << "Start runtime profiler failed.";
193 }
194 }
195
196 return RT_ERROR_NONE;
197 }
198
StopProfiling() const199 bool ProfilingManager::StopProfiling() const {
200 MS_LOG(INFO) << "StopProfiling";
201 if (!IsProfiling()) {
202 MS_LOG(INFO) << "No need profiling. please export PROFILING_MODE and in train mode.";
203 return true;
204 }
205
206 // plugin unregister
207 PluginUnInit();
208
209 // stop profiling
210 int32_t cb_ret = MsprofFinalize();
211 if (cb_ret != 0) {
212 MS_LOG(WARNING) << "Call MsprofFinalize failed, ret: " << cb_ret;
213 return false;
214 }
215 return true;
216 }
217
CallMsprofReport(const NotNull<ReporterData * > reporter_data) const218 Status ProfilingManager::CallMsprofReport(const NotNull<ReporterData *> reporter_data) const {
219 if (prof_cb_.msprofReporterCallback == nullptr) {
220 MS_LOG(ERROR) << "MsprofReporterCallback callback is nullptr.";
221 return PROF_FAILED;
222 }
223 int32_t ret =
224 prof_cb_.msprofReporterCallback(static_cast<int32_t>(MsprofReporterModuleId::MSPROF_MODULE_FRAMEWORK),
225 static_cast<int32_t>(MsprofReporterCallbackType::MSPROF_REPORTER_REPORT),
226 static_cast<void *>(reporter_data.get()), sizeof(ReporterData));
227 if (ret != UintToInt(PROF_SUCCESS)) {
228 MS_LOG(ERROR) << "Call MsprofReporterCallback failed. ret: " << ret;
229 return PROF_FAILED;
230 }
231 return PROF_SUCCESS;
232 }
233
ProfCtrlSwitchHandle(void * data)234 Status ProfCtrlSwitchHandle(void *data) {
235 if (data == nullptr) {
236 MS_LOG(ERROR) << "Ctrl switch handl data is nullptr.";
237 return PROF_FAILED;
238 }
239
240 rtProfCommandHandle_t *prof_config_param = reinterpret_cast<rtProfCommandHandle_t *>(data);
241 auto type = static_cast<ProfCommandHandleType>(prof_config_param->type);
242 return ProfCommandHandle(type);
243 }
244
ProfCommandHandle(ProfCommandHandleType type)245 Status ProfCommandHandle(ProfCommandHandleType type) {
246 MS_LOG(INFO) << "ProfCommandHandle start, type:" << type;
247 if (type == kProfCommandhandleInit) {
248 auto cb_ret = ProfilingManager::GetInstance().PluginInit();
249 if (cb_ret != PROF_SUCCESS) {
250 MS_LOG(ERROR) << "Profiling plugin int failed.";
251 return PROF_FAILED;
252 }
253 }
254
255 return PROF_SUCCESS;
256 }
257 } // namespace ascend
258 } // namespace device
259 } // namespace mindspore
260