1 /**
2 * Copyright 2022 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "plugin/device/ascend/hal/common/ascend_utils.h"
18 #include <vector>
19 #include <string>
20 #include <map>
21 #include <fstream>
22 #include "utils/dlopen_macro.h"
23 #include "acl/error_codes/rt_error_codes.h"
24 #include "transform/symbol/acl_base_symbol.h"
25 #include "transform/symbol/acl_rt_symbol.h"
26 #include "transform/symbol/acl_symbol.h"
27 #include "transform/symbol/symbol_utils.h"
28 #include "include/common/debug/common.h"
29
30 namespace mindspore {
31 namespace device {
32 namespace ascend {
33 namespace {
34 const std::map<uint32_t, std::string> error_msg = {
35 {ACL_RT_SUCCESS, "success"},
36 {ACL_ERROR_RT_PARAM_INVALID, "param invalid"},
37 {ACL_ERROR_RT_INVALID_DEVICEID, "invalid device id"},
38 {ACL_ERROR_RT_CONTEXT_NULL, "current context null"},
39 {ACL_ERROR_RT_STREAM_CONTEXT, "stream not in current context"},
40 {ACL_ERROR_RT_MODEL_CONTEXT, "model not in current context"},
41 {ACL_ERROR_RT_STREAM_MODEL, "stream not in model"},
42 {ACL_ERROR_RT_EVENT_TIMESTAMP_INVALID, "event timestamp invalid"},
43 {ACL_ERROR_RT_EVENT_TIMESTAMP_REVERSAL, " event timestamp reversal"},
44 {ACL_ERROR_RT_ADDR_UNALIGNED, "memory address unaligned"},
45 {ACL_ERROR_RT_FILE_OPEN, "open file failed"},
46 {ACL_ERROR_RT_FILE_WRITE, "write file failed"},
47 {ACL_ERROR_RT_STREAM_SUBSCRIBE, "error subscribe stream"},
48 {ACL_ERROR_RT_THREAD_SUBSCRIBE, "error subscribe thread"},
49 {ACL_ERROR_RT_GROUP_NOT_SET, "group not set"},
50 {ACL_ERROR_RT_GROUP_NOT_CREATE, "group not create"},
51 {ACL_ERROR_RT_STREAM_NO_CB_REG, "callback not register to stream"},
52 {ACL_ERROR_RT_INVALID_MEMORY_TYPE, "invalid memory type"},
53 {ACL_ERROR_RT_INVALID_HANDLE, "invalid handle"},
54 {ACL_ERROR_RT_INVALID_MALLOC_TYPE, "invalid malloc type"},
55 {ACL_ERROR_RT_FEATURE_NOT_SUPPORT, "feature not support"},
56 {ACL_ERROR_RT_MEMORY_ALLOCATION, "memory allocation error"},
57 {ACL_ERROR_RT_MEMORY_FREE, "memory free error"},
58 {ACL_ERROR_RT_AICORE_OVER_FLOW, "aicore over flow"},
59 {ACL_ERROR_RT_NO_DEVICE, "no device"},
60 {ACL_ERROR_RT_RESOURCE_ALLOC_FAIL, "resource alloc fail"},
61 {ACL_ERROR_RT_NO_PERMISSION, "no permission"},
62 {ACL_ERROR_RT_NO_EVENT_RESOURCE, "no event resource"},
63 {ACL_ERROR_RT_NO_STREAM_RESOURCE, "no stream resource"},
64 {ACL_ERROR_RT_NO_NOTIFY_RESOURCE, "no notify resource"},
65 {ACL_ERROR_RT_NO_MODEL_RESOURCE, "no model resource"},
66 {ACL_ERROR_RT_INTERNAL_ERROR, "runtime internal error"},
67 {ACL_ERROR_RT_TS_ERROR, "ts internal error"},
68 {ACL_ERROR_RT_STREAM_TASK_FULL, "task full in stream"},
69 {ACL_ERROR_RT_STREAM_TASK_EMPTY, " task empty in stream"},
70 {ACL_ERROR_RT_STREAM_NOT_COMPLETE, "stream not complete"},
71 {ACL_ERROR_RT_END_OF_SEQUENCE, "end of sequence"},
72 {ACL_ERROR_RT_EVENT_NOT_COMPLETE, "event not complete"},
73 {ACL_ERROR_RT_CONTEXT_RELEASE_ERROR, "context release error"},
74 {ACL_ERROR_RT_SOC_VERSION, "soc version error"},
75 {ACL_ERROR_RT_TASK_TYPE_NOT_SUPPORT, "task type not support"},
76 {ACL_ERROR_RT_LOST_HEARTBEAT, "ts lost heartbeat"},
77 {ACL_ERROR_RT_MODEL_EXECUTE, " model execute failed"},
78 {ACL_ERROR_RT_REPORT_TIMEOUT, "report timeout"},
79 {ACL_ERROR_RT_SYS_DMA, "sys dma error"},
80 {ACL_ERROR_RT_AICORE_TIMEOUT, "aicore timeout"},
81 {ACL_ERROR_RT_AICORE_EXCEPTION, "aicore exception"},
82 {ACL_ERROR_RT_AICORE_TRAP_EXCEPTION, " aicore trap exception"},
83 {ACL_ERROR_RT_AICPU_TIMEOUT, " aicpu timeout"},
84 {ACL_ERROR_RT_AICPU_EXCEPTION, "aicpu exception"},
85 {ACL_ERROR_RT_AICPU_DATADUMP_RSP_ERR, " aicpu datadump response error"},
86 {ACL_ERROR_RT_AICPU_MODEL_RSP_ERR, "aicpu model operate response error"},
87 {ACL_ERROR_RT_PROFILING_ERROR, "profiling error"},
88 {ACL_ERROR_RT_IPC_ERROR, "ipc error"},
89 {ACL_ERROR_RT_MODEL_ABORT_NORMAL, "model abort normal"},
90 {ACL_ERROR_RT_KERNEL_UNREGISTERING, "kernel unregistering"},
91 {ACL_ERROR_RT_RINGBUFFER_NOT_INIT, "ringbuffer not init"},
92 {ACL_ERROR_RT_RINGBUFFER_NO_DATA, "ringbuffer no data"},
93 {ACL_ERROR_RT_KERNEL_LOOKUP, "kernel lookup error"},
94 {ACL_ERROR_RT_KERNEL_DUPLICATE, "kernel register duplicate"},
95 {ACL_ERROR_RT_DEBUG_REGISTER_FAIL, "debug register failed"},
96 {ACL_ERROR_RT_DEBUG_UNREGISTER_FAIL, "debug unregister failed"},
97 {ACL_ERROR_RT_LABEL_CONTEXT, "label not in current context"},
98 {ACL_ERROR_RT_PROGRAM_USE_OUT, "program register num use out"},
99 {ACL_ERROR_RT_DEV_SETUP_ERROR, "device setup error"},
100 {ACL_ERROR_RT_DRV_INTERNAL_ERROR, "drv internal error"},
101 };
102
// Marker text looked for inside the most recent ACL error message; GetErrorMessage()
// returns an empty string when the fetched message contains it.
constexpr const char *kUnknowErrorString = "Unknown error occurred";

// Set to true by InitializeAcl() after its first pass; read and written only while
// g_acl_init_mutex is held.
bool g_acl_initialized{false};
// Serializes InitializeAcl() so the aclInit sequence is not run concurrently.
std::mutex g_acl_init_mutex;
107 } // namespace
108
109 std::mutex ErrorManagerAdapter::initialized_mutex_;
110 bool ErrorManagerAdapter::initialized_ = false;
111
Init()112 bool ErrorManagerAdapter::Init() {
113 std::unique_lock<std::mutex> lock(initialized_mutex_);
114 if (initialized_) {
115 MS_LOG(DEBUG) << "Ascend error manager has been initialized.";
116 return true;
117 }
118 LogWriter::SetMessageHandler(&MessageHandler);
119 initialized_ = true;
120 return true;
121 }
122
GetErrorMessage(bool add_title)123 std::string ErrorManagerAdapter::GetErrorMessage(bool add_title) {
124 int32_t device_id;
125 if (CALL_ASCEND_API(aclrtGetDevice, &device_id) != ACL_SUCCESS) {
126 MS_LOG(INFO) << "The device is not set yet, no need to fetch error from device.";
127 return "";
128 }
129 const char *message = CALL_ASCEND_API(aclGetRecentErrMsg);
130 const string error_message = message == nullptr ? "" : message;
131 if (error_message.empty() || error_message.find(kUnknowErrorString) != string::npos) {
132 return "";
133 }
134 if (add_title) {
135 return "#umsg#Ascend Error Message:#umsg#" + error_message +
136 "\n(Please search \"CANN Common Error Analysis\" at https://www.mindspore.cn for error code description)";
137 }
138 return error_message;
139 }
140
MessageHandler(std::ostringstream * oss)141 void ErrorManagerAdapter::MessageHandler(std::ostringstream *oss) {
142 const auto &error_message = GetErrorMessage(true);
143 if (!error_message.empty()) {
144 *oss << error_message;
145 }
146 }
147
GetErrorMsg(uint32_t rt_error_code)148 std::string GetErrorMsg(uint32_t rt_error_code) {
149 auto find_iter = error_msg.find(rt_error_code);
150 if (find_iter == error_msg.end()) {
151 return "Return error code unknown, ret code: " + std::to_string(rt_error_code);
152 }
153 return find_iter->second;
154 }
155
/// pthread entry point for the callback processing thread.
/// Enables asynchronous cancellation for the calling thread. When built with WITH_BACKEND it
/// then keeps calling aclrtProcessReport until the CallbackThread's flag_ becomes false or an
/// exception is caught.
/// @param data Pointer to the owning CallbackThread (only dereferenced under WITH_BACKEND).
/// @return The same pointer that was passed in.
void *callback_thread_func(void *data) {
  pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, nullptr);
  pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, nullptr);
#ifdef WITH_BACKEND
  auto *thread_ctx = static_cast<CallbackThread *>(data);
  while (thread_ctx->flag_.load()) {
    try {
      const auto report_ret = CALL_ASCEND_API(aclrtProcessReport, thread_ctx->default_timeout_);
      // The two timeout codes are not logged as errors; the loop simply polls again.
      const bool is_timeout =
        report_ret == ACL_ERROR_WAIT_CALLBACK_TIMEOUT || report_ret == ACL_ERROR_RT_REPORT_TIMEOUT;
      if (report_ret != 0 && !is_timeout) {
        MS_LOG(DEBUG) << "aclrtProcessReport err : " << report_ret << ".";
      }
    } catch (const std::exception &ex) {
      MS_LOG(ERROR) << "aclrtProcessReport exception : " << ex.what() << ".";
      break;
    }
  }
  MS_LOG(INFO) << "Exit callback thread loop.";
#endif
  return data;
}
176
177 namespace {
GenerateAclInitJson(const string & json_file_path)178 bool GenerateAclInitJson(const string &json_file_path) {
179 nlohmann::json acl_init_json;
180 // generate err_msg_mode
181 acl_init_json["err_msg_mode"] = "1";
182
183 // write to file
184 std::string json_file_str = acl_init_json.dump();
185 std::ofstream json_file(json_file_path);
186 if (!json_file.is_open()) {
187 MS_LOG(WARNING) << "Open file [" << json_file_path << "] failed!";
188 return False;
189 }
190 json_file << json_file_str;
191 json_file.close();
192 MS_LOG(INFO) << "Generate aclInit json to file : " << json_file_path;
193 return True;
194 }
195 } // namespace
196
InitializeAcl()197 void InitializeAcl() {
198 std::lock_guard<std::mutex> lock(g_acl_init_mutex);
199 if (g_acl_initialized) {
200 return;
201 }
202
203 const char *acl_json_path = nullptr;
204
205 std::string file_name = "./aclinit.json";
206 auto realpath = Common::CreatePrefixPath(file_name);
207 if (realpath.has_value()) {
208 if (GenerateAclInitJson(realpath.value())) {
209 acl_json_path = realpath.value().c_str();
210 }
211 } else {
212 MS_LOG(WARNING) << "Failed to get real path: [" << file_name << "] in generate aclInit json file path.";
213 }
214
215 if (CALL_ASCEND_API(aclInit, acl_json_path) != ACL_ERROR_NONE) {
216 MS_LOG(WARNING) << "Call aclInit failed, acl data dump function will be unusable.";
217 } else {
218 MS_LOG(INFO) << "Call aclInit successfully";
219 }
220 g_acl_initialized = true;
221 }
222
GetFormatMode()223 std::string GetFormatMode() {
224 auto format_mode = common::GetEnv("MS_FORMAT_MODE");
225 if (format_mode.empty()) {
226 // default set "0" for 910a graph sink, otherwise "1"
227 auto ms_context = MsContext::GetInstance();
228 MS_EXCEPTION_IF_NULL(ms_context);
229 if (ms_context->ascend_soc_version() == "ascend910" && ms_context->get_param<bool>(MS_CTX_IS_MULTI_GRAPH_SINK) &&
230 ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) != kPynativeMode) {
231 format_mode = "0";
232 } else {
233 format_mode = "1";
234 }
235 }
236 return format_mode;
237 }
238 } // namespace ascend
239 } // namespace device
240 } // namespace mindspore
241