• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2022 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "plugin/device/ascend/hal/common/ascend_utils.h"
18 #include <vector>
19 #include <string>
20 #include <map>
21 #include <fstream>
22 #include "utils/dlopen_macro.h"
23 #include "acl/error_codes/rt_error_codes.h"
24 #include "transform/symbol/acl_base_symbol.h"
25 #include "transform/symbol/acl_rt_symbol.h"
26 #include "transform/symbol/acl_symbol.h"
27 #include "transform/symbol/symbol_utils.h"
28 #include "include/common/debug/common.h"
29 
30 namespace mindspore {
31 namespace device {
32 namespace ascend {
33 namespace {
// Lookup table from runtime error code (the ACL_ERROR_RT_* constants) to a short
// human-readable description. GetErrorMsg() consults this table and falls back to a
// generic "unknown" text for codes that are not listed here.
// NOTE: a few descriptions start with a space; they are runtime text and are kept verbatim.
const std::map<uint32_t, std::string> error_msg = {
  {ACL_RT_SUCCESS, "success"},
  {ACL_ERROR_RT_PARAM_INVALID, "param invalid"},
  {ACL_ERROR_RT_INVALID_DEVICEID, "invalid device id"},
  {ACL_ERROR_RT_CONTEXT_NULL, "current context null"},
  {ACL_ERROR_RT_STREAM_CONTEXT, "stream not in current context"},
  {ACL_ERROR_RT_MODEL_CONTEXT, "model not in current context"},
  {ACL_ERROR_RT_STREAM_MODEL, "stream not in model"},
  {ACL_ERROR_RT_EVENT_TIMESTAMP_INVALID, "event timestamp invalid"},
  {ACL_ERROR_RT_EVENT_TIMESTAMP_REVERSAL, " event timestamp reversal"},
  {ACL_ERROR_RT_ADDR_UNALIGNED, "memory address unaligned"},
  {ACL_ERROR_RT_FILE_OPEN, "open file failed"},
  {ACL_ERROR_RT_FILE_WRITE, "write file failed"},
  {ACL_ERROR_RT_STREAM_SUBSCRIBE, "error subscribe stream"},
  {ACL_ERROR_RT_THREAD_SUBSCRIBE, "error subscribe thread"},
  {ACL_ERROR_RT_GROUP_NOT_SET, "group not set"},
  {ACL_ERROR_RT_GROUP_NOT_CREATE, "group not create"},
  {ACL_ERROR_RT_STREAM_NO_CB_REG, "callback not register to stream"},
  {ACL_ERROR_RT_INVALID_MEMORY_TYPE, "invalid memory type"},
  {ACL_ERROR_RT_INVALID_HANDLE, "invalid handle"},
  {ACL_ERROR_RT_INVALID_MALLOC_TYPE, "invalid malloc type"},
  {ACL_ERROR_RT_FEATURE_NOT_SUPPORT, "feature not support"},
  {ACL_ERROR_RT_MEMORY_ALLOCATION, "memory allocation error"},
  {ACL_ERROR_RT_MEMORY_FREE, "memory free error"},
  {ACL_ERROR_RT_AICORE_OVER_FLOW, "aicore over flow"},
  {ACL_ERROR_RT_NO_DEVICE, "no device"},
  {ACL_ERROR_RT_RESOURCE_ALLOC_FAIL, "resource alloc fail"},
  {ACL_ERROR_RT_NO_PERMISSION, "no permission"},
  {ACL_ERROR_RT_NO_EVENT_RESOURCE, "no event resource"},
  {ACL_ERROR_RT_NO_STREAM_RESOURCE, "no stream resource"},
  {ACL_ERROR_RT_NO_NOTIFY_RESOURCE, "no notify resource"},
  {ACL_ERROR_RT_NO_MODEL_RESOURCE, "no model resource"},
  {ACL_ERROR_RT_INTERNAL_ERROR, "runtime internal error"},
  {ACL_ERROR_RT_TS_ERROR, "ts internal error"},
  {ACL_ERROR_RT_STREAM_TASK_FULL, "task full in stream"},
  {ACL_ERROR_RT_STREAM_TASK_EMPTY, " task empty in stream"},
  {ACL_ERROR_RT_STREAM_NOT_COMPLETE, "stream not complete"},
  {ACL_ERROR_RT_END_OF_SEQUENCE, "end of sequence"},
  {ACL_ERROR_RT_EVENT_NOT_COMPLETE, "event not complete"},
  {ACL_ERROR_RT_CONTEXT_RELEASE_ERROR, "context release error"},
  {ACL_ERROR_RT_SOC_VERSION, "soc version error"},
  {ACL_ERROR_RT_TASK_TYPE_NOT_SUPPORT, "task type not support"},
  {ACL_ERROR_RT_LOST_HEARTBEAT, "ts lost heartbeat"},
  {ACL_ERROR_RT_MODEL_EXECUTE, " model execute failed"},
  {ACL_ERROR_RT_REPORT_TIMEOUT, "report timeout"},
  {ACL_ERROR_RT_SYS_DMA, "sys dma error"},
  {ACL_ERROR_RT_AICORE_TIMEOUT, "aicore timeout"},
  {ACL_ERROR_RT_AICORE_EXCEPTION, "aicore exception"},
  {ACL_ERROR_RT_AICORE_TRAP_EXCEPTION, " aicore trap exception"},
  {ACL_ERROR_RT_AICPU_TIMEOUT, " aicpu timeout"},
  {ACL_ERROR_RT_AICPU_EXCEPTION, "aicpu exception"},
  {ACL_ERROR_RT_AICPU_DATADUMP_RSP_ERR, " aicpu datadump response error"},
  {ACL_ERROR_RT_AICPU_MODEL_RSP_ERR, "aicpu model operate response error"},
  {ACL_ERROR_RT_PROFILING_ERROR, "profiling error"},
  {ACL_ERROR_RT_IPC_ERROR, "ipc error"},
  {ACL_ERROR_RT_MODEL_ABORT_NORMAL, "model abort normal"},
  {ACL_ERROR_RT_KERNEL_UNREGISTERING, "kernel unregistering"},
  {ACL_ERROR_RT_RINGBUFFER_NOT_INIT, "ringbuffer not init"},
  {ACL_ERROR_RT_RINGBUFFER_NO_DATA, "ringbuffer no data"},
  {ACL_ERROR_RT_KERNEL_LOOKUP, "kernel lookup error"},
  {ACL_ERROR_RT_KERNEL_DUPLICATE, "kernel register duplicate"},
  {ACL_ERROR_RT_DEBUG_REGISTER_FAIL, "debug register failed"},
  {ACL_ERROR_RT_DEBUG_UNREGISTER_FAIL, "debug unregister failed"},
  {ACL_ERROR_RT_LABEL_CONTEXT, "label not in current context"},
  {ACL_ERROR_RT_PROGRAM_USE_OUT, "program register num use out"},
  {ACL_ERROR_RT_DEV_SETUP_ERROR, "device setup error"},
  {ACL_ERROR_RT_DRV_INTERNAL_ERROR, "drv internal error"},
};
102 
// Marker text searched for by ErrorManagerAdapter::GetErrorMessage(); a recent error
// message containing it is treated as carrying no useful information and is dropped.
constexpr auto kUnknowErrorString = "Unknown error occurred";

// Set to true by InitializeAcl() after its first pass so later calls return immediately.
bool g_acl_initialized = false;
// Held by InitializeAcl() for its whole body; serialises access to g_acl_initialized.
std::mutex g_acl_init_mutex;
107 }  // namespace
108 
// Locked by ErrorManagerAdapter::Init() while it inspects and updates initialized_.
std::mutex ErrorManagerAdapter::initialized_mutex_;
// Becomes true once Init() has registered MessageHandler with the log writer.
bool ErrorManagerAdapter::initialized_ = false;
111 
Init()112 bool ErrorManagerAdapter::Init() {
113   std::unique_lock<std::mutex> lock(initialized_mutex_);
114   if (initialized_) {
115     MS_LOG(DEBUG) << "Ascend error manager has been initialized.";
116     return true;
117   }
118   LogWriter::SetMessageHandler(&MessageHandler);
119   initialized_ = true;
120   return true;
121 }
122 
GetErrorMessage(bool add_title)123 std::string ErrorManagerAdapter::GetErrorMessage(bool add_title) {
124   int32_t device_id;
125   if (CALL_ASCEND_API(aclrtGetDevice, &device_id) != ACL_SUCCESS) {
126     MS_LOG(INFO) << "The device is not set yet, no need to fetch error from device.";
127     return "";
128   }
129   const char *message = CALL_ASCEND_API(aclGetRecentErrMsg);
130   const string error_message = message == nullptr ? "" : message;
131   if (error_message.empty() || error_message.find(kUnknowErrorString) != string::npos) {
132     return "";
133   }
134   if (add_title) {
135     return "#umsg#Ascend Error Message:#umsg#" + error_message +
136            "\n(Please search \"CANN Common Error Analysis\" at https://www.mindspore.cn for error code description)";
137   }
138   return error_message;
139 }
140 
MessageHandler(std::ostringstream * oss)141 void ErrorManagerAdapter::MessageHandler(std::ostringstream *oss) {
142   const auto &error_message = GetErrorMessage(true);
143   if (!error_message.empty()) {
144     *oss << error_message;
145   }
146 }
147 
GetErrorMsg(uint32_t rt_error_code)148 std::string GetErrorMsg(uint32_t rt_error_code) {
149   auto find_iter = error_msg.find(rt_error_code);
150   if (find_iter == error_msg.end()) {
151     return "Return error code unknown, ret code: " + std::to_string(rt_error_code);
152   }
153   return find_iter->second;
154 }
155 
// Thread entry point that keeps calling aclrtProcessReport while the owning
// CallbackThread's flag_ is set.
//
// data: pointer to the CallbackThread that owns this thread; returned unchanged.
// The thread enables asynchronous cancellation before entering the loop. The loop body
// is only compiled when WITH_BACKEND is defined.
void *callback_thread_func(void *data) {
  pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, nullptr);
  pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, nullptr);
#ifdef WITH_BACKEND
  auto *const thread_ctx = static_cast<CallbackThread *>(data);
  for (;;) {
    if (!thread_ctx->flag_.load()) {
      break;
    }
    try {
      const auto status = CALL_ASCEND_API(aclrtProcessReport, thread_ctx->default_timeout_);
      // Timeouts are the expected outcome when no report is pending; only other
      // non-zero results are worth logging.
      const bool is_timeout = (status == ACL_ERROR_WAIT_CALLBACK_TIMEOUT) || (status == ACL_ERROR_RT_REPORT_TIMEOUT);
      if (status != 0 && !is_timeout) {
        MS_LOG(DEBUG) << "aclrtProcessReport err : " << status << ".";
      }
    } catch (const std::exception &ex) {
      // An exception terminates the loop instead of being retried.
      MS_LOG(ERROR) << "aclrtProcessReport exception : " << ex.what() << ".";
      break;
    }
  }
  MS_LOG(INFO) << "Exit callback thread loop.";
#endif
  return data;
}
176 
177 namespace {
GenerateAclInitJson(const string & json_file_path)178 bool GenerateAclInitJson(const string &json_file_path) {
179   nlohmann::json acl_init_json;
180   // generate err_msg_mode
181   acl_init_json["err_msg_mode"] = "1";
182 
183   // write to file
184   std::string json_file_str = acl_init_json.dump();
185   std::ofstream json_file(json_file_path);
186   if (!json_file.is_open()) {
187     MS_LOG(WARNING) << "Open file [" << json_file_path << "] failed!";
188     return False;
189   }
190   json_file << json_file_str;
191   json_file.close();
192   MS_LOG(INFO) << "Generate aclInit json to file : " << json_file_path;
193   return True;
194 }
195 }  // namespace
196 
InitializeAcl()197 void InitializeAcl() {
198   std::lock_guard<std::mutex> lock(g_acl_init_mutex);
199   if (g_acl_initialized) {
200     return;
201   }
202 
203   const char *acl_json_path = nullptr;
204 
205   std::string file_name = "./aclinit.json";
206   auto realpath = Common::CreatePrefixPath(file_name);
207   if (realpath.has_value()) {
208     if (GenerateAclInitJson(realpath.value())) {
209       acl_json_path = realpath.value().c_str();
210     }
211   } else {
212     MS_LOG(WARNING) << "Failed to get real path: [" << file_name << "] in generate aclInit json file path.";
213   }
214 
215   if (CALL_ASCEND_API(aclInit, acl_json_path) != ACL_ERROR_NONE) {
216     MS_LOG(WARNING) << "Call aclInit failed, acl data dump function will be unusable.";
217   } else {
218     MS_LOG(INFO) << "Call aclInit successfully";
219   }
220   g_acl_initialized = true;
221 }
222 
GetFormatMode()223 std::string GetFormatMode() {
224   auto format_mode = common::GetEnv("MS_FORMAT_MODE");
225   if (format_mode.empty()) {
226     // default set "0" for 910a graph sink, otherwise "1"
227     auto ms_context = MsContext::GetInstance();
228     MS_EXCEPTION_IF_NULL(ms_context);
229     if (ms_context->ascend_soc_version() == "ascend910" && ms_context->get_param<bool>(MS_CTX_IS_MULTI_GRAPH_SINK) &&
230         ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) != kPynativeMode) {
231       format_mode = "0";
232     } else {
233       format_mode = "1";
234     }
235   }
236   return format_mode;
237 }
238 }  // namespace ascend
239 }  // namespace device
240 }  // namespace mindspore
241