1 /**
2 * Copyright 2021 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "backend/kernel_compiler/tbe/ascend_kernel_compile.h"
18 #include <sys/syscall.h>
19 #include <unistd.h>
20 #include <algorithm>
21 #include <map>
22 #include <memory>
23 #include <string>
24 #include <unordered_map>
25 #include <unordered_set>
26 #include <vector>
27 #include "mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_json/tbe_json_creator.h"
28 #include "mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_json/single_tbe_json_creator.h"
29 #include "mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_json/fusion_tbe_json_creator.h"
30 #include "backend/kernel_compiler/tbe/tbe_utils.h"
31 #include "backend/kernel_compiler/tbe/tbe_convert_utils.h"
32 #include "backend/session/anf_runtime_algorithm.h"
33 #include "common/util/error_manager/error_manager.h"
34 #include "debug/anf_ir_dump.h"
35 #include "frontend/operator/ops.h"
36 #include "utils/ms_context.h"
37 #include "utils/ms_utils.h"
38 #include "utils/trace_base.h"
39 #include "utils/utils.h"
40 #include "utils/json_operation_utils.h"
41
42 namespace mindspore {
43 namespace kernel {
44 namespace ascend {
using mindspore::kernel::tbe::TbeAdapter;
using mindspore::kernel::tbe::TbeUtils;
const int indent = 4; // for dump json
// ASCII codes of the characters '0' and '9'; IsDigit() uses them as the bounds of the digit range.
const int kAscii_0 = 48;
const int kAscii_9 = 57;
// Default value of, and upper bound for, the MS_BUILD_PROCESS_NUM setting (see GetProcessNum()).
const uint32_t kDEFAULT_PROCESS_NUM = 24;
// Job type names; JsonAssemble() writes one of them into the "job_type" field of a job request.
constexpr auto kInitialize = "Initialize";
constexpr auto kPreCompile = "PreCompile";
constexpr auto kFinalize = "Finalize";
constexpr auto kCompile = "Compile";
constexpr auto kFusionCompile = "FusionOpCompile";
constexpr auto kTune = "Tune";
// Key read from the SocInfo section in TbeInitialize().
constexpr auto kOfflineTune = "offlineTune";
constexpr auto kCheckSupport = "CheckSupport";
constexpr auto kSelectFormat = "SelectFormat";
// Result value that OpSelectAndCheckResultProcess() compares a CheckSupport result against.
constexpr auto kFullySupported = "FULLY_SUPPORTED";
// Keys used when reading or writing job request/result JSON objects and their process-log entries.
constexpr auto kLevel = "level";
constexpr auto kMessage = "message";
constexpr auto kErrorCode = "errCode";
constexpr auto kIndex = "index";
constexpr auto kStatus = "status";
constexpr auto kJobType = "job_type";
constexpr auto kJobId = "job_id";
constexpr auto kSourceId = "source_id";
constexpr auto kTuneMode = "tune_mode";
constexpr auto kTuneType = "tune_type";
constexpr auto kJobContent = "job_content";
constexpr auto kProcessInfo = "process_info";
constexpr auto kReturnValue = "return_value";
constexpr auto kFusionOpName = "fusion_op_name";
constexpr auto kResult = "result";
constexpr auto kOpList = "op_list";
// Values the "status" field is compared against.
constexpr auto kSuccess = "SUCCESS";
constexpr auto kRunning = "RUNNING";
constexpr auto kFailed = "FAILED";
// Job type used by QueryFinishJob()/QueryFusionFinishJob() to poll a previously submitted job.
constexpr auto kQuery = "Query";
constexpr auto kTrue = "True";
// Name of the environment variable read by GetLogLevel().
constexpr auto kGLOG_v = "GLOG_v";
// Keys of the job content that JsonAssemble() builds for Initialize/Finalize jobs.
constexpr auto kSocInfo = "SocInfo";
constexpr auto kTuneInfo = "TuneInfo";
constexpr auto kLicInfo = "LicInfo";
constexpr auto kTuneOpList = "tune_op_list";
constexpr auto kProcessNum = "process_num";
constexpr auto kLogLevel = "log_level";
constexpr auto kEnableEvent = "enable_event";
constexpr auto kTuneDumpPath = "tune_dump_path";
constexpr auto kTuneBankPath = "tune_bank_path";
constexpr auto kTbeImplPath = "tbe_impl_path";
constexpr auto kParaDebugPath = "para_debug_path";
// Names of the environment variables read by GetProcessNum(), GetParaDebugPath(), GetTbePath() and
// GetTuneOpsList() respectively.
constexpr auto kMS_BUILD_PROCESS_NUM = "MS_BUILD_PROCESS_NUM";
constexpr auto kMS_PARA_DEBUG_PATH = "PARA_DEBUG_PATH";
constexpr auto kTBE_IMPL_PATH = "TBE_IMPL_PATH";
constexpr auto kTUNE_OPS_NAME = "TUNE_OPS_NAME";
// Paths GetTbePath() tries, in this order, when TBE_IMPL_PATH is not set.
constexpr auto kDefPath = "/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe/";
constexpr auto kBkPath = "/usr/local/Ascend/opp/op_impl/built-in/ai_core/tbe/";
// The query loops sleep KSleepSeconds seconds whenever their query counter is a multiple of KSleepInterval.
constexpr int KSleepSeconds = 3;
constexpr int KSleepInterval = 1000;
102
103 namespace {
Order(const nlohmann::json & json1,const nlohmann::json & json2)104 inline bool Order(const nlohmann::json &json1, const nlohmann::json &json2) {
105 return json1[kIndex].dump() < json2[kIndex].dump();
106 }
107
ReportToErrorManager(const string & message)108 void ReportToErrorManager(const string &message) {
109 nlohmann::json exception_message;
110 if (!ParseJson(message, &exception_message)) {
111 MS_LOG(EXCEPTION) << "Parse tbe exception message error.";
112 }
113 const auto &error_code = GetJsonValue<std::string>(exception_message, kErrorCode);
114 std::map<std::string, std::string> arg_map;
115 for (auto it = exception_message.begin(); it != exception_message.end(); (void)it++) {
116 const std::string arg_key = it.key();
117 if (it.key() == kErrorCode) {
118 continue;
119 }
120 const auto &arg_value = GetJsonValue<std::string>(exception_message, arg_key);
121 arg_map[arg_key] = arg_value;
122 }
123 const auto report_ret = ErrorManager::GetInstance().ReportErrMessage(error_code, arg_map);
124 if (report_ret != 0) {
125 MS_LOG(WARNING) << "Report error message failed, raw error message: " << message;
126 }
127 }
128
PrintInfo(const nlohmann::json & info,const std::string & job_name,const int job_id,int adjust_log_level)129 void PrintInfo(const nlohmann::json &info, const std::string &job_name, const int job_id, int adjust_log_level) {
130 auto level = GetJsonValue<int>(info, kLevel);
131 level = level > adjust_log_level ? adjust_log_level : level;
132 auto message = GetJsonValue<std::string>(info, kMessage);
133 if (level == 0) {
134 MS_LOG(DEBUG) << "Job id:" << job_id << ", name :" << job_name << ", message:" << message;
135 } else if (level == INFO) {
136 MS_LOG(INFO) << "Job id:" << job_id << ", name :" << job_name << ", message:" << message;
137 } else if (level == WARNING) {
138 MS_LOG(WARNING) << "Job id:" << job_id << ", name :" << job_name << ", message:" << message;
139 } else if (level == ERROR) {
140 MS_LOG(ERROR) << "Job id:" << job_id << ", name :" << job_name << ", message:" << message;
141 } else if (level == EXCEPTION) {
142 ReportToErrorManager(message);
143 }
144 }
145
FilterExceptionMessage(const std::vector<nlohmann::json> & all_logs)146 std::string FilterExceptionMessage(const std::vector<nlohmann::json> &all_logs) {
147 std::ostringstream buffer;
148 for (const auto &item : all_logs) {
149 auto message = GetJsonValue<std::string>(item, kMessage);
150 if (message.find("except_msg") != std::string::npos) {
151 buffer << message;
152 buffer << "\n";
153 }
154 if (message.find("except_tuple_msg") != std::string::npos) {
155 buffer << message;
156 buffer << "\n";
157 }
158 if (message.find("Error message") != std::string::npos) {
159 buffer << message;
160 buffer << "\n";
161 }
162 }
163 auto res = buffer.str().empty() ? "None" : buffer.str();
164 return res;
165 }
166
// Returns true when `str` is non-empty and consists solely of the ASCII digits '0'..'9'.
// Signs, whitespace and decimal points are rejected.
bool IsDigit(const std::string &str) {
  if (str.empty()) {
    return false;
  }
  return std::all_of(str.begin(), str.end(), [](const char ch) { return ch >= '0' && ch <= '9'; });
}
180
GetProcessNum()181 uint32_t GetProcessNum() {
182 uint32_t process_num = kDEFAULT_PROCESS_NUM;
183 auto env_process_num = common::GetEnv(kMS_BUILD_PROCESS_NUM);
184 if (!env_process_num.empty()) {
185 if (!IsDigit(env_process_num)) {
186 MS_LOG(EXCEPTION) << "Invalid environment of 'MS_BUILD_PROCESS_NUM',it should be a digit, but got: "
187 << env_process_num;
188 }
189 process_num = UlongToUint(std::stoul(env_process_num));
190 if (process_num < 1 || process_num > kDEFAULT_PROCESS_NUM) {
191 MS_LOG(EXCEPTION) << "Invalid environment of 'MS_BUILD_PROCESS_NUM', the value should be in [1, 24], but got: "
192 << process_num;
193 }
194 }
195 return process_num;
196 }
197
StrToInt(const std::string & env)198 int StrToInt(const std::string &env) {
199 if (env == "0") {
200 return DEBUG;
201 } else if (env == "1") {
202 return INFO;
203 } else if (env == "3") {
204 return ERROR;
205 }
206 return WARNING;
207 }
208
GetLogLevel()209 int GetLogLevel() {
210 auto env = common::GetEnv(kGLOG_v);
211 int ms_level = StrToInt(env);
212 return ms_level;
213 }
214
GetParaDebugPath()215 std::string GetParaDebugPath() {
216 auto save_path = common::GetEnv(kMS_PARA_DEBUG_PATH);
217 char real_path[PATH_MAX] = {0};
218 if (!save_path.empty()) {
219 if (realpath(save_path.c_str(), real_path)) {
220 save_path = real_path;
221 } else {
222 MS_LOG(EXCEPTION) << "Invalid environment variable 'PARA_DEBUG_PATH', the path is " << save_path
223 << ". Please check (1) whether the path exists, (2) whether the path has the access "
224 "permission, (3) whether the path is too long.";
225 }
226 } else {
227 save_path = "";
228 }
229 return save_path;
230 }
231
GetTbePath()232 std::string GetTbePath() {
233 auto save_path = common::GetEnv(kTBE_IMPL_PATH);
234 char real_path[PATH_MAX] = {0};
235 if (!save_path.empty()) {
236 if (realpath(save_path.c_str(), real_path)) {
237 save_path = real_path;
238 } else {
239 MS_LOG(EXCEPTION) << "Invalid environment variable 'TBE_IMPL_PATH', the path is " << save_path
240 << ". Please check (1) whether the path exists, (2) whether the path has the access "
241 "permission, (3) whether the path is too long. ";
242 }
243 } else {
244 if (realpath(kDefPath, real_path)) {
245 save_path = real_path;
246 } else if (realpath(kBkPath, real_path)) {
247 save_path = real_path;
248 } else {
249 MS_LOG(WARNING) << "Can not get access to [" << kDefPath << "] or [" << kBkPath << "]";
250 }
251 }
252 return save_path;
253 }
254
GetTuneOpsList(const std::string & d)255 std::vector<std::string> GetTuneOpsList(const std::string &d) {
256 std::vector<string> res;
257 auto ops = common::GetEnv(kTUNE_OPS_NAME);
258 if (ops.empty()) {
259 return {};
260 }
261 size_t p1 = 0;
262 size_t p2 = ops.find(d);
263 while (p2 != std::string::npos) {
264 if (p1 < ops.length() && (p2 - p1) < ops.length()) {
265 (void)res.emplace_back(ops.substr(p1, p2 - p1));
266 }
267
268 p1 = p2 + 1;
269 p2 = ops.find(d, p1);
270 }
271 if (p1 <= ops.length()) {
272 (void)res.emplace_back(ops.substr(p1));
273 }
274 return res;
275 }
276 } // namespace
277
ResetOldTask()278 void AscendKernelCompileManager::ResetOldTask() {
279 if (build_manager_ != nullptr) {
280 build_manager_->ResetTaskInfo();
281 }
282 job_list_.clear();
283 job_id_to_node_.clear();
284 }
285
PrintProcessLog(const nlohmann::json & json,int adjust_log_level=EXCEPTION)286 void AscendKernelCompileManager::PrintProcessLog(const nlohmann::json &json, int adjust_log_level = EXCEPTION) {
287 auto all_logs = GetJsonValue<std::vector<nlohmann::json>>(json, kProcessInfo);
288 auto job_id = GetJsonValue<int>(json, kJobId);
289 auto json_name = GetJsonValue<std::string>(json, kFusionOpName);
290 std::sort(all_logs.begin(), all_logs.end(), Order);
291 for (const auto &item : all_logs) {
292 PrintInfo(item, json_name, job_id, adjust_log_level);
293 }
294 }
295
PrintCompileResult(const nlohmann::json & json)296 void AscendKernelCompileManager::PrintCompileResult(const nlohmann::json &json) {
297 auto job_type = GetJsonValue<std::string>(json, kJobType);
298 auto json_name = GetJsonValue<std::string>(json, kFusionOpName);
299 MS_LOG(DEBUG) << "Job: " << job_type << " post process";
300 if (json.at(kStatus) == kFailed) {
301 if (job_type == kFusionCompile || job_type == kPreCompile || job_type == kTune) {
302 auto all_logs = GetJsonValue<std::vector<nlohmann::json>>(json, kProcessInfo);
303 auto message = FilterExceptionMessage(all_logs);
304 MS_LOG(INFO) << "Job " << job_type << " running failed, json name, " << json_name << "\n except_msg: " << message;
305 return;
306 } else {
307 PrintProcessLog(json);
308 auto task_id = GetJsonValue<int>(json, kJobId);
309 auto target_node = job_id_to_node_[task_id];
310 MS_LOG(EXCEPTION) << "Job " << job_type << " running failed, json name: " << json_name
311 << "node trace: " << trace::DumpSourceLines(target_node);
312 return;
313 }
314 }
315 MS_LOG(INFO) << "Job " << job_type << " running " << json.at(kStatus) << ", json name: " << json_name;
316 }
317
QueryResultProcess(const nlohmann::json & json,TargetJobStatus * task_info)318 void AscendKernelCompileManager::QueryResultProcess(const nlohmann::json &json, TargetJobStatus *task_info) {
319 if (GetJsonValue<std::string>(json, kStatus) == kSuccess) {
320 nlohmann::json query_result;
321 if (!ParseJson(GetJsonValue<std::string>(json, kResult), &query_result)) {
322 MS_LOG(EXCEPTION) << "Parse query result error.";
323 }
324 auto json_name = GetJsonValue<std::string>(query_result, kFusionOpName);
325 auto target_job_id = query_result.at(kJobId);
326 auto target_status = query_result.at(kStatus);
327 // target job result
328 auto all_logs = GetJsonValue<std::vector<nlohmann::json>>(query_result, kProcessInfo);
329 auto message = FilterExceptionMessage(all_logs);
330 // save job status and exception message
331 task_info->target_job_id = target_job_id;
332 task_info->json_name = json_name;
333 task_info->except_msg = message;
334 if (target_status == kSuccess) {
335 task_info->job_status = kSuccess;
336 } else if (target_status != kSuccess && target_status != kRunning) {
337 task_info->job_status = kFailed;
338 }
339 }
340 }
341
TurnStrToJson(const std::string & string) const342 nlohmann::json AscendKernelCompileManager::TurnStrToJson(const std::string &string) const {
343 nlohmann::json json;
344 if (!ParseJson(string, &json)) {
345 MS_LOG(EXCEPTION) << "Parse build result error.";
346 }
347 if (!json.is_object()) {
348 MS_LOG(EXCEPTION) << "Json str is not an object, str: " << string;
349 }
350 return json;
351 }
352
ParseTargetJobStatus(const std::string & type,const std::string & job_result,std::vector<int> * success_job)353 void AscendKernelCompileManager::ParseTargetJobStatus(const std::string &type, const std::string &job_result,
354 std::vector<int> *success_job) {
355 MS_EXCEPTION_IF_NULL(success_job);
356 auto json_obj = TurnStrToJson(job_result);
357 // the query job' status.
358 if (json_obj.at(kStatus) == kSuccess) {
359 nlohmann::json query_obj;
360 if (!ParseJson(GetJsonValue<std::string>(json_obj, kResult), &query_obj)) {
361 MS_LOG(EXCEPTION) << "Parse query result error.";
362 }
363 auto kernel_name = GetJsonValue<std::string>(query_obj, kFusionOpName);
364 struct TargetJobStatus task_info;
365 QueryResultProcess(json_obj, &task_info);
366 auto target_node = job_id_to_node_[task_info.target_job_id];
367 if (task_info.job_status == kSuccess) {
368 MS_LOG(DEBUG) << "Job " << GetJsonValue<std::string>(query_obj, kJobType) << " running success.";
369 std::string build_result = GetJsonValue<std::string>(query_obj, kResult);
370 if (type == kPreCompile) {
371 build_manager_->PreTaskFinishProcess(task_info.target_job_id, build_result);
372 } else {
373 (void)build_manager_->TaskFinishProcess(task_info.target_job_id, build_result);
374 }
375 (void)success_job->emplace_back(task_info.target_job_id);
376 } else if (task_info.job_status == kFailed) {
377 if (type == kPreCompile) {
378 (void)success_job->emplace_back(task_info.target_job_id);
379 MS_LOG(WARNING) << "Single op pre build failed ,op: " << kernel_name
380 << "\n except_msg : " << task_info.except_msg;
381 } else {
382 ResetOldTask();
383 single_processed_kernels_.clear();
384 MS_LOG(EXCEPTION) << "Single op compile failed, op: " << kernel_name
385 << "\n except_msg : " << task_info.except_msg
386 << "\n node trace: " << trace::DumpSourceLines(target_node);
387 }
388 }
389 } else {
390 if (type == kPreCompile) {
391 MS_LOG(WARNING) << "Query job failed.";
392 return;
393 }
394 MS_LOG(EXCEPTION) << "Query job failed.";
395 }
396 }
397
QueryFinishJob(const std::string & job_type)398 void AscendKernelCompileManager::QueryFinishJob(const std::string &job_type) {
399 MS_EXCEPTION_IF_NULL(build_manager_);
400 size_t query_cnt = 0;
401 while (!job_list_.empty()) {
402 std::vector<int> success_job;
403 auto iter = job_list_.begin();
404 while (iter != job_list_.end()) {
405 nlohmann::json query_json;
406 auto kernel_json = iter->second;
407 JsonAssemble(kQuery, kernel_json, &query_json);
408 auto job_result = build_manager_->ProcessTbeJob(query_json);
409 query_cnt++;
410 ParseTargetJobStatus(job_type, job_result, &success_job);
411 (void)iter++;
412 }
413 for (auto k : success_job) {
414 (void)job_list_.erase(k);
415 }
416 success_job.clear();
417 if (!job_list_.empty()) {
418 if (query_cnt % KSleepInterval == 0) {
419 MS_LOG(INFO) << "Querying Parallel Compilation Job. Current Query Count: " << query_cnt;
420 (void)sleep(KSleepSeconds);
421 }
422 }
423 }
424 }
425
QueryFusionFinishJob(KernelModMap * kernel_mode_ret)426 void AscendKernelCompileManager::QueryFusionFinishJob(KernelModMap *kernel_mode_ret) {
427 MS_EXCEPTION_IF_NULL(build_manager_);
428 MS_EXCEPTION_IF_NULL(kernel_mode_ret);
429 int build_failed_nums = 0;
430 size_t query_cnt = 0;
431 while (!job_list_.empty()) {
432 std::vector<int> success_job;
433 auto iter = job_list_.begin();
434 while (iter != job_list_.end()) {
435 nlohmann::json query_json;
436 auto kernel_json = iter->second;
437 JsonAssemble(kQuery, kernel_json, &query_json);
438 auto build_result = build_manager_->ProcessTbeJob(query_json);
439 query_cnt++;
440 auto json_obj = TurnStrToJson(build_result);
441 if (json_obj.at(kStatus) == kSuccess) {
442 struct TargetJobStatus task_info;
443 QueryResultProcess(json_obj, &task_info);
444 if (task_info.job_status == kSuccess) {
445 MS_LOG(DEBUG) << "Job " << GetJsonValue<std::string>(json_obj, kJobType) << " running success.";
446 std::string build_res = GetJsonValue<std::string>(json_obj, kResult);
447 auto kernel_mode_item = build_manager_->TaskFinishProcess(task_info.target_job_id, build_res, false);
448 if (kernel_mode_item.second != nullptr) {
449 (void)kernel_mode_ret->emplace(kernel_mode_item);
450 }
451 (void)success_job.emplace_back(task_info.target_job_id);
452 } else if (task_info.job_status == kFailed) {
453 MS_LOG(INFO) << "FusionOp compile failed, json name: " << task_info.json_name
454 << "\n Except_msg: " << task_info.except_msg;
455 auto target_id = task_info.target_job_id;
456 (void)success_job.emplace_back(target_id);
457 build_failed_nums += 1;
458 }
459 } else {
460 MS_LOG(EXCEPTION) << "Fusion op query failed. message: " << build_result;
461 }
462 (void)iter++;
463 }
464 for (auto k : success_job) {
465 (void)job_list_.erase(k);
466 }
467 success_job.clear();
468 if (!job_list_.empty()) {
469 if (query_cnt % KSleepInterval == 0) {
470 MS_LOG(INFO) << "Querying Parallel Compilation Job. Current Query Count: " << query_cnt;
471 (void)sleep(KSleepSeconds);
472 }
473 }
474 }
475 MS_LOG(INFO) << "Compile Fusion Kernel Failed Num: " << build_failed_nums;
476 }
477
JsonAssemble(const std::string & job_type,const nlohmann::json & src_json,nlohmann::json * dst_json)478 void AscendKernelCompileManager::JsonAssemble(const std::string &job_type, const nlohmann::json &src_json,
479 nlohmann::json *dst_json) {
480 MS_EXCEPTION_IF_NULL(src_json);
481 MS_EXCEPTION_IF_NULL(dst_json);
482 static size_t job_id = 0;
483 auto context_ptr = MsContext::GetInstance();
484 MS_EXCEPTION_IF_NULL(context_ptr);
485 static uint32_t source_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
486 (*dst_json)[kJobType] = job_type;
487 (*dst_json)[kJobId] = job_id++;
488 (*dst_json)[kSourceId] = source_id;
489 if (job_type == kInitialize || job_type == kFinalize) {
490 nlohmann::json job_info;
491 static auto process_num = GetProcessNum();
492 job_info[kProcessNum] = process_num;
493 job_info[kLogLevel] = GetLogLevel();
494 job_info[kEnableEvent] = false;
495 job_info[kParaDebugPath] = GetParaDebugPath();
496 job_info[kTbeImplPath] = GetTbePath();
497 job_info[kSocInfo] = src_json;
498 nlohmann::json tune_infos;
499 tune_infos[kTuneOpList] = GetTuneOpsList(",");
500 tune_infos[kTuneDumpPath] = TbeUtils::GetTuneDumpPath();
501 tune_infos[kTuneBankPath] = TbeUtils::GetBankPath();
502 job_info[kTuneInfo] = tune_infos;
503 nlohmann::json lic_infos;
504 kernel::tbe::TbeUtils::GenLicInfo(&lic_infos);
505 job_info[kLicInfo] = lic_infos;
506 (*dst_json)[kJobContent] = job_info;
507 } else if (job_type == kQuery) {
508 nlohmann::json content;
509 content[kSourceId] = src_json[kSourceId];
510 content[kJobId] = src_json[kJobId];
511 (*dst_json)[kJobContent] = content;
512 } else {
513 (*dst_json)[kJobContent] = src_json;
514 }
515 }
516
GetAllAscendNodes(const std::shared_ptr<session::KernelGraph> & kernel_graph,std::vector<AnfNodePtr> * tbe_nodes)517 void AscendKernelCompileManager::GetAllAscendNodes(const std::shared_ptr<session::KernelGraph> &kernel_graph,
518 std::vector<AnfNodePtr> *tbe_nodes) {
519 MS_EXCEPTION_IF_NULL(kernel_graph);
520 auto all_nodes = kernel_graph->execution_order();
521 for (const auto &anf_node : all_nodes) {
522 MS_EXCEPTION_IF_NULL(anf_node);
523 if (!AnfAlgo::IsRealKernel(anf_node)) {
524 continue;
525 }
526 KernelType kernel_type = AnfAlgo::GetKernelType(anf_node);
527 if (kernel_type == TBE_KERNEL) {
528 if (AnfAlgo::GetKernelMod(anf_node) == nullptr) {
529 tbe_nodes->push_back(anf_node);
530 }
531 }
532 }
533 }
534
AscendPreBuild(const std::shared_ptr<session::KernelGraph> & kernel_graph)535 void AscendKernelCompileManager::AscendPreBuild(const std::shared_ptr<session::KernelGraph> &kernel_graph) {
536 MS_EXCEPTION_IF_NULL(kernel_graph);
537 MS_LOG(INFO) << "Single op pre build start.";
538 struct timeval start_time, end_time;
539 (void)gettimeofday(&start_time, nullptr);
540 MS_EXCEPTION_IF_NULL(build_manager_);
541 std::vector<AnfNodePtr> anf_nodes;
542 GetAllAscendNodes(kernel_graph, &anf_nodes);
543 if (anf_nodes.empty()) {
544 return;
545 }
546 auto json_creator = std::make_shared<BuildTbeJsonCreator>();
547 MS_EXCEPTION_IF_NULL(json_creator);
548 for (const auto &node : anf_nodes) {
549 MS_EXCEPTION_IF_NULL(node);
550 auto op_name = AnfAlgo::GetCNodeName(node);
551 nlohmann::json kernel_json;
552 if (!json_creator->GenJson(node, &kernel_json)) {
553 MS_LOG(EXCEPTION) << "Generate prebuild json failed, [" << op_name << ", " << node->fullname_with_scope()
554 << "], node trace:" << trace::DumpSourceLines(node);
555 }
556 auto json_name = json_creator->GetJsonName();
557 nlohmann::json build_json;
558 JsonAssemble(kPreCompile, kernel_json, &build_json);
559 auto build_result = build_manager_->ProcessTbeJob(build_json);
560 auto json_obj = TurnStrToJson(build_result);
561 PrintCompileResult(json_obj);
562 auto task_id = GetJsonValue<int>(json_obj, kJobId);
563 build_manager_->SavePreBuildTaskInfo(task_id, node, json_name);
564 if (json_obj.at(kStatus) == kRunning) {
565 std::pair<int, nlohmann::json> pair(task_id, build_json);
566 std::pair<int, AnfNodePtr> id_node(task_id, node);
567 (void)job_list_.insert(pair);
568 (void)job_id_to_node_.insert(id_node);
569 } else if (json_obj.at(kStatus) == kSuccess) {
570 std::string build_res = GetJsonValue<std::string>(json_obj, kResult);
571 build_manager_->PreTaskFinishProcess(task_id, build_res);
572 } else {
573 MS_LOG(WARNING) << "Kernel prebuild failed, op: " << op_name << ", json_name: " << json_name;
574 }
575 }
576
577 QueryFinishJob(kPreCompile);
578 (void)gettimeofday(&end_time, nullptr);
579 const uint64_t kUSecondInSecond = 1000000;
580 uint64_t cost = kUSecondInSecond * static_cast<uint64_t>(end_time.tv_sec - start_time.tv_sec);
581 cost += static_cast<uint64_t>(end_time.tv_usec - start_time.tv_usec);
582 MS_LOG(INFO) << "Kernel PreBuild run in " << PRIu64 << " us " << cost;
583 MS_LOG(INFO) << "Single op pre build end.";
584 }
585
AscendSingleOpCompile(const std::vector<AnfNodePtr> & anf_nodes)586 bool AscendKernelCompileManager::AscendSingleOpCompile(const std::vector<AnfNodePtr> &anf_nodes) {
587 MS_LOG(INFO) << "Single op parallel build start";
588 MS_EXCEPTION_IF_NULL(build_manager_);
589 auto json_creator = std::make_shared<BuildTbeJsonCreator>();
590 MS_EXCEPTION_IF_NULL(json_creator);
591 std::string job_type;
592 for (const auto &node : anf_nodes) {
593 MS_EXCEPTION_IF_NULL(node);
594 if (AnfAlgo::GetKernelMod(node) != nullptr && !is_tune_flag_) {
595 continue;
596 }
597 auto op_name = AnfAlgo::GetCNodeName(node);
598 nlohmann::json kernel_json;
599 if (!json_creator->GenJson(node, &kernel_json)) {
600 MS_LOG(EXCEPTION) << "Generate compile json failed, [" << op_name << ", " << node->fullname_with_scope()
601 << "], node trace: " << trace::DumpSourceLines(node);
602 }
603 auto json_name = json_creator->GetJsonName();
604 std::vector<size_t> in_size_list;
605 std::vector<size_t> out_size_list;
606 (void)TbeKernelBuild::GetIOSize2(kernel_json, &in_size_list, &out_size_list, node);
607 // step1: if same node has been dispitch, no need to compile
608 if (single_processed_kernels_.find(json_name) != single_processed_kernels_.end()) {
609 build_manager_->SaveSameOpInfo(node, json_name, in_size_list, out_size_list);
610 continue;
611 }
612 // step2: if node has in the cache, load the cache.
613 if (!is_tune_flag_ && op_debug_level_ != "1" &&
614 build_manager_->SearchInCache(json_name, in_size_list, out_size_list, node.get())) {
615 continue;
616 }
617 (void)single_processed_kernels_.insert(json_name);
618
619 nlohmann::json build_json;
620 job_type = is_tune_flag_ ? kTune : kCompile;
621 JsonAssemble(job_type, kernel_json, &build_json);
622 auto build_str = build_json.dump(indent);
623 MS_LOG(DEBUG) << "Op build json file : " << build_str;
624 TbeUtils::SaveJsonInfo(json_name, build_str);
625 // save pair<task_id, node> for exception print and get node trace
626 auto task_id = GetJsonValue<int>(build_json, kJobId);
627 std::pair<int, AnfNodePtr> id_node(task_id, node);
628 (void)job_id_to_node_.insert(id_node);
629 // start compile
630 auto build_result = build_manager_->ProcessTbeJob(build_json);
631 auto json_obj = TurnStrToJson(build_result);
632 // print message of build
633 PrintCompileResult(json_obj);
634 build_manager_->SaveTaskInfo(task_id, node, json_name, in_size_list, out_size_list);
635 if (json_obj.at(kStatus) == kRunning) {
636 // job is running, save into job_list.
637 MS_LOG(DEBUG) << "Target job is running, keep it into job_list, json name: " << json_name;
638 std::pair<int, nlohmann::json> pair(task_id, build_json);
639 (void)job_list_.insert(pair);
640 } else if (json_obj.at(kStatus) == kSuccess) {
641 // job running success, save build result.
642 MS_LOG(DEBUG) << "Target job compile success, save build result, json name: " << json_name;
643 std::string build_res = GetJsonValue<std::string>(json_obj, kResult);
644 (void)build_manager_->TaskFinishProcess(task_id, build_res);
645 } else {
646 // job running failed, raise exception (only single op)
647 ResetOldTask();
648 single_processed_kernels_.clear();
649 MS_LOG(EXCEPTION) << "Kernel compile failed, operator [" << op_name << ", " << json_name
650 << "], node trace: " << trace::DumpSourceLines(node);
651 }
652 }
653 // query job if build success
654 QueryFinishJob(job_type);
655 return build_manager_->GenSameOpKernelMod();
656 }
657
AscendFusionOpCompile(const std::vector<FusionScopeInfo> & fusion_scopes)658 KernelModMap AscendKernelCompileManager::AscendFusionOpCompile(const std::vector<FusionScopeInfo> &fusion_scopes) {
659 MS_LOG(INFO) << "Fusion op build start";
660 KernelModMap kernel_mode_ret;
661 MS_EXCEPTION_IF_NULL(build_manager_);
662 auto json_creator = std::make_shared<FusionBuildTbeJsonCreator>();
663 MS_EXCEPTION_IF_NULL(json_creator);
664 for (const auto &fusion_scope_iter : fusion_scopes) {
665 nlohmann::json fusion_op;
666 if (!json_creator->GenJson(fusion_scope_iter, &fusion_op)) {
667 MS_LOG(WARNING) << "Generate fusion json failed, fusion info: " << fusion_scope_iter.full_name;
668 continue;
669 }
670 auto json_name = json_creator->GetJsonName();
671 std::vector<size_t> input_size_list;
672 std::vector<size_t> output_size_list;
673 if (!TbeKernelBuild::GetIOSize(fusion_op[kOpList], fusion_scope_iter.output_nodes, &input_size_list,
674 &output_size_list)) {
675 continue;
676 }
677 // cache
678 if (!is_tune_flag_ && op_debug_level_ != "1") {
679 auto kernel_pack = TbeUtils::SearchCache(json_name);
680 if (kernel_pack != nullptr) {
681 auto kernel_mod = build_manager_->GenKernelMod(input_size_list, output_size_list, kernel_pack);
682 if (kernel_mod != nullptr) {
683 kernel_mode_ret[fusion_scope_iter.scope_id] = kernel_mod;
684 continue;
685 }
686 }
687 }
688
689 // same op no need build, but need wait build finish to set kernel mode
690 if (fusion_processed_kernels_.find(json_name) != fusion_processed_kernels_.end()) {
691 build_manager_->SaveSameFusionOpInfo(fusion_scope_iter.scope_id, json_name, tbe::kProcessorAiCore,
692 input_size_list, output_size_list);
693 continue;
694 }
695 // op has been processed
696 (void)fusion_processed_kernels_.insert(json_name);
697
698 nlohmann::json build_json;
699 const std::string job_type = is_tune_flag_ ? kTune : kFusionCompile;
700 JsonAssemble(job_type, fusion_op, &build_json);
701 auto build_str = build_json.dump(indent);
702 MS_LOG(DEBUG) << "FusionOp build json file : " << build_str;
703 TbeUtils::SaveJsonInfo(json_name, build_str);
704 auto build_result = build_manager_->ProcessTbeJob(build_json);
705 auto json_obj = TurnStrToJson(build_result);
706 PrintCompileResult(json_obj);
707 auto task_id = GetJsonValue<int>(json_obj, kJobId);
708 fusion_op_names_[task_id] = json_name;
709 build_manager_->SaveTaskInfo(task_id, nullptr, json_name, input_size_list, output_size_list,
710 fusion_scope_iter.scope_id);
711 if (json_obj.at(kStatus) == kRunning) {
712 // job is running, save it into job_list.
713 std::pair<int, nlohmann::json> pair(task_id, build_json);
714 (void)job_list_.insert(pair);
715 } else if (json_obj.at(kStatus) == kSuccess) {
716 // job running success, save build result.
717 std::string build_res = GetJsonValue<std::string>(json_obj, kResult);
718 auto kernel_mode_item = build_manager_->TaskFinishProcess(task_id, build_res, false);
719 if (kernel_mode_item.second != nullptr) {
720 (void)kernel_mode_ret.emplace(kernel_mode_item);
721 }
722 }
723 }
724 // start query if job has finished
725 QueryFusionFinishJob(&kernel_mode_ret);
726 if (!build_manager_->GenSameFusionOpKernelMod(&kernel_mode_ret)) {
727 MS_LOG(INFO) << "Fusion warning: cache failed.";
728 }
729 return kernel_mode_ret;
730 }
731
PrintInitResult(const nlohmann::json & json)732 void AscendKernelCompileManager::PrintInitResult(const nlohmann::json &json) {
733 auto job_type = GetJsonValue<std::string>(json, kJobType);
734 MS_LOG(DEBUG) << "Job: " << job_type << " result processing.";
735 // init only concern about result, but don't care about the process.
736 if (json.at(kStatus) == kFailed) {
737 PrintProcessLog(json);
738 MS_LOG(EXCEPTION) << "Job " << job_type << " running failed.";
739 }
740 MS_LOG(INFO) << "Job: " << job_type << " running success.";
741 }
742
TbeInitialize()743 void AscendKernelCompileManager::TbeInitialize() {
744 if (tbe_init_flag_) {
745 MS_LOG(DEBUG) << "TbeInitialize already complete, no need do again";
746 return;
747 }
748 MS_LOG(INFO) << "TbeInitialize start";
749 build_manager_ = std::make_shared<ParallelBuildManager>();
750 MS_EXCEPTION_IF_NULL(build_manager_);
751 nlohmann::json init_json;
752 nlohmann::json soc_info = TbeUtils::GenSocInfo();
753 JsonAssemble(kInitialize, soc_info, &init_json);
754 auto offline_tune = (init_json[kJobContent][kSocInfo][kOfflineTune]).get<bool>();
755 op_debug_level_ = (init_json[kJobContent][kSocInfo]["op_debug_level"]).get<std::string>();
756 auto auto_tiling_mode = (init_json[kJobContent][kSocInfo]["autoTilingMode"]).get<std::string>();
757 tbe_init_flag_ = true;
758 is_tune_flag_ = offline_tune || (auto_tiling_mode != "NO_TUNE");
759
760 auto init_str = init_json.dump();
761 MS_LOG(INFO) << "TbeInitialize json file : " << init_str;
762 TbeUtils::SaveJsonInfo(kInitialize, init_str);
763 auto init_ret = build_manager_->ProcessTbeJob(init_json);
764 auto json_ret = TurnStrToJson(init_ret);
765 PrintInitResult(json_ret);
766 MS_LOG(INFO) << "TbeInitialize end.";
767 }
768
OpSelectAndCheckResultProcess(const nlohmann::json & json,const AnfNodePtr & node)769 std::string AscendKernelCompileManager::OpSelectAndCheckResultProcess(const nlohmann::json &json,
770 const AnfNodePtr &node) {
771 // for check supported and format select
772 MS_EXCEPTION_IF_NULL(node);
773 auto job_type = GetJsonValue<std::string>(json, kJobType);
774 auto json_name = GetJsonValue<std::string>(json, kFusionOpName);
775 if (json.at(kStatus) == kFailed) {
776 if (job_type == kCheckSupport) {
777 PrintProcessLog(json, WARNING);
778 MS_LOG(WARNING) << "Job:" << job_type << " running failed, json name:" << json_name;
779 return kFailed;
780 } else {
781 auto all_logs = GetJsonValue<std::vector<nlohmann::json>>(json, kProcessInfo);
782 auto except_msg = FilterExceptionMessage(all_logs);
783 MS_LOG(EXCEPTION) << "Job:" << job_type << " running failed, json name: " << json_name
784 << "\n exception message:" << except_msg << "\n node trace: " << trace::DumpSourceLines(node);
785 }
786 }
787 auto res = GetJsonValue<std::string>(json, kResult);
788 if (job_type == kCheckSupport && res != kFullySupported) {
789 PrintProcessLog(json, WARNING);
790 }
791 MS_LOG(INFO) << "Job:" << job_type << " running success, " << json_name << ", get: " << res;
792 return res;
793 }
794
AscendOpSelectFormat(const AnfNodePtr & node)795 std::string AscendKernelCompileManager::AscendOpSelectFormat(const AnfNodePtr &node) {
796 MS_EXCEPTION_IF_NULL(node);
797 auto op_name = AnfAlgo::GetCNodeName(node);
798 MS_LOG(INFO) << "Op select format start for op [" << op_name << ", " << node->fullname_with_scope() << "]";
799 MS_EXCEPTION_IF_NULL(build_manager_);
800 auto json_creator = std::make_shared<SelectTbeJsonCreator>();
801 MS_EXCEPTION_IF_NULL(json_creator);
802 nlohmann::json kernel_info;
803 nlohmann::json select_json;
804 if (!json_creator->GenJson(node, &kernel_info)) {
805 MS_LOG(EXCEPTION) << "Gen select json failed. [" << op_name << ", " << node->fullname_with_scope() << "]";
806 }
807 JsonAssemble(kSelectFormat, kernel_info, &select_json);
808 auto select_ret = build_manager_->ProcessTbeJob(select_json);
809 auto json_ret = TurnStrToJson(select_ret);
810 return OpSelectAndCheckResultProcess(json_ret, node);
811 }
812
AscendOpCheckSupported(const AnfNodePtr & node)813 bool AscendKernelCompileManager::AscendOpCheckSupported(const AnfNodePtr &node) {
814 MS_EXCEPTION_IF_NULL(node);
815 auto full_name = node->fullname_with_scope();
816 MS_LOG(INFO) << "Check supported for op [" << full_name << "]";
817 MS_EXCEPTION_IF_NULL(build_manager_);
818 auto json_creator = std::make_shared<CheckTbeJsonCreator>();
819 MS_EXCEPTION_IF_NULL(json_creator);
820 nlohmann::json kernel_info;
821 nlohmann::json check_json;
822 if (!json_creator->GenJson(node, &kernel_info)) {
823 MS_LOG(EXCEPTION) << "Gen check supported json failed.[" << full_name
824 << "], node trace: " << trace::DumpSourceLines(node);
825 }
826 JsonAssemble(kCheckSupport, kernel_info, &check_json);
827 auto check_ret = build_manager_->ProcessTbeJob(check_json);
828 auto json_ret = TurnStrToJson(check_ret);
829 std::string check_info = OpSelectAndCheckResultProcess(json_ret, node);
830 return check_info == kFullySupported;
831 }
832
TbeFinalize()833 void AscendKernelCompileManager::TbeFinalize() {
834 MS_LOG(INFO) << "TbeFinalize start.";
835 if (!tbe_init_flag_) {
836 MS_LOG(DEBUG) << "TbeFinalize already complete, no need do again";
837 return;
838 }
839 build_manager_ = nullptr;
840 tbe_init_flag_ = false;
841 is_tune_flag_ = false;
842 job_list_.clear();
843 job_id_to_node_.clear();
844 single_processed_kernels_.clear();
845 fusion_processed_kernels_.clear();
846 MS_LOG(INFO) << "TbeFinalize end.";
847 }
848
~AscendKernelCompileManager()849 AscendKernelCompileManager::~AscendKernelCompileManager() { TbeFinalize(); }
850
851 bool AscendKernelCompileManager::tbe_init_flag_ = false;
852 bool AscendKernelCompileManager::is_tune_flag_ = false;
853 } // namespace ascend
854 } // namespace kernel
855 } // namespace mindspore
856