• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2021 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "backend/kernel_compiler/tbe/ascend_kernel_compile.h"
18 #include <sys/syscall.h>
19 #include <unistd.h>
20 #include <algorithm>
21 #include <map>
22 #include <memory>
23 #include <string>
24 #include <unordered_map>
25 #include <unordered_set>
26 #include <vector>
27 #include "mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_json/tbe_json_creator.h"
28 #include "mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_json/single_tbe_json_creator.h"
29 #include "mindspore/ccsrc/backend/kernel_compiler/tbe/tbe_json/fusion_tbe_json_creator.h"
30 #include "backend/kernel_compiler/tbe/tbe_utils.h"
31 #include "backend/kernel_compiler/tbe/tbe_convert_utils.h"
32 #include "backend/session/anf_runtime_algorithm.h"
33 #include "common/util/error_manager/error_manager.h"
34 #include "debug/anf_ir_dump.h"
35 #include "frontend/operator/ops.h"
36 #include "utils/ms_context.h"
37 #include "utils/ms_utils.h"
38 #include "utils/trace_base.h"
39 #include "utils/utils.h"
40 #include "utils/json_operation_utils.h"
41 
42 namespace mindspore {
43 namespace kernel {
44 namespace ascend {
45 using mindspore::kernel::tbe::TbeAdapter;
46 using mindspore::kernel::tbe::TbeUtils;
47 const int indent = 4;  // for dump json
48 const int kAscii_0 = 48;
49 const int kAscii_9 = 57;
50 const uint32_t kDEFAULT_PROCESS_NUM = 24;
51 constexpr auto kInitialize = "Initialize";
52 constexpr auto kPreCompile = "PreCompile";
53 constexpr auto kFinalize = "Finalize";
54 constexpr auto kCompile = "Compile";
55 constexpr auto kFusionCompile = "FusionOpCompile";
56 constexpr auto kTune = "Tune";
57 constexpr auto kOfflineTune = "offlineTune";
58 constexpr auto kCheckSupport = "CheckSupport";
59 constexpr auto kSelectFormat = "SelectFormat";
60 constexpr auto kFullySupported = "FULLY_SUPPORTED";
61 constexpr auto kLevel = "level";
62 constexpr auto kMessage = "message";
63 constexpr auto kErrorCode = "errCode";
64 constexpr auto kIndex = "index";
65 constexpr auto kStatus = "status";
66 constexpr auto kJobType = "job_type";
67 constexpr auto kJobId = "job_id";
68 constexpr auto kSourceId = "source_id";
69 constexpr auto kTuneMode = "tune_mode";
70 constexpr auto kTuneType = "tune_type";
71 constexpr auto kJobContent = "job_content";
72 constexpr auto kProcessInfo = "process_info";
73 constexpr auto kReturnValue = "return_value";
74 constexpr auto kFusionOpName = "fusion_op_name";
75 constexpr auto kResult = "result";
76 constexpr auto kOpList = "op_list";
77 constexpr auto kSuccess = "SUCCESS";
78 constexpr auto kRunning = "RUNNING";
79 constexpr auto kFailed = "FAILED";
80 constexpr auto kQuery = "Query";
81 constexpr auto kTrue = "True";
82 constexpr auto kGLOG_v = "GLOG_v";
83 constexpr auto kSocInfo = "SocInfo";
84 constexpr auto kTuneInfo = "TuneInfo";
85 constexpr auto kLicInfo = "LicInfo";
86 constexpr auto kTuneOpList = "tune_op_list";
87 constexpr auto kProcessNum = "process_num";
88 constexpr auto kLogLevel = "log_level";
89 constexpr auto kEnableEvent = "enable_event";
90 constexpr auto kTuneDumpPath = "tune_dump_path";
91 constexpr auto kTuneBankPath = "tune_bank_path";
92 constexpr auto kTbeImplPath = "tbe_impl_path";
93 constexpr auto kParaDebugPath = "para_debug_path";
94 constexpr auto kMS_BUILD_PROCESS_NUM = "MS_BUILD_PROCESS_NUM";
95 constexpr auto kMS_PARA_DEBUG_PATH = "PARA_DEBUG_PATH";
96 constexpr auto kTBE_IMPL_PATH = "TBE_IMPL_PATH";
97 constexpr auto kTUNE_OPS_NAME = "TUNE_OPS_NAME";
98 constexpr auto kDefPath = "/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe/";
99 constexpr auto kBkPath = "/usr/local/Ascend/opp/op_impl/built-in/ai_core/tbe/";
100 constexpr int KSleepSeconds = 3;
101 constexpr int KSleepInterval = 1000;
102 
103 namespace {
Order(const nlohmann::json & json1,const nlohmann::json & json2)104 inline bool Order(const nlohmann::json &json1, const nlohmann::json &json2) {
105   return json1[kIndex].dump() < json2[kIndex].dump();
106 }
107 
ReportToErrorManager(const string & message)108 void ReportToErrorManager(const string &message) {
109   nlohmann::json exception_message;
110   if (!ParseJson(message, &exception_message)) {
111     MS_LOG(EXCEPTION) << "Parse tbe exception message error.";
112   }
113   const auto &error_code = GetJsonValue<std::string>(exception_message, kErrorCode);
114   std::map<std::string, std::string> arg_map;
115   for (auto it = exception_message.begin(); it != exception_message.end(); (void)it++) {
116     const std::string arg_key = it.key();
117     if (it.key() == kErrorCode) {
118       continue;
119     }
120     const auto &arg_value = GetJsonValue<std::string>(exception_message, arg_key);
121     arg_map[arg_key] = arg_value;
122   }
123   const auto report_ret = ErrorManager::GetInstance().ReportErrMessage(error_code, arg_map);
124   if (report_ret != 0) {
125     MS_LOG(WARNING) << "Report error message failed, raw error message: " << message;
126   }
127 }
128 
PrintInfo(const nlohmann::json & info,const std::string & job_name,const int job_id,int adjust_log_level)129 void PrintInfo(const nlohmann::json &info, const std::string &job_name, const int job_id, int adjust_log_level) {
130   auto level = GetJsonValue<int>(info, kLevel);
131   level = level > adjust_log_level ? adjust_log_level : level;
132   auto message = GetJsonValue<std::string>(info, kMessage);
133   if (level == 0) {
134     MS_LOG(DEBUG) << "Job id:" << job_id << ", name :" << job_name << ", message:" << message;
135   } else if (level == INFO) {
136     MS_LOG(INFO) << "Job id:" << job_id << ", name :" << job_name << ", message:" << message;
137   } else if (level == WARNING) {
138     MS_LOG(WARNING) << "Job id:" << job_id << ", name :" << job_name << ", message:" << message;
139   } else if (level == ERROR) {
140     MS_LOG(ERROR) << "Job id:" << job_id << ", name :" << job_name << ", message:" << message;
141   } else if (level == EXCEPTION) {
142     ReportToErrorManager(message);
143   }
144 }
145 
FilterExceptionMessage(const std::vector<nlohmann::json> & all_logs)146 std::string FilterExceptionMessage(const std::vector<nlohmann::json> &all_logs) {
147   std::ostringstream buffer;
148   for (const auto &item : all_logs) {
149     auto message = GetJsonValue<std::string>(item, kMessage);
150     if (message.find("except_msg") != std::string::npos) {
151       buffer << message;
152       buffer << "\n";
153     }
154     if (message.find("except_tuple_msg") != std::string::npos) {
155       buffer << message;
156       buffer << "\n";
157     }
158     if (message.find("Error message") != std::string::npos) {
159       buffer << message;
160       buffer << "\n";
161     }
162   }
163   auto res = buffer.str().empty() ? "None" : buffer.str();
164   return res;
165 }
166 
// Returns true when `str` is non-empty and consists solely of the characters '0'..'9'.
bool IsDigit(const std::string &str) {
  if (str.empty()) {
    return false;
  }
  return std::all_of(str.begin(), str.end(), [](char ch) { return ch >= '0' && ch <= '9'; });
}
180 
GetProcessNum()181 uint32_t GetProcessNum() {
182   uint32_t process_num = kDEFAULT_PROCESS_NUM;
183   auto env_process_num = common::GetEnv(kMS_BUILD_PROCESS_NUM);
184   if (!env_process_num.empty()) {
185     if (!IsDigit(env_process_num)) {
186       MS_LOG(EXCEPTION) << "Invalid environment of 'MS_BUILD_PROCESS_NUM',it should be a digit, but got: "
187                         << env_process_num;
188     }
189     process_num = UlongToUint(std::stoul(env_process_num));
190     if (process_num < 1 || process_num > kDEFAULT_PROCESS_NUM) {
191       MS_LOG(EXCEPTION) << "Invalid environment of 'MS_BUILD_PROCESS_NUM', the value should be in [1, 24], but got: "
192                         << process_num;
193     }
194   }
195   return process_num;
196 }
197 
StrToInt(const std::string & env)198 int StrToInt(const std::string &env) {
199   if (env == "0") {
200     return DEBUG;
201   } else if (env == "1") {
202     return INFO;
203   } else if (env == "3") {
204     return ERROR;
205   }
206   return WARNING;
207 }
208 
GetLogLevel()209 int GetLogLevel() {
210   auto env = common::GetEnv(kGLOG_v);
211   int ms_level = StrToInt(env);
212   return ms_level;
213 }
214 
GetParaDebugPath()215 std::string GetParaDebugPath() {
216   auto save_path = common::GetEnv(kMS_PARA_DEBUG_PATH);
217   char real_path[PATH_MAX] = {0};
218   if (!save_path.empty()) {
219     if (realpath(save_path.c_str(), real_path)) {
220       save_path = real_path;
221     } else {
222       MS_LOG(EXCEPTION) << "Invalid environment variable 'PARA_DEBUG_PATH', the path is " << save_path
223                         << ". Please check (1) whether the path exists, (2) whether the path has the access "
224                            "permission, (3) whether the path is too long.";
225     }
226   } else {
227     save_path = "";
228   }
229   return save_path;
230 }
231 
GetTbePath()232 std::string GetTbePath() {
233   auto save_path = common::GetEnv(kTBE_IMPL_PATH);
234   char real_path[PATH_MAX] = {0};
235   if (!save_path.empty()) {
236     if (realpath(save_path.c_str(), real_path)) {
237       save_path = real_path;
238     } else {
239       MS_LOG(EXCEPTION) << "Invalid environment variable 'TBE_IMPL_PATH', the path is " << save_path
240                         << ". Please check (1) whether the path exists, (2) whether the path has the access "
241                            "permission, (3) whether the path is too long. ";
242     }
243   } else {
244     if (realpath(kDefPath, real_path)) {
245       save_path = real_path;
246     } else if (realpath(kBkPath, real_path)) {
247       save_path = real_path;
248     } else {
249       MS_LOG(WARNING) << "Can not get access to [" << kDefPath << "] or [" << kBkPath << "]";
250     }
251   }
252   return save_path;
253 }
254 
GetTuneOpsList(const std::string & d)255 std::vector<std::string> GetTuneOpsList(const std::string &d) {
256   std::vector<string> res;
257   auto ops = common::GetEnv(kTUNE_OPS_NAME);
258   if (ops.empty()) {
259     return {};
260   }
261   size_t p1 = 0;
262   size_t p2 = ops.find(d);
263   while (p2 != std::string::npos) {
264     if (p1 < ops.length() && (p2 - p1) < ops.length()) {
265       (void)res.emplace_back(ops.substr(p1, p2 - p1));
266     }
267 
268     p1 = p2 + 1;
269     p2 = ops.find(d, p1);
270   }
271   if (p1 <= ops.length()) {
272     (void)res.emplace_back(ops.substr(p1));
273   }
274   return res;
275 }
276 }  // namespace
277 
ResetOldTask()278 void AscendKernelCompileManager::ResetOldTask() {
279   if (build_manager_ != nullptr) {
280     build_manager_->ResetTaskInfo();
281   }
282   job_list_.clear();
283   job_id_to_node_.clear();
284 }
285 
PrintProcessLog(const nlohmann::json & json,int adjust_log_level=EXCEPTION)286 void AscendKernelCompileManager::PrintProcessLog(const nlohmann::json &json, int adjust_log_level = EXCEPTION) {
287   auto all_logs = GetJsonValue<std::vector<nlohmann::json>>(json, kProcessInfo);
288   auto job_id = GetJsonValue<int>(json, kJobId);
289   auto json_name = GetJsonValue<std::string>(json, kFusionOpName);
290   std::sort(all_logs.begin(), all_logs.end(), Order);
291   for (const auto &item : all_logs) {
292     PrintInfo(item, json_name, job_id, adjust_log_level);
293   }
294 }
295 
PrintCompileResult(const nlohmann::json & json)296 void AscendKernelCompileManager::PrintCompileResult(const nlohmann::json &json) {
297   auto job_type = GetJsonValue<std::string>(json, kJobType);
298   auto json_name = GetJsonValue<std::string>(json, kFusionOpName);
299   MS_LOG(DEBUG) << "Job: " << job_type << " post process";
300   if (json.at(kStatus) == kFailed) {
301     if (job_type == kFusionCompile || job_type == kPreCompile || job_type == kTune) {
302       auto all_logs = GetJsonValue<std::vector<nlohmann::json>>(json, kProcessInfo);
303       auto message = FilterExceptionMessage(all_logs);
304       MS_LOG(INFO) << "Job " << job_type << " running failed, json name, " << json_name << "\n except_msg: " << message;
305       return;
306     } else {
307       PrintProcessLog(json);
308       auto task_id = GetJsonValue<int>(json, kJobId);
309       auto target_node = job_id_to_node_[task_id];
310       MS_LOG(EXCEPTION) << "Job " << job_type << " running failed, json name: " << json_name
311                         << "node trace: " << trace::DumpSourceLines(target_node);
312       return;
313     }
314   }
315   MS_LOG(INFO) << "Job " << job_type << " running " << json.at(kStatus) << ", json name: " << json_name;
316 }
317 
QueryResultProcess(const nlohmann::json & json,TargetJobStatus * task_info)318 void AscendKernelCompileManager::QueryResultProcess(const nlohmann::json &json, TargetJobStatus *task_info) {
319   if (GetJsonValue<std::string>(json, kStatus) == kSuccess) {
320     nlohmann::json query_result;
321     if (!ParseJson(GetJsonValue<std::string>(json, kResult), &query_result)) {
322       MS_LOG(EXCEPTION) << "Parse query result error.";
323     }
324     auto json_name = GetJsonValue<std::string>(query_result, kFusionOpName);
325     auto target_job_id = query_result.at(kJobId);
326     auto target_status = query_result.at(kStatus);
327     // target job result
328     auto all_logs = GetJsonValue<std::vector<nlohmann::json>>(query_result, kProcessInfo);
329     auto message = FilterExceptionMessage(all_logs);
330     // save job status and exception message
331     task_info->target_job_id = target_job_id;
332     task_info->json_name = json_name;
333     task_info->except_msg = message;
334     if (target_status == kSuccess) {
335       task_info->job_status = kSuccess;
336     } else if (target_status != kSuccess && target_status != kRunning) {
337       task_info->job_status = kFailed;
338     }
339   }
340 }
341 
TurnStrToJson(const std::string & string) const342 nlohmann::json AscendKernelCompileManager::TurnStrToJson(const std::string &string) const {
343   nlohmann::json json;
344   if (!ParseJson(string, &json)) {
345     MS_LOG(EXCEPTION) << "Parse build result error.";
346   }
347   if (!json.is_object()) {
348     MS_LOG(EXCEPTION) << "Json str is not an object, str: " << string;
349   }
350   return json;
351 }
352 
ParseTargetJobStatus(const std::string & type,const std::string & job_result,std::vector<int> * success_job)353 void AscendKernelCompileManager::ParseTargetJobStatus(const std::string &type, const std::string &job_result,
354                                                       std::vector<int> *success_job) {
355   MS_EXCEPTION_IF_NULL(success_job);
356   auto json_obj = TurnStrToJson(job_result);
357   // the query job' status.
358   if (json_obj.at(kStatus) == kSuccess) {
359     nlohmann::json query_obj;
360     if (!ParseJson(GetJsonValue<std::string>(json_obj, kResult), &query_obj)) {
361       MS_LOG(EXCEPTION) << "Parse query result error.";
362     }
363     auto kernel_name = GetJsonValue<std::string>(query_obj, kFusionOpName);
364     struct TargetJobStatus task_info;
365     QueryResultProcess(json_obj, &task_info);
366     auto target_node = job_id_to_node_[task_info.target_job_id];
367     if (task_info.job_status == kSuccess) {
368       MS_LOG(DEBUG) << "Job " << GetJsonValue<std::string>(query_obj, kJobType) << " running success.";
369       std::string build_result = GetJsonValue<std::string>(query_obj, kResult);
370       if (type == kPreCompile) {
371         build_manager_->PreTaskFinishProcess(task_info.target_job_id, build_result);
372       } else {
373         (void)build_manager_->TaskFinishProcess(task_info.target_job_id, build_result);
374       }
375       (void)success_job->emplace_back(task_info.target_job_id);
376     } else if (task_info.job_status == kFailed) {
377       if (type == kPreCompile) {
378         (void)success_job->emplace_back(task_info.target_job_id);
379         MS_LOG(WARNING) << "Single op pre build failed ,op: " << kernel_name
380                         << "\n except_msg : " << task_info.except_msg;
381       } else {
382         ResetOldTask();
383         single_processed_kernels_.clear();
384         MS_LOG(EXCEPTION) << "Single op compile failed, op: " << kernel_name
385                           << "\n except_msg : " << task_info.except_msg
386                           << "\n node trace: " << trace::DumpSourceLines(target_node);
387       }
388     }
389   } else {
390     if (type == kPreCompile) {
391       MS_LOG(WARNING) << "Query job failed.";
392       return;
393     }
394     MS_LOG(EXCEPTION) << "Query job failed.";
395   }
396 }
397 
QueryFinishJob(const std::string & job_type)398 void AscendKernelCompileManager::QueryFinishJob(const std::string &job_type) {
399   MS_EXCEPTION_IF_NULL(build_manager_);
400   size_t query_cnt = 0;
401   while (!job_list_.empty()) {
402     std::vector<int> success_job;
403     auto iter = job_list_.begin();
404     while (iter != job_list_.end()) {
405       nlohmann::json query_json;
406       auto kernel_json = iter->second;
407       JsonAssemble(kQuery, kernel_json, &query_json);
408       auto job_result = build_manager_->ProcessTbeJob(query_json);
409       query_cnt++;
410       ParseTargetJobStatus(job_type, job_result, &success_job);
411       (void)iter++;
412     }
413     for (auto k : success_job) {
414       (void)job_list_.erase(k);
415     }
416     success_job.clear();
417     if (!job_list_.empty()) {
418       if (query_cnt % KSleepInterval == 0) {
419         MS_LOG(INFO) << "Querying Parallel Compilation Job. Current Query Count: " << query_cnt;
420         (void)sleep(KSleepSeconds);
421       }
422     }
423   }
424 }
425 
QueryFusionFinishJob(KernelModMap * kernel_mode_ret)426 void AscendKernelCompileManager::QueryFusionFinishJob(KernelModMap *kernel_mode_ret) {
427   MS_EXCEPTION_IF_NULL(build_manager_);
428   MS_EXCEPTION_IF_NULL(kernel_mode_ret);
429   int build_failed_nums = 0;
430   size_t query_cnt = 0;
431   while (!job_list_.empty()) {
432     std::vector<int> success_job;
433     auto iter = job_list_.begin();
434     while (iter != job_list_.end()) {
435       nlohmann::json query_json;
436       auto kernel_json = iter->second;
437       JsonAssemble(kQuery, kernel_json, &query_json);
438       auto build_result = build_manager_->ProcessTbeJob(query_json);
439       query_cnt++;
440       auto json_obj = TurnStrToJson(build_result);
441       if (json_obj.at(kStatus) == kSuccess) {
442         struct TargetJobStatus task_info;
443         QueryResultProcess(json_obj, &task_info);
444         if (task_info.job_status == kSuccess) {
445           MS_LOG(DEBUG) << "Job " << GetJsonValue<std::string>(json_obj, kJobType) << " running success.";
446           std::string build_res = GetJsonValue<std::string>(json_obj, kResult);
447           auto kernel_mode_item = build_manager_->TaskFinishProcess(task_info.target_job_id, build_res, false);
448           if (kernel_mode_item.second != nullptr) {
449             (void)kernel_mode_ret->emplace(kernel_mode_item);
450           }
451           (void)success_job.emplace_back(task_info.target_job_id);
452         } else if (task_info.job_status == kFailed) {
453           MS_LOG(INFO) << "FusionOp compile failed, json name: " << task_info.json_name
454                        << "\n Except_msg: " << task_info.except_msg;
455           auto target_id = task_info.target_job_id;
456           (void)success_job.emplace_back(target_id);
457           build_failed_nums += 1;
458         }
459       } else {
460         MS_LOG(EXCEPTION) << "Fusion op query failed. message: " << build_result;
461       }
462       (void)iter++;
463     }
464     for (auto k : success_job) {
465       (void)job_list_.erase(k);
466     }
467     success_job.clear();
468     if (!job_list_.empty()) {
469       if (query_cnt % KSleepInterval == 0) {
470         MS_LOG(INFO) << "Querying Parallel Compilation Job. Current Query Count: " << query_cnt;
471         (void)sleep(KSleepSeconds);
472       }
473     }
474   }
475   MS_LOG(INFO) << "Compile Fusion Kernel Failed Num: " << build_failed_nums;
476 }
477 
JsonAssemble(const std::string & job_type,const nlohmann::json & src_json,nlohmann::json * dst_json)478 void AscendKernelCompileManager::JsonAssemble(const std::string &job_type, const nlohmann::json &src_json,
479                                               nlohmann::json *dst_json) {
480   MS_EXCEPTION_IF_NULL(src_json);
481   MS_EXCEPTION_IF_NULL(dst_json);
482   static size_t job_id = 0;
483   auto context_ptr = MsContext::GetInstance();
484   MS_EXCEPTION_IF_NULL(context_ptr);
485   static uint32_t source_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
486   (*dst_json)[kJobType] = job_type;
487   (*dst_json)[kJobId] = job_id++;
488   (*dst_json)[kSourceId] = source_id;
489   if (job_type == kInitialize || job_type == kFinalize) {
490     nlohmann::json job_info;
491     static auto process_num = GetProcessNum();
492     job_info[kProcessNum] = process_num;
493     job_info[kLogLevel] = GetLogLevel();
494     job_info[kEnableEvent] = false;
495     job_info[kParaDebugPath] = GetParaDebugPath();
496     job_info[kTbeImplPath] = GetTbePath();
497     job_info[kSocInfo] = src_json;
498     nlohmann::json tune_infos;
499     tune_infos[kTuneOpList] = GetTuneOpsList(",");
500     tune_infos[kTuneDumpPath] = TbeUtils::GetTuneDumpPath();
501     tune_infos[kTuneBankPath] = TbeUtils::GetBankPath();
502     job_info[kTuneInfo] = tune_infos;
503     nlohmann::json lic_infos;
504     kernel::tbe::TbeUtils::GenLicInfo(&lic_infos);
505     job_info[kLicInfo] = lic_infos;
506     (*dst_json)[kJobContent] = job_info;
507   } else if (job_type == kQuery) {
508     nlohmann::json content;
509     content[kSourceId] = src_json[kSourceId];
510     content[kJobId] = src_json[kJobId];
511     (*dst_json)[kJobContent] = content;
512   } else {
513     (*dst_json)[kJobContent] = src_json;
514   }
515 }
516 
GetAllAscendNodes(const std::shared_ptr<session::KernelGraph> & kernel_graph,std::vector<AnfNodePtr> * tbe_nodes)517 void AscendKernelCompileManager::GetAllAscendNodes(const std::shared_ptr<session::KernelGraph> &kernel_graph,
518                                                    std::vector<AnfNodePtr> *tbe_nodes) {
519   MS_EXCEPTION_IF_NULL(kernel_graph);
520   auto all_nodes = kernel_graph->execution_order();
521   for (const auto &anf_node : all_nodes) {
522     MS_EXCEPTION_IF_NULL(anf_node);
523     if (!AnfAlgo::IsRealKernel(anf_node)) {
524       continue;
525     }
526     KernelType kernel_type = AnfAlgo::GetKernelType(anf_node);
527     if (kernel_type == TBE_KERNEL) {
528       if (AnfAlgo::GetKernelMod(anf_node) == nullptr) {
529         tbe_nodes->push_back(anf_node);
530       }
531     }
532   }
533 }
534 
AscendPreBuild(const std::shared_ptr<session::KernelGraph> & kernel_graph)535 void AscendKernelCompileManager::AscendPreBuild(const std::shared_ptr<session::KernelGraph> &kernel_graph) {
536   MS_EXCEPTION_IF_NULL(kernel_graph);
537   MS_LOG(INFO) << "Single op pre build start.";
538   struct timeval start_time, end_time;
539   (void)gettimeofday(&start_time, nullptr);
540   MS_EXCEPTION_IF_NULL(build_manager_);
541   std::vector<AnfNodePtr> anf_nodes;
542   GetAllAscendNodes(kernel_graph, &anf_nodes);
543   if (anf_nodes.empty()) {
544     return;
545   }
546   auto json_creator = std::make_shared<BuildTbeJsonCreator>();
547   MS_EXCEPTION_IF_NULL(json_creator);
548   for (const auto &node : anf_nodes) {
549     MS_EXCEPTION_IF_NULL(node);
550     auto op_name = AnfAlgo::GetCNodeName(node);
551     nlohmann::json kernel_json;
552     if (!json_creator->GenJson(node, &kernel_json)) {
553       MS_LOG(EXCEPTION) << "Generate prebuild json failed, [" << op_name << ", " << node->fullname_with_scope()
554                         << "], node trace:" << trace::DumpSourceLines(node);
555     }
556     auto json_name = json_creator->GetJsonName();
557     nlohmann::json build_json;
558     JsonAssemble(kPreCompile, kernel_json, &build_json);
559     auto build_result = build_manager_->ProcessTbeJob(build_json);
560     auto json_obj = TurnStrToJson(build_result);
561     PrintCompileResult(json_obj);
562     auto task_id = GetJsonValue<int>(json_obj, kJobId);
563     build_manager_->SavePreBuildTaskInfo(task_id, node, json_name);
564     if (json_obj.at(kStatus) == kRunning) {
565       std::pair<int, nlohmann::json> pair(task_id, build_json);
566       std::pair<int, AnfNodePtr> id_node(task_id, node);
567       (void)job_list_.insert(pair);
568       (void)job_id_to_node_.insert(id_node);
569     } else if (json_obj.at(kStatus) == kSuccess) {
570       std::string build_res = GetJsonValue<std::string>(json_obj, kResult);
571       build_manager_->PreTaskFinishProcess(task_id, build_res);
572     } else {
573       MS_LOG(WARNING) << "Kernel prebuild failed, op: " << op_name << ", json_name: " << json_name;
574     }
575   }
576 
577   QueryFinishJob(kPreCompile);
578   (void)gettimeofday(&end_time, nullptr);
579   const uint64_t kUSecondInSecond = 1000000;
580   uint64_t cost = kUSecondInSecond * static_cast<uint64_t>(end_time.tv_sec - start_time.tv_sec);
581   cost += static_cast<uint64_t>(end_time.tv_usec - start_time.tv_usec);
582   MS_LOG(INFO) << "Kernel PreBuild run in " << PRIu64 << " us " << cost;
583   MS_LOG(INFO) << "Single op pre build end.";
584 }
585 
AscendSingleOpCompile(const std::vector<AnfNodePtr> & anf_nodes)586 bool AscendKernelCompileManager::AscendSingleOpCompile(const std::vector<AnfNodePtr> &anf_nodes) {
587   MS_LOG(INFO) << "Single op parallel build start";
588   MS_EXCEPTION_IF_NULL(build_manager_);
589   auto json_creator = std::make_shared<BuildTbeJsonCreator>();
590   MS_EXCEPTION_IF_NULL(json_creator);
591   std::string job_type;
592   for (const auto &node : anf_nodes) {
593     MS_EXCEPTION_IF_NULL(node);
594     if (AnfAlgo::GetKernelMod(node) != nullptr && !is_tune_flag_) {
595       continue;
596     }
597     auto op_name = AnfAlgo::GetCNodeName(node);
598     nlohmann::json kernel_json;
599     if (!json_creator->GenJson(node, &kernel_json)) {
600       MS_LOG(EXCEPTION) << "Generate compile json failed, [" << op_name << ", " << node->fullname_with_scope()
601                         << "], node trace: " << trace::DumpSourceLines(node);
602     }
603     auto json_name = json_creator->GetJsonName();
604     std::vector<size_t> in_size_list;
605     std::vector<size_t> out_size_list;
606     (void)TbeKernelBuild::GetIOSize2(kernel_json, &in_size_list, &out_size_list, node);
607     // step1: if same node has been dispitch, no need to compile
608     if (single_processed_kernels_.find(json_name) != single_processed_kernels_.end()) {
609       build_manager_->SaveSameOpInfo(node, json_name, in_size_list, out_size_list);
610       continue;
611     }
612     // step2: if node has in the cache, load the cache.
613     if (!is_tune_flag_ && op_debug_level_ != "1" &&
614         build_manager_->SearchInCache(json_name, in_size_list, out_size_list, node.get())) {
615       continue;
616     }
617     (void)single_processed_kernels_.insert(json_name);
618 
619     nlohmann::json build_json;
620     job_type = is_tune_flag_ ? kTune : kCompile;
621     JsonAssemble(job_type, kernel_json, &build_json);
622     auto build_str = build_json.dump(indent);
623     MS_LOG(DEBUG) << "Op build json file : " << build_str;
624     TbeUtils::SaveJsonInfo(json_name, build_str);
625     // save pair<task_id, node> for exception print and get node trace
626     auto task_id = GetJsonValue<int>(build_json, kJobId);
627     std::pair<int, AnfNodePtr> id_node(task_id, node);
628     (void)job_id_to_node_.insert(id_node);
629     // start compile
630     auto build_result = build_manager_->ProcessTbeJob(build_json);
631     auto json_obj = TurnStrToJson(build_result);
632     // print message of build
633     PrintCompileResult(json_obj);
634     build_manager_->SaveTaskInfo(task_id, node, json_name, in_size_list, out_size_list);
635     if (json_obj.at(kStatus) == kRunning) {
636       // job is running, save into job_list.
637       MS_LOG(DEBUG) << "Target job is running, keep it into job_list, json name: " << json_name;
638       std::pair<int, nlohmann::json> pair(task_id, build_json);
639       (void)job_list_.insert(pair);
640     } else if (json_obj.at(kStatus) == kSuccess) {
641       // job running success, save build result.
642       MS_LOG(DEBUG) << "Target job compile success, save build result, json name: " << json_name;
643       std::string build_res = GetJsonValue<std::string>(json_obj, kResult);
644       (void)build_manager_->TaskFinishProcess(task_id, build_res);
645     } else {
646       // job running failed, raise exception (only single op)
647       ResetOldTask();
648       single_processed_kernels_.clear();
649       MS_LOG(EXCEPTION) << "Kernel compile failed, operator [" << op_name << ", " << json_name
650                         << "], node trace: " << trace::DumpSourceLines(node);
651     }
652   }
653   // query job if build success
654   QueryFinishJob(job_type);
655   return build_manager_->GenSameOpKernelMod();
656 }
657 
AscendFusionOpCompile(const std::vector<FusionScopeInfo> & fusion_scopes)658 KernelModMap AscendKernelCompileManager::AscendFusionOpCompile(const std::vector<FusionScopeInfo> &fusion_scopes) {
659   MS_LOG(INFO) << "Fusion op build start";
660   KernelModMap kernel_mode_ret;
661   MS_EXCEPTION_IF_NULL(build_manager_);
662   auto json_creator = std::make_shared<FusionBuildTbeJsonCreator>();
663   MS_EXCEPTION_IF_NULL(json_creator);
664   for (const auto &fusion_scope_iter : fusion_scopes) {
665     nlohmann::json fusion_op;
666     if (!json_creator->GenJson(fusion_scope_iter, &fusion_op)) {
667       MS_LOG(WARNING) << "Generate fusion json failed, fusion info: " << fusion_scope_iter.full_name;
668       continue;
669     }
670     auto json_name = json_creator->GetJsonName();
671     std::vector<size_t> input_size_list;
672     std::vector<size_t> output_size_list;
673     if (!TbeKernelBuild::GetIOSize(fusion_op[kOpList], fusion_scope_iter.output_nodes, &input_size_list,
674                                    &output_size_list)) {
675       continue;
676     }
677     // cache
678     if (!is_tune_flag_ && op_debug_level_ != "1") {
679       auto kernel_pack = TbeUtils::SearchCache(json_name);
680       if (kernel_pack != nullptr) {
681         auto kernel_mod = build_manager_->GenKernelMod(input_size_list, output_size_list, kernel_pack);
682         if (kernel_mod != nullptr) {
683           kernel_mode_ret[fusion_scope_iter.scope_id] = kernel_mod;
684           continue;
685         }
686       }
687     }
688 
689     // same op no need build, but need wait build finish to set kernel mode
690     if (fusion_processed_kernels_.find(json_name) != fusion_processed_kernels_.end()) {
691       build_manager_->SaveSameFusionOpInfo(fusion_scope_iter.scope_id, json_name, tbe::kProcessorAiCore,
692                                            input_size_list, output_size_list);
693       continue;
694     }
695     // op has been processed
696     (void)fusion_processed_kernels_.insert(json_name);
697 
698     nlohmann::json build_json;
699     const std::string job_type = is_tune_flag_ ? kTune : kFusionCompile;
700     JsonAssemble(job_type, fusion_op, &build_json);
701     auto build_str = build_json.dump(indent);
702     MS_LOG(DEBUG) << "FusionOp build json file : " << build_str;
703     TbeUtils::SaveJsonInfo(json_name, build_str);
704     auto build_result = build_manager_->ProcessTbeJob(build_json);
705     auto json_obj = TurnStrToJson(build_result);
706     PrintCompileResult(json_obj);
707     auto task_id = GetJsonValue<int>(json_obj, kJobId);
708     fusion_op_names_[task_id] = json_name;
709     build_manager_->SaveTaskInfo(task_id, nullptr, json_name, input_size_list, output_size_list,
710                                  fusion_scope_iter.scope_id);
711     if (json_obj.at(kStatus) == kRunning) {
712       // job is running, save it into job_list.
713       std::pair<int, nlohmann::json> pair(task_id, build_json);
714       (void)job_list_.insert(pair);
715     } else if (json_obj.at(kStatus) == kSuccess) {
716       // job running success, save build result.
717       std::string build_res = GetJsonValue<std::string>(json_obj, kResult);
718       auto kernel_mode_item = build_manager_->TaskFinishProcess(task_id, build_res, false);
719       if (kernel_mode_item.second != nullptr) {
720         (void)kernel_mode_ret.emplace(kernel_mode_item);
721       }
722     }
723   }
724   // start query if job has finished
725   QueryFusionFinishJob(&kernel_mode_ret);
726   if (!build_manager_->GenSameFusionOpKernelMod(&kernel_mode_ret)) {
727     MS_LOG(INFO) << "Fusion warning: cache failed.";
728   }
729   return kernel_mode_ret;
730 }
731 
PrintInitResult(const nlohmann::json & json)732 void AscendKernelCompileManager::PrintInitResult(const nlohmann::json &json) {
733   auto job_type = GetJsonValue<std::string>(json, kJobType);
734   MS_LOG(DEBUG) << "Job: " << job_type << " result processing.";
735   // init only concern about result, but don't care about the process.
736   if (json.at(kStatus) == kFailed) {
737     PrintProcessLog(json);
738     MS_LOG(EXCEPTION) << "Job " << job_type << " running failed.";
739   }
740   MS_LOG(INFO) << "Job: " << job_type << " running success.";
741 }
742 
TbeInitialize()743 void AscendKernelCompileManager::TbeInitialize() {
744   if (tbe_init_flag_) {
745     MS_LOG(DEBUG) << "TbeInitialize already complete, no need do again";
746     return;
747   }
748   MS_LOG(INFO) << "TbeInitialize start";
749   build_manager_ = std::make_shared<ParallelBuildManager>();
750   MS_EXCEPTION_IF_NULL(build_manager_);
751   nlohmann::json init_json;
752   nlohmann::json soc_info = TbeUtils::GenSocInfo();
753   JsonAssemble(kInitialize, soc_info, &init_json);
754   auto offline_tune = (init_json[kJobContent][kSocInfo][kOfflineTune]).get<bool>();
755   op_debug_level_ = (init_json[kJobContent][kSocInfo]["op_debug_level"]).get<std::string>();
756   auto auto_tiling_mode = (init_json[kJobContent][kSocInfo]["autoTilingMode"]).get<std::string>();
757   tbe_init_flag_ = true;
758   is_tune_flag_ = offline_tune || (auto_tiling_mode != "NO_TUNE");
759 
760   auto init_str = init_json.dump();
761   MS_LOG(INFO) << "TbeInitialize json file : " << init_str;
762   TbeUtils::SaveJsonInfo(kInitialize, init_str);
763   auto init_ret = build_manager_->ProcessTbeJob(init_json);
764   auto json_ret = TurnStrToJson(init_ret);
765   PrintInitResult(json_ret);
766   MS_LOG(INFO) << "TbeInitialize end.";
767 }
768 
OpSelectAndCheckResultProcess(const nlohmann::json & json,const AnfNodePtr & node)769 std::string AscendKernelCompileManager::OpSelectAndCheckResultProcess(const nlohmann::json &json,
770                                                                       const AnfNodePtr &node) {
771   // for check supported and format select
772   MS_EXCEPTION_IF_NULL(node);
773   auto job_type = GetJsonValue<std::string>(json, kJobType);
774   auto json_name = GetJsonValue<std::string>(json, kFusionOpName);
775   if (json.at(kStatus) == kFailed) {
776     if (job_type == kCheckSupport) {
777       PrintProcessLog(json, WARNING);
778       MS_LOG(WARNING) << "Job:" << job_type << " running failed, json name:" << json_name;
779       return kFailed;
780     } else {
781       auto all_logs = GetJsonValue<std::vector<nlohmann::json>>(json, kProcessInfo);
782       auto except_msg = FilterExceptionMessage(all_logs);
783       MS_LOG(EXCEPTION) << "Job:" << job_type << " running failed, json name: " << json_name
784                         << "\n exception message:" << except_msg << "\n node trace: " << trace::DumpSourceLines(node);
785     }
786   }
787   auto res = GetJsonValue<std::string>(json, kResult);
788   if (job_type == kCheckSupport && res != kFullySupported) {
789     PrintProcessLog(json, WARNING);
790   }
791   MS_LOG(INFO) << "Job:" << job_type << " running success, " << json_name << ", get: " << res;
792   return res;
793 }
794 
AscendOpSelectFormat(const AnfNodePtr & node)795 std::string AscendKernelCompileManager::AscendOpSelectFormat(const AnfNodePtr &node) {
796   MS_EXCEPTION_IF_NULL(node);
797   auto op_name = AnfAlgo::GetCNodeName(node);
798   MS_LOG(INFO) << "Op select format start for op [" << op_name << ", " << node->fullname_with_scope() << "]";
799   MS_EXCEPTION_IF_NULL(build_manager_);
800   auto json_creator = std::make_shared<SelectTbeJsonCreator>();
801   MS_EXCEPTION_IF_NULL(json_creator);
802   nlohmann::json kernel_info;
803   nlohmann::json select_json;
804   if (!json_creator->GenJson(node, &kernel_info)) {
805     MS_LOG(EXCEPTION) << "Gen select json failed. [" << op_name << ", " << node->fullname_with_scope() << "]";
806   }
807   JsonAssemble(kSelectFormat, kernel_info, &select_json);
808   auto select_ret = build_manager_->ProcessTbeJob(select_json);
809   auto json_ret = TurnStrToJson(select_ret);
810   return OpSelectAndCheckResultProcess(json_ret, node);
811 }
812 
AscendOpCheckSupported(const AnfNodePtr & node)813 bool AscendKernelCompileManager::AscendOpCheckSupported(const AnfNodePtr &node) {
814   MS_EXCEPTION_IF_NULL(node);
815   auto full_name = node->fullname_with_scope();
816   MS_LOG(INFO) << "Check supported for op [" << full_name << "]";
817   MS_EXCEPTION_IF_NULL(build_manager_);
818   auto json_creator = std::make_shared<CheckTbeJsonCreator>();
819   MS_EXCEPTION_IF_NULL(json_creator);
820   nlohmann::json kernel_info;
821   nlohmann::json check_json;
822   if (!json_creator->GenJson(node, &kernel_info)) {
823     MS_LOG(EXCEPTION) << "Gen check supported json failed.[" << full_name
824                       << "], node trace: " << trace::DumpSourceLines(node);
825   }
826   JsonAssemble(kCheckSupport, kernel_info, &check_json);
827   auto check_ret = build_manager_->ProcessTbeJob(check_json);
828   auto json_ret = TurnStrToJson(check_ret);
829   std::string check_info = OpSelectAndCheckResultProcess(json_ret, node);
830   return check_info == kFullySupported;
831 }
832 
TbeFinalize()833 void AscendKernelCompileManager::TbeFinalize() {
834   MS_LOG(INFO) << "TbeFinalize start.";
835   if (!tbe_init_flag_) {
836     MS_LOG(DEBUG) << "TbeFinalize already complete, no need do again";
837     return;
838   }
839   build_manager_ = nullptr;
840   tbe_init_flag_ = false;
841   is_tune_flag_ = false;
842   job_list_.clear();
843   job_id_to_node_.clear();
844   single_processed_kernels_.clear();
845   fusion_processed_kernels_.clear();
846   MS_LOG(INFO) << "TbeFinalize end.";
847 }
848 
~AscendKernelCompileManager()849 AscendKernelCompileManager::~AscendKernelCompileManager() { TbeFinalize(); }
850 
851 bool AscendKernelCompileManager::tbe_init_flag_ = false;
852 bool AscendKernelCompileManager::is_tune_flag_ = false;
853 }  // namespace ascend
854 }  // namespace kernel
855 }  // namespace mindspore
856