• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020-2021 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "debug/data_dump/dump_json_parser.h"
17 #include <fstream>
18 #include "utils/log_adapter.h"
19 #include "debug/common.h"
20 #include "utils/ms_context.h"
21 #include "utils/convert_utils_base.h"
22 #include "backend/session/anf_runtime_algorithm.h"
23 #include "debug/data_dump/npy_header.h"
24 #include "debug/anf_ir_utils.h"
25 #include "utils/comm_manager.h"
26 
namespace {
// Name of the environment variable that holds the dump configuration file path.
constexpr auto kMindsporeDumpConfig = "MINDSPORE_DUMP_CONFIG";

// Top-level sections of the dump configuration json.
constexpr auto kCommonDumpSettings = "common_dump_settings";
constexpr auto kAsyncDumpSettings = "async_dump_settings";
constexpr auto kE2eDumpSettings = "e2e_dump_settings";

// Keys read from the "common_dump_settings" section.
constexpr auto kDumpMode = "dump_mode";
constexpr auto kPath = "path";
constexpr auto kNetName = "net_name";
constexpr auto kIteration = "iteration";
constexpr auto kInputOutput = "input_output";
constexpr auto kKernels = "kernels";
constexpr auto kSupportDevice = "support_device";
constexpr auto kOpDebugMode = "op_debug_mode";

// Keys read from the "e2e_dump_settings" section.
constexpr auto kEnable = "enable";
constexpr auto kTransFlag = "trans_flag";

// Accepted values of the "input_output" setting.
constexpr auto kDumpInputAndOutput = 0;
constexpr auto kDumpInputOnly = 1;
constexpr auto kDumpOutputOnly = 2;
}  // namespace
46 
47 namespace mindspore {
CheckJsonKeyExist(const nlohmann::json & content,const std::string & key)48 auto DumpJsonParser::CheckJsonKeyExist(const nlohmann::json &content, const std::string &key) {
49   auto iter = content.find(key);
50   if (iter == content.end()) {
51     MS_LOG(EXCEPTION) << "Check dump json failed, " << key << " not found";
52   }
53   return iter;
54 }
55 
// Drains whatever is still readable from the stream's file buffer and returns it as a string.
std::string GetIfstreamString(const std::ifstream &ifstream) {
  std::ostringstream contents;
  contents << ifstream.rdbuf();
  return contents.str();
}
61 
IsDumpEnabled()62 bool DumpJsonParser::IsDumpEnabled() {
63   auto single_op = common::GetEnv(kGraphOpRun);
64   auto config_path = common::GetEnv(kMindsporeDumpConfig);
65   if (config_path.empty()) {
66     return false;
67   }
68   if (!single_op.empty() && single_op == "1") {
69     MS_LOG(WARNING) << "Dump is not supported when task is not sink. Please set env GRAPH_OP_RUN to 0 to enable task "
70                        "sink, so that the data can be dumped.";
71     return false;
72   }
73   MS_LOG(INFO) << "Dump config path is " << config_path;
74 
75   auto context = MsContext::GetInstance();
76   MS_EXCEPTION_IF_NULL(context);
77   if (context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode) {
78     MS_LOG(WARNING) << "Dump is disabled in PyNative mode";
79     return false;
80   }
81   return true;
82 }
83 
Parse()84 void DumpJsonParser::Parse() {
85   std::lock_guard<std::mutex> guard(lock_);
86   if (already_parsed_) {
87     return;
88   }
89   already_parsed_ = true;
90   if (!IsDumpEnabled()) {
91     return;
92   }
93 
94   auto dump_config_file = Common::GetConfigFile(kMindsporeDumpConfig);
95   if (!dump_config_file.has_value()) {
96     MS_LOG(EXCEPTION) << "Get dump config file failed";
97   }
98 
99   std::ifstream json_file(dump_config_file.value());
100   if (!json_file.is_open()) {
101     MS_LOG(EXCEPTION) << "Dump file:" << dump_config_file.value() << " open failed."
102                       << " Errno:" << errno;
103   }
104 
105   nlohmann::json j;
106   try {
107     json_file >> j;
108   } catch (nlohmann::json::parse_error &e) {
109     MS_LOG(ERROR) << "Dump json contents:" << GetIfstreamString(json_file);
110     json_file.close();
111     MS_LOG(EXCEPTION) << "Parse dump json failed, error:" << e.what();
112   }
113 
114   // convert json to string
115   std::stringstream ss;
116   ss << j;
117   std::string cfg = ss.str();
118   json_file.close();
119   MS_LOG(INFO) << "Dump json:" << cfg;
120 
121   ParseE2eDumpSetting(j);
122   ParseCommonDumpSetting(j);
123   JudgeDumpEnabled();
124 }
125 
WriteJsonFile(const std::string & file_path,const std::ifstream & json_file)126 void WriteJsonFile(const std::string &file_path, const std::ifstream &json_file) {
127   ChangeFileMode(file_path, S_IWUSR);
128   std::ofstream json_copy(file_path);
129   if (!json_copy.is_open()) {
130     MS_LOG(EXCEPTION) << "Json file " << file_path << "open failed!";
131   }
132   json_copy << json_file.rdbuf();
133   json_copy.close();
134   ChangeFileMode(file_path, S_IRUSR);
135 }
136 
CopyDumpJsonToDir(uint32_t rank_id)137 void DumpJsonParser::CopyDumpJsonToDir(uint32_t rank_id) {
138   this->Parse();
139   if (!IsDumpEnabled()) {
140     return;
141   }
142   auto dump_config_file = Common::GetConfigFile(kMindsporeDumpConfig);
143   if (!dump_config_file.has_value()) {
144     MS_LOG(EXCEPTION) << "Get dump config file failed.";
145   }
146   std::ifstream json_file(dump_config_file.value());
147   if (async_dump_enabled_ || e2e_dump_enabled_) {
148     auto realpath =
149       Common::CreatePrefixPath(path_ + "/rank_" + std::to_string(rank_id) + "/.dump_metadata/data_dump.json");
150     if (!realpath.has_value()) {
151       MS_LOG(ERROR) << "Get real path failed in CopyDumpJsonToDir.";
152     } else {
153       WriteJsonFile(realpath.value(), json_file);
154     }
155   }
156 }
157 
CopyHcclJsonToDir(uint32_t rank_id)158 void DumpJsonParser::CopyHcclJsonToDir(uint32_t rank_id) {
159   if (!IsDumpEnabled()) {
160     return;
161   }
162   std::string config_path = common::GetEnv("MINDSPORE_HCCL_CONFIG_PATH");
163   if (config_path.empty()) {
164     config_path = common::GetEnv("RANK_TABLE_FILE");
165     if (config_path.empty()) {
166       MS_LOG(INFO) << "Get hccl json config failed.";
167       return;
168     }
169   }
170   std::ifstream json_file(config_path);
171   auto realpath = Common::CreatePrefixPath(path_ + "/rank_" + std::to_string(rank_id) + "/.dump_metadata/hccl.json");
172   if (!realpath.has_value()) {
173     MS_LOG(ERROR) << "Get real path failed in CopyHcclJsonToDir.";
174   } else {
175     WriteJsonFile(realpath.value(), json_file);
176   }
177 }
178 
CopyMSCfgJsonToDir(uint32_t rank_id)179 void DumpJsonParser::CopyMSCfgJsonToDir(uint32_t rank_id) {
180   if (!IsDumpEnabled()) {
181     return;
182   }
183   auto realpath = Common::CreatePrefixPath(path_ + "/rank_" + std::to_string(rank_id) + "/.dump_metadata/config.json");
184   if (!realpath.has_value()) {
185     MS_LOG(ERROR) << "Get real path failed in CopyMSConfigJsonToDir.";
186   } else {
187     nlohmann::json ms_info;
188     auto context = MsContext::GetInstance();
189     MS_EXCEPTION_IF_NULL(context);
190     ms_info["device_target"] = context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
191     ms_info["ms_version"] = "1.5.0";
192     const std::string file_path = realpath.value();
193     ChangeFileMode(file_path, S_IWUSR);
194     std::ofstream json_create(file_path);
195     if (!json_create.is_open()) {
196       MS_LOG(EXCEPTION) << "Json file " << file_path << "open failed!";
197     }
198     json_create << ms_info;
199     json_create.close();
200     ChangeFileMode(file_path, S_IRUSR);
201   }
202 }
203 
GetIterDumpFlag() const204 bool DumpJsonParser::GetIterDumpFlag() const { return e2e_dump_enabled_ && IsDumpIter(cur_dump_iter_); }
205 
DumpToFile(const std::string & filename,const void * data,size_t len,const ShapeVector & shape,TypeId type)206 bool DumpJsonParser::DumpToFile(const std::string &filename, const void *data, size_t len, const ShapeVector &shape,
207                                 TypeId type) {
208   if (filename.empty() || data == nullptr || len == 0) {
209     MS_LOG(ERROR) << "Incorrect parameter.";
210     return false;
211   }
212 
213   auto file_path = Common::CreatePrefixPath(filename + ".npy");
214   if (!file_path.has_value()) {
215     MS_LOG(ERROR) << "CreatePrefixPath failed.";
216     return false;
217   }
218   const std::string file_path_str = file_path.value();
219   ChangeFileMode(file_path_str, S_IWUSR);
220   std::ofstream fd(file_path_str, std::ios::out | std::ios::trunc | std::ios::binary);
221   if (!fd.is_open()) {
222     MS_LOG(EXCEPTION) << "Open file " << file_path_str << " failed." << ErrnoToString(errno);
223   }
224   std::string npy_header = GenerateNpyHeader(shape, type);
225   if (!npy_header.empty()) {
226     fd << npy_header;
227     (void)fd.write(reinterpret_cast<const char *>(data), SizeToLong(len));
228     if (fd.bad()) {
229       fd.close();
230       MS_LOG(EXCEPTION) << "Write mem to file " << file_path_str << " failed.";
231     }
232     fd.close();
233     ChangeFileMode(file_path_str, S_IRUSR);
234   }
235   return true;
236 }
237 
ParseCommonDumpSetting(const nlohmann::json & content)238 void DumpJsonParser::ParseCommonDumpSetting(const nlohmann::json &content) {
239   // async_dump is enabled by default, if e2e dump is enabled it will override this
240   auto context = MsContext::GetInstance();
241   MS_EXCEPTION_IF_NULL(context);
242   if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice) {
243     async_dump_enabled_ = true;
244   } else if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice) {
245     if (!e2e_dump_enabled_) {
246       e2e_dump_enabled_ = true;
247       trans_flag_ = true;
248     }
249   }
250 
251   auto common_dump_settings = CheckJsonKeyExist(content, kCommonDumpSettings);
252   auto dump_mode = CheckJsonKeyExist(*common_dump_settings, kDumpMode);
253   auto net_name = CheckJsonKeyExist(*common_dump_settings, kNetName);
254   auto iteration = CheckJsonKeyExist(*common_dump_settings, kIteration);
255   auto input_output = CheckJsonKeyExist(*common_dump_settings, kInputOutput);
256   auto kernels = CheckJsonKeyExist(*common_dump_settings, kKernels);
257   auto support_device = CheckJsonKeyExist(*common_dump_settings, kSupportDevice);
258 
259   nlohmann::detail::iter_impl<const nlohmann::json> op_debug_mode;
260   if (!e2e_dump_enabled_) {
261     op_debug_mode = CheckJsonKeyExist(*common_dump_settings, kOpDebugMode);
262   }
263 
264   ParseDumpMode(*dump_mode);
265   ParseDumpPath(*common_dump_settings);  // Pass in the whole json string to parse because the path field is optional.
266   ParseNetName(*net_name);
267   ParseIteration(*iteration);
268   ParseInputOutput(*input_output);
269   ParseKernels(*kernels);
270   ParseSupportDevice(*support_device);
271   if (!e2e_dump_enabled_) {
272     ParseOpDebugMode(*op_debug_mode);
273   }
274 }
275 
ParseE2eDumpSetting(const nlohmann::json & content)276 void DumpJsonParser::ParseE2eDumpSetting(const nlohmann::json &content) {
277   auto e2e_dump_setting = content.find(kE2eDumpSettings);
278   auto context = MsContext::GetInstance();
279   MS_EXCEPTION_IF_NULL(context);
280   if (e2e_dump_setting == content.end()) {
281     MS_LOG(INFO) << "No e2e_dump_settings";
282     return;
283   }
284 
285   auto e2e_dump_enable = CheckJsonKeyExist(*e2e_dump_setting, kEnable);
286   auto trans_flag = CheckJsonKeyExist(*e2e_dump_setting, kTransFlag);
287 
288   e2e_dump_enabled_ = ParseEnable(*e2e_dump_enable);
289   if (e2e_dump_enabled_ && context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice) {
290     MS_LOG(WARNING) << "Deprecated: Synchronous dump mode is deprecated and will be removed in a future release";
291   }
292   trans_flag_ = ParseEnable(*trans_flag);
293 }
294 
CheckJsonUnsignedType(const nlohmann::json & content,const std::string & key)295 void CheckJsonUnsignedType(const nlohmann::json &content, const std::string &key) {
296   if (!content.is_number_unsigned()) {
297     MS_LOG(EXCEPTION) << "Dump config parse failed, " << key << " should be unsigned int type";
298   }
299 }
300 
CheckJsonStringType(const nlohmann::json & content,const std::string & key)301 void CheckJsonStringType(const nlohmann::json &content, const std::string &key) {
302   if (!content.is_string()) {
303     MS_LOG(EXCEPTION) << "Dump config parse failed, " << key << " should be string type";
304   }
305 }
306 
CheckJsonArrayType(const nlohmann::json & content,const std::string & key)307 void CheckJsonArrayType(const nlohmann::json &content, const std::string &key) {
308   if (!content.is_array()) {
309     MS_LOG(EXCEPTION) << "Dump config parse failed, " << key << " should be array type";
310   }
311 }
312 
ParseDumpMode(const nlohmann::json & content)313 void DumpJsonParser::ParseDumpMode(const nlohmann::json &content) {
314   CheckJsonUnsignedType(content, kDumpMode);
315   dump_mode_ = content;
316   if (dump_mode_ != 0 && dump_mode_ != 1) {
317     MS_LOG(EXCEPTION) << "Dump config parse failed, dump_mode should be 0 or 1, but got " << dump_mode_;
318   }
319 }
320 
ParseDumpPath(const nlohmann::json & content)321 void DumpJsonParser::ParseDumpPath(const nlohmann::json &content) {
322   std::string dump_path;
323   auto json_iter = content.find(kPath);
324   // Check if `path` field exists in dump json file.
325   if (json_iter != content.end()) {
326     CheckJsonStringType(*json_iter, kPath);
327     dump_path = *json_iter;
328   }
329   if (dump_path.empty()) {
330     // If no path is found or path is set as empty in dump json file, use MS_DIAGNOSTIC_DATA_PATH/debug_dump as the dump
331     // path value if the env exists.
332     dump_path = common::GetEnv("MS_DIAGNOSTIC_DATA_PATH");
333     if (dump_path.empty()) {
334       MS_LOG(EXCEPTION)
335         << "Dump path is empty. Please set it in dump json file or environment variable `MS_DIAGNOSTIC_DATA_PATH`.";
336     } else {
337       dump_path += "/debug_dump";
338     }
339   }
340   path_ = dump_path;
341   if (!std::all_of(path_.begin(), path_.end(),
342                    [](char c) { return ::isalpha(c) || ::isdigit(c) || c == '-' || c == '_' || c == '/'; })) {
343     MS_LOG(EXCEPTION) << "Dump path only support alphabets, digit or {'-', '_', '/'}, but got:" << path_;
344   }
345   if (path_[0] != '/') {
346     MS_LOG(EXCEPTION) << "Dump path only support absolute path and should start with '/'";
347   }
348 }
349 
ParseNetName(const nlohmann::json & content)350 void DumpJsonParser::ParseNetName(const nlohmann::json &content) {
351   CheckJsonStringType(content, kNetName);
352   net_name_ = content;
353   if (net_name_.empty() || !std::all_of(net_name_.begin(), net_name_.end(),
354                                         [](char c) { return ::isalpha(c) || ::isdigit(c) || c == '-' || c == '_'; })) {
355     MS_LOG(EXCEPTION) << "net_name only supports alphabetic, digit, or {'-', '_'}, but got: " << net_name_;
356   }
357 }
358 
ParseIteration(const nlohmann::json & content)359 void DumpJsonParser::ParseIteration(const nlohmann::json &content) {
360   CheckJsonStringType(content, kIteration);
361   auto context = MsContext::GetInstance();
362   MS_EXCEPTION_IF_NULL(context);
363   if (e2e_dump_enabled_ || async_dump_enabled_) {
364     iteration_ = content;
365     if (iteration_.empty() || (!std::all_of(iteration_.begin(), iteration_.end(), [](char c) {
366           return ::isdigit(c) || c == '-' || c == '|';
367         }) && iteration_ != "all")) {
368       MS_LOG(EXCEPTION) << "iteration only supports digits, {'-', '|'}, or just \"all\" but got: " << iteration_;
369     }
370   } else if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kCPUDevice) {
371     MS_LOG(WARNING) << "Dump is not enabled. ";
372   } else {
373     MS_LOG(EXCEPTION) << "Dump Json Parse Failed. Async or E2E should be enabled. ";
374   }
375 }
376 
// Parses `text` as a decimal number made of digits only and stores it in `*value`.
// Values above the uint32_t range are clamped to 2^32, which still compares greater than any
// uint32_t iteration, so huge bounds neither overflow nor wrap around.
// Returns false when `text` is empty or holds a non-digit character.
static bool ParseIterationBound(const std::string &text, uint64_t *value) {
  constexpr uint64_t kSaturated = 1ULL << 32;
  if (text.empty()) {
    return false;
  }
  uint64_t parsed = 0;
  for (const char c : text) {
    if (c < '0' || c > '9') {
      return false;
    }
    parsed = parsed * 10 + static_cast<uint64_t>(c - '0');
    if (parsed > kSaturated) {
      parsed = kSaturated;
    }
  }
  *value = parsed;
  return true;
}

// Checks whether `iteration` is selected by `range`, which is either a single number ("5") or an
// inclusive interval "low-high" ("3-10").
// Returns false for an empty range, more than one dash, a missing bound or a non-numeric bound.
// Bounds larger than the uint32_t range are handled without truncation or exceptions.
bool IsIterInRange(uint32_t iteration, const std::string &range) {
  if (range.empty()) {
    return false;
  }
  const std::size_t dash_pos = range.find('-');
  // no dash in range, compare the value directly
  if (dash_pos == std::string::npos) {
    uint64_t single = 0;
    return ParseIterationBound(range, &single) && single == iteration;
  }
  // make sure there is only one dash in range
  if (range.find('-', dash_pos + 1) != std::string::npos) {
    return false;
  }
  uint64_t low_range = 0;
  uint64_t high_range = 0;
  if (!ParseIterationBound(range.substr(0, dash_pos), &low_range) ||
      !ParseIterationBound(range.substr(dash_pos + 1), &high_range)) {
    return false;
  }
  return (low_range <= iteration) && (iteration <= high_range);
}
400 
IsDumpIter(uint32_t iteration) const401 bool DumpJsonParser::IsDumpIter(uint32_t iteration) const {
402   // bool DumpJsonParser::IsDumpIter(uint32_t iteration) --> checks if iteration should be dumped or not.
403   if (iteration_ == "all") {
404     return true;
405   }
406   const std::string vertical_bar = "|";
407   std::size_t start = 0;
408   std::size_t end = iteration_.find(vertical_bar);
409   while (end != std::string::npos) {
410     std::string temp = iteration_.substr(start, end - start);
411     auto found = IsIterInRange(iteration, temp);
412     if (found) {
413       return true;
414     }
415     start = end + 1;
416     end = iteration_.find(vertical_bar, start);
417   }
418   std::string temp = iteration_.substr(start);
419   return IsIterInRange(iteration, temp);
420 }
421 
ParseInputOutput(const nlohmann::json & content)422 void DumpJsonParser::ParseInputOutput(const nlohmann::json &content) {
423   CheckJsonUnsignedType(content, kInputOutput);
424   input_output_ = content;
425   const uint32_t max_inout_num = 2;
426   if (input_output_ < 0 || input_output_ > max_inout_num) {
427     MS_LOG(EXCEPTION) << "Dump Json Parse Failed. input_output should be 0, 1, 2";
428   }
429 }
430 
ParseKernels(const nlohmann::json & content)431 void DumpJsonParser::ParseKernels(const nlohmann::json &content) {
432   CheckJsonArrayType(content, kKernels);
433 
434   for (const auto &kernel : content) {
435     auto kernel_str = kernel.dump();
436     kernel_str.erase(std::remove(kernel_str.begin(), kernel_str.end(), '\"'), kernel_str.end());
437     MS_LOG(INFO) << "Need dump kernel:" << kernel_str;
438     auto ret = kernels_.try_emplace({kernel_str, 0});
439     if (!ret.second) {
440       MS_LOG(WARNING) << "Duplicate dump kernel name:" << kernel_str;
441     }
442   }
443 }
444 
ParseSupportDevice(const nlohmann::json & content)445 void DumpJsonParser::ParseSupportDevice(const nlohmann::json &content) {
446   CheckJsonArrayType(content, kSupportDevice);
447   for (const auto &device : content) {
448     uint32_t device_id = device;
449     MS_LOG(INFO) << "Dump support device:" << device_id;
450     auto ret = support_devices_.emplace(device_id);
451     if (!ret.second) {
452       MS_LOG(WARNING) << "Duplicate support device:" << device_id;
453     }
454   }
455 }
456 
ParseEnable(const nlohmann::json & content)457 bool DumpJsonParser::ParseEnable(const nlohmann::json &content) {
458   if (!content.is_boolean()) {
459     MS_LOG(EXCEPTION) << "Dump Json Parse Failed. 'enable' should be boolean type";
460   }
461   return content;
462 }
463 
ParseOpDebugMode(const nlohmann::json & content)464 void DumpJsonParser::ParseOpDebugMode(const nlohmann::json &content) {
465   CheckJsonUnsignedType(content, kOpDebugMode);
466   op_debug_mode_ = content;
467   const size_t max_mode = 3;
468   if (op_debug_mode_ < 0 || op_debug_mode_ > max_mode) {
469     MS_LOG(EXCEPTION) << "Dump Json Parse Failed. op_debug_mode should be 0, 1, 2, 3";
470   }
471 }
472 
JsonConfigToString()473 void DumpJsonParser::JsonConfigToString() {
474   std::string cur_config;
475   cur_config.append("dump_mode:");
476   cur_config.append(std::to_string(dump_mode_));
477   cur_config.append(" path:");
478   cur_config.append(path_);
479   cur_config.append(" net_name:");
480   cur_config.append(net_name_);
481   cur_config.append(" iteration:");
482   cur_config.append(iteration_);
483   cur_config.append(" input_output:");
484   cur_config.append(std::to_string(input_output_));
485   cur_config.append("e2e_enable:");
486   cur_config.append(std::to_string(static_cast<int>(e2e_dump_enabled_)));
487   cur_config.append(" async_dump_enable:");
488   cur_config.append(std::to_string(static_cast<int>(async_dump_enabled_)));
489   MS_LOG(INFO) << cur_config;
490 }
491 
JudgeDumpEnabled()492 void DumpJsonParser::JudgeDumpEnabled() {
493   auto context = MsContext::GetInstance();
494   MS_EXCEPTION_IF_NULL(context);
495   if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice) {
496     async_dump_enabled_ = false;
497   }
498 
499   if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice) {
500     if (async_dump_enabled_ && e2e_dump_enabled_) {
501       async_dump_enabled_ = false;
502       MS_LOG(INFO) << "Disable async dump";
503     }
504   }
505 
506   if (!async_dump_enabled_ && !e2e_dump_enabled_) {
507     MS_LOG(WARNING) << "Dump json parse failed. Dump is not enabled";
508   }
509   if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) != kCPUDevice) {
510     auto device_id = context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
511     if (support_devices_.find(device_id) == support_devices_.end()) {
512       async_dump_enabled_ = false;
513       e2e_dump_enabled_ = false;
514       MS_LOG(WARNING) << "Dump is not enabled. device_id:" << device_id << " not support";
515     }
516   }
517   JsonConfigToString();
518 }
519 
NeedDump(const std::string & op_full_name) const520 bool DumpJsonParser::NeedDump(const std::string &op_full_name) const {
521   if (dump_mode_ == 0) {
522     return true;
523   }
524   auto iter = kernels_.find(op_full_name);
525   return iter != kernels_.end();
526 }
527 
MatchKernel(const std::string & kernel_name)528 void DumpJsonParser::MatchKernel(const std::string &kernel_name) {
529   auto iter = kernels_.find(kernel_name);
530   if (iter == kernels_.end()) {
531     return;
532   }
533   iter->second = iter->second + 1;
534   MS_LOG(INFO) << "Match dump kernel:" << iter->first << " match times:" << iter->second;
535 }
536 
PrintUnusedKernel()537 void DumpJsonParser::PrintUnusedKernel() {
538   if (!e2e_dump_enabled_ && !async_dump_enabled_) {
539     return;
540   }
541   for (const auto &iter : kernels_) {
542     if (iter.second == 0) {
543       MS_LOG(WARNING) << "[DataDump] Unused Kernel in json:" << iter.first;
544     }
545   }
546 }
547 
GetOpOverflowBinPath(uint32_t graph_id) const548 std::string DumpJsonParser::GetOpOverflowBinPath(uint32_t graph_id) const {
549   std::string bin_path;
550   bin_path.append(path_);
551   bin_path.append("/");
552   bin_path.append("rank_");
553 
554   uint32_t rank_id = 0;
555   auto ms_context = MsContext::GetInstance();
556   MS_EXCEPTION_IF_NULL(ms_context);
557   auto env_rank_id = common::GetEnv("RANK_ID");
558   if (ms_context->get_param<bool>(MS_CTX_ENABLE_HCCL) && !env_rank_id.empty()) {
559     // get actual rank id if it's distribution training case.
560     if (!CommManager::GetInstance().GetRankID(kHcclWorldGroup, &rank_id)) {
561       MS_LOG(INFO) << "Failed to get rank id.";
562     }
563   }
564   bin_path.append(std::to_string(rank_id));
565 
566   bin_path.append("/");
567   bin_path.append(net_name_);
568   bin_path.append("/");
569   bin_path.append(std::to_string(graph_id));
570   bin_path.append("/");
571   bin_path.append(std::to_string(cur_dump_iter_));
572   bin_path.append("/");
573 
574   return bin_path;
575 }
576 
InputNeedDump() const577 bool DumpJsonParser::InputNeedDump() const {
578   return input_output_ == kDumpInputAndOutput || input_output_ == kDumpInputOnly;
579 }
580 
OutputNeedDump() const581 bool DumpJsonParser::OutputNeedDump() const {
582   return input_output_ == kDumpInputAndOutput || input_output_ == kDumpOutputOnly;
583 }
584 
UpdateNeedDumpKernels(const session::KernelGraph & kernel_graph)585 void DumpJsonParser::UpdateNeedDumpKernels(const session::KernelGraph &kernel_graph) {
586   if (!async_dump_enabled_) {
587     return;
588   }
589   MS_LOG(INFO) << "Update async dump kernel list for hccl";
590   std::map<std::string, uint32_t> update_kernels;
591   for (const auto &kernel : kernel_graph.execution_order()) {
592     MS_EXCEPTION_IF_NULL(kernel);
593     if (AnfAlgo::GetKernelType(kernel) == HCCL_KERNEL &&
594         DumpJsonParser::GetInstance().NeedDump(GetKernelNodeName(kernel))) {
595       auto input_size = AnfAlgo::GetInputTensorNum(kernel);
596       for (size_t i = 0; i < input_size; ++i) {
597         auto input_with_index = AnfAlgo::GetPrevNodeOutput(kernel, i);
598         auto input = input_with_index.first;
599         MS_EXCEPTION_IF_NULL(input);
600         if (input->isa<CNode>()) {
601           MS_LOG(INFO) << "[AsyncDump] Match Hccl Node:" << GetKernelNodeName(kernel)
602                        << " Input:" << GetKernelNodeName(input);
603           update_kernels.try_emplace(GetKernelNodeName(input), 0);
604         }
605       }
606     }
607   }
608   kernels_.insert(update_kernels.begin(), update_kernels.end());
609 }
610 }  // namespace mindspore
611