• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020-2024 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "include/backend/debug/data_dump/dump_json_parser.h"
17 #include <algorithm>
18 #include <fstream>
19 #include "debug/data_dump/npy_header.h"
20 #include "debug/utils.h"
21 #include "include/backend/anf_runtime_algorithm.h"
22 #include "include/common/debug/anf_dump_utils.h"
23 #include "include/common/debug/common.h"
24 #include "include/common/utils/anfalgo.h"
25 #include "include/common/utils/comm_manager.h"
26 #include "mindspore/core/utils/file_utils.h"
27 #include "mindspore/core/utils/ms_utils.h"
28 #include "ops/ascend_op_name.h"
29 #include "utils/convert_utils_base.h"
30 #include "utils/log_adapter.h"
31 #include "utils/ms_context.h"
32 
namespace {
// Top-level section keys of the dump json config file.
constexpr auto kCommonDumpSettings = "common_dump_settings";
constexpr auto kE2eDumpSettings = "e2e_dump_settings";
// Keys inside "common_dump_settings".
constexpr auto kDumpMode = "dump_mode";
constexpr auto kPath = "path";
constexpr auto kNetName = "net_name";
constexpr auto kSavedData = "saved_data";
constexpr auto kIteration = "iteration";
constexpr auto kInputOutput = "input_output";
constexpr auto kKernels = "kernels";
constexpr auto kSupportDevice = "support_device";
// Keys inside "e2e_dump_settings".
constexpr auto kEnable = "enable";
constexpr auto kOpDebugMode = "op_debug_mode";
constexpr auto kTransFlag = "trans_flag";
constexpr auto kSaveArgs = "save_kernel_args";
constexpr auto kSampleMode = "sample_mode";
constexpr auto kSampleNum = "sample_num";
constexpr auto kStatCalcMode = "stat_calc_mode";
// Values for "stat_calc_mode".
constexpr auto kHost = "host";
constexpr auto kDevice = "device";
// Values for "saved_data".
constexpr auto kStatisticDump = "statistic";
constexpr auto kTensorDump = "tensor";
constexpr auto kFullDump = "full";
constexpr auto kFileFormat = "file_format";
constexpr auto kStatisticCategory = "statistic_category";
// Values for "input_output": 0 = both, 1 = inputs only, 2 = outputs only.
constexpr auto kDumpInputAndOutput = 0;
constexpr auto kDumpInputOnly = 1;
constexpr auto kDumpOutputOnly = 2;
// Environment variable pointing at the dump json config file.
constexpr auto kMindsporeDumpConfig = "MINDSPORE_DUMP_CONFIG";
constexpr auto kBracketsOffset = 1;
constexpr auto kRegexPrefixLength = 11;
// Statistic items computed when "statistic_category" is not set by the user.
const std::vector<std::string> kDefaultStatisticCategory = {"max", "min", "l2norm"};
// Statistic items supported by on-device computation.
const std::set<std::string> kDeviceStatisticCategory = {"max", "min", "avg", "l2norm"};
// Statistic items supported by on-host computation (superset of the device set).
const std::set<std::string> kHostStatisticCategory = {"max",
                                                      "min",
                                                      "avg",
                                                      "count",
                                                      "negative zero count",
                                                      "positive zero count",
                                                      "nan count",
                                                      "negative inf count",
                                                      "positive inf count",
                                                      "zero count",
                                                      "md5",
                                                      "l2norm"};
// Human-readable lists of the categories above, used in error/warning messages.
// NOTE(review): identifier spelling "Statisticsategory" (missing 'C') is kept as-is;
// it is referenced elsewhere in this file.
constexpr auto kDeviceStatisticsategory = "['max', 'min', 'avg', 'l2norm']";
constexpr auto kSupportedStatisticsategory =
  "['max', 'min', 'avg', 'count', 'negative zero count', 'positive zero count', 'nan count', 'negative inf count', "
  "'positive inf count', 'zero count', 'md5', 'l2norm']";
}  // namespace
83 
84 namespace mindspore {
CheckJsonKeyExist(const nlohmann::json & content,const std::string & key)85 auto DumpJsonParser::CheckJsonKeyExist(const nlohmann::json &content, const std::string &key) {
86   nlohmann::json::const_iterator iter = content.find(key);
87   if (iter == content.end()) {
88     MS_LOG(EXCEPTION) << "Check dump json failed, " << key << " not found";
89   }
90   return iter;
91 }
92 
CheckSelectableKeyExist(const nlohmann::json & content,const std::string & key)93 bool DumpJsonParser::CheckSelectableKeyExist(const nlohmann::json &content, const std::string &key) {
94   nlohmann::json::const_iterator iter = content.find(key);
95   if (iter == content.end()) {
96     return false;
97   }
98   return true;
99 }
100 
// Read the remaining content of an input file stream into a string.
// rdbuf() is callable on a const stream and streaming the buffer consumes it
// from the current get position.
std::string GetIfstreamString(const std::ifstream &ifstream) {
  std::ostringstream contents;
  contents << ifstream.rdbuf();
  return contents.str();
}
106 
IsDumpEnabled()107 bool DumpJsonParser::IsDumpEnabled() {
108   auto config_path = common::GetEnv(kMindsporeDumpConfig);
109   if (config_path.empty()) {
110     return false;
111   }
112   MS_LOG(INFO) << "Dump config path is " << config_path;
113 
114   auto context = MsContext::GetInstance();
115   MS_EXCEPTION_IF_NULL(context);
116   if (context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode &&
117       context->get_param<std::string>(MS_CTX_DEVICE_TARGET) != kAscendDevice) {
118     MS_LOG(EXCEPTION) << "In GPU or CPU, Dump is disabled in PyNative mode. Please set mode to GRAPH_MODE in context.";
119   }
120   if (context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode &&
121       context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice && e2e_dump_enabled_) {
122     MS_LOG(EXCEPTION) << "Dump is only support asynchronous for Ascend in PyNative mode.";
123   }
124   return true;
125 }
126 
PyNativeModeCheck()127 void DumpJsonParser::PyNativeModeCheck() {
128   auto context = MsContext::GetInstance();
129   MS_EXCEPTION_IF_NULL(context);
130   if (context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode &&
131       dump_mode_ == static_cast<uint32_t>(DUMP_KERNELS_WITH_FLAG)) {
132     MS_LOG(EXCEPTION) << "Cell dump is only supported in GRAPH mode. Please set dump_mode to 0 or 1 in PyNative mode.";
133   }
134 }
135 
CheckE2eSetting()136 void DumpJsonParser::CheckE2eSetting() {
137   auto context = MsContext::GetInstance();
138   MS_EXCEPTION_IF_NULL(context);
139   if (e2e_dump_enabled()) {
140     if (!context->IsKByKExecutorMode()) {
141       MS_LOG(WARNING) << "E2e dump only support kernel by kernel mode on Ascend platform.";
142     }
143     CheckStatCalcModeVaild();
144   } else {
145     if (dump_mode_ == static_cast<uint32_t>(DUMP_KERNELS_WITH_FLAG)) {
146       MS_LOG(EXCEPTION) << "Cell dump only support e2e dump mode. Please set dump_mode to 0 or 1.";
147     }
148   }
149 }
150 
151 /*
152  * Feature group: Dump.
153  * Target device group: Ascend, GPU and CPU.
154  * Runtime category: Old runtime, MindRT.
155  * Description: Parse the configuration option in dump json file pointed by environment variable MINDSPORE_DUMP_CONFIG.
156  */
Parse()157 void DumpJsonParser::Parse() {
158   std::lock_guard<std::mutex> guard(lock_);
159   if (already_parsed_) {
160     return;
161   }
162   already_parsed_ = true;
163   if (!IsDumpEnabled()) {
164     return;
165   }
166 
167   auto dump_config_file = Common::GetConfigFile(kMindsporeDumpConfig);
168   if (!dump_config_file.has_value()) {
169     MS_LOG(EXCEPTION) << "Get dump config file failed";
170   }
171 
172   std::ifstream json_file(dump_config_file.value());
173   if (!json_file.is_open()) {
174     MS_LOG(EXCEPTION) << "Dump file:" << dump_config_file.value() << " open failed. Errno:" << errno;
175   }
176 
177   nlohmann::json j;
178   try {
179     json_file >> j;
180   } catch (nlohmann::json::parse_error &e) {
181     MS_LOG(ERROR) << "Dump json contents:" << GetIfstreamString(json_file);
182     json_file.close();
183     MS_LOG(EXCEPTION) << "Parse dump json failed, error:" << e.what();
184   }
185 
186   // convert json to string
187   std::stringstream ss;
188   ss << j;
189   std::string cfg = ss.str();
190   json_file.close();
191   MS_LOG(INFO) << "Dump json:" << cfg;
192 
193   ParseE2eDumpSetting(j);
194   ParseCommonDumpSetting(j);
195   PyNativeModeCheck();
196   CheckE2eSetting();
197   JudgeDumpEnabled();
198   CheckStatCalcModeVaild();
199   ParseStatisticCategory(j);
200 }
201 
// Parse the optional "statistic_category" list. Items are validated against
// the device- or host-supported category sets depending on stat_calc_mode;
// invalid items raise, host-only items requested in device mode only warn.
void DumpJsonParser::ParseStatisticCategory(const nlohmann::json &content) {
  // Only meaningful when statistics are dumped at all.
  if (!IsStatisticDump()) {
    return;
  }
  auto common_dump_settings = CheckJsonKeyExist(content, kCommonDumpSettings);
  auto set_statistic_category = CheckSelectableKeyExist(*common_dump_settings, kStatisticCategory);
  if (set_statistic_category) {
    auto user_statistics = CheckJsonKeyExist(*common_dump_settings, kStatisticCategory);
    CheckJsonArrayType(*user_statistics, kStatisticCategory);
    // Accumulates items supported nowhere; non-empty => fatal at the end.
    std::string unsupported_items = "";
    if (IsDeviceCalcStats()) {
      // Accumulates items valid on host but not on device; warning only.
      std::string device_unsupported_items = "";
      for (const auto &statistic_item_json : *user_statistics) {
        std::string statistic_item = statistic_item_json;
        auto rt_find = kDeviceStatisticCategory.find(statistic_item);
        if (rt_find == kDeviceStatisticCategory.end()) {
          auto in_host_category = kHostStatisticCategory.find(statistic_item);
          if (in_host_category == kHostStatisticCategory.end()) {
            unsupported_items += statistic_item + ", ";
          } else {
            device_unsupported_items += statistic_item + ", ";
          }
        } else {
          statistic_category_.push_back(statistic_item);
          MS_LOG(INFO) << "The item: " << statistic_item
                       << " is a valid statistic category, it will be computed on device.";
        }
      }
      if (!device_unsupported_items.empty()) {
        MS_LOG(WARNING) << "The following statistic_category only support to be compute on host:"
                        << device_unsupported_items
                        << "the valid statistic_category on device are as follows:" << kDeviceStatisticsategory;
      }
    } else {
      // Host mode: any item in the host set is accepted.
      for (const auto &statistic_item_json : *user_statistics) {
        std::string statistic_item = statistic_item_json;
        auto rt_find = kHostStatisticCategory.find(statistic_item);
        if (rt_find == kHostStatisticCategory.end()) {
          unsupported_items += statistic_item + ", ";
        } else {
          statistic_category_.push_back(statistic_item);
          MS_LOG(INFO) << "The item: " << statistic_item
                       << " is a valid statistic category, it will be computed on host.";
        }
      }
    }
    if (!unsupported_items.empty()) {
      MS_LOG(EXCEPTION) << "The following statistic_category is invalid:" << unsupported_items
                        << "the valid statistic_category are as follows:" << kSupportedStatisticsategory;
    }
  } else {
    // Key absent: fall back to the default category list.
    statistic_category_ = kDefaultStatisticCategory;
    MS_LOG(INFO) << "Statistic category is not set, use the default items as follows:";
    for (auto &itm : kDefaultStatisticCategory) {
      MS_LOG(INFO) << itm;
    }
  }
  // Publish the chosen categories as the CSV header for statistic dumps.
  CsvHeaderUtil::GetInstance().SetStatCsvHeader(statistic_category_);
}
261 
WriteJsonFile(const std::string & file_path,const std::ifstream & json_file)262 void WriteJsonFile(const std::string &file_path, const std::ifstream &json_file) {
263   ChangeFileMode(file_path, S_IWUSR);
264   std::ofstream json_copy(file_path);
265   if (!json_copy.is_open()) {
266     MS_LOG(EXCEPTION) << "Json file " << file_path << "open failed!";
267   }
268   json_copy << json_file.rdbuf();
269   json_copy.close();
270   ChangeFileMode(file_path, S_IRUSR);
271 }
272 
273 /*
274  * Feature group: Dump.
275  * Target device group: Ascend, GPU and CPU.
276  * Runtime category: Old runtime, MindRT.
277  * Description: Copy the dump configuration file to the root directory of dump path.
278  */
CopyDumpJsonToDir(uint32_t rank_id)279 void DumpJsonParser::CopyDumpJsonToDir(uint32_t rank_id) {
280   this->Parse();
281   if (!IsDumpEnabled()) {
282     return;
283   }
284   auto dump_config_file = Common::GetConfigFile(kMindsporeDumpConfig);
285   if (!dump_config_file.has_value()) {
286     MS_LOG(EXCEPTION) << "Get dump config file failed.";
287   }
288   std::ifstream json_file(dump_config_file.value());
289   if (async_dump_enabled_ || e2e_dump_enabled_) {
290     auto realpath =
291       Common::CreatePrefixPath(path_ + "/rank_" + std::to_string(rank_id) + "/.dump_metadata/data_dump.json");
292     if (!realpath.has_value()) {
293       MS_LOG(ERROR) << "Get real path failed in CopyDumpJsonToDir.";
294     } else {
295       if (!Common::FileExists(realpath.value())) {
296         WriteJsonFile(realpath.value(), json_file);
297       } else {
298         MS_LOG(WARNING) << "The file: " << realpath.value() << " is already exist, skip copy it.";
299       }
300     }
301   }
302 }
303 
304 /*
305  * Feature group: Dump.
306  * Target device group: Ascend.
307  * Runtime category: Old runtime, MindRT.
308  * Description: Copy the hccl configuration file to the root directory of dump path.
309  */
CopyHcclJsonToDir(uint32_t rank_id)310 void DumpJsonParser::CopyHcclJsonToDir(uint32_t rank_id) {
311   if (!IsDumpEnabled()) {
312     return;
313   }
314   std::string config_path = common::GetEnv("MINDSPORE_HCCL_CONFIG_PATH");
315   if (config_path.empty()) {
316     config_path = common::GetEnv("RANK_TABLE_FILE");
317     if (config_path.empty()) {
318       MS_LOG(INFO) << "Get hccl json config failed.";
319       return;
320     }
321   }
322   std::ifstream json_file(config_path);
323   auto realpath = Common::CreatePrefixPath(path_ + "/rank_" + std::to_string(rank_id) + "/.dump_metadata/hccl.json");
324   if (!realpath.has_value()) {
325     MS_LOG(ERROR) << "Get real path failed in CopyHcclJsonToDir.";
326   } else {
327     WriteJsonFile(realpath.value(), json_file);
328   }
329 }
330 
331 /*
332  * Feature group: Dump.
333  * Target device group: Ascend, GPU and CPU.
334  * Runtime category: Old runtime, MindRT.
335  * Description: Copy the mindspore configuration file to the root directory of dump path. It provides the device and
336  * ms_version information.
337  */
CopyMSCfgJsonToDir(uint32_t rank_id)338 void DumpJsonParser::CopyMSCfgJsonToDir(uint32_t rank_id) {
339   if (!IsDumpEnabled()) {
340     return;
341   }
342   auto realpath = Common::CreatePrefixPath(path_ + "/rank_" + std::to_string(rank_id) + "/.dump_metadata/config.json");
343   if (!realpath.has_value()) {
344     MS_LOG(ERROR) << "Get real path failed in CopyMSConfigJsonToDir.";
345   } else {
346     if (Common::FileExists(realpath.value())) {
347       MS_LOG(WARNING) << "The file: " << realpath.value() << " is already exist, skip copy it.";
348       return;
349     }
350     nlohmann::json ms_info;
351     auto context = MsContext::GetInstance();
352     MS_EXCEPTION_IF_NULL(context);
353     ms_info["device_target"] = context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
354     ms_info["ms_version"] = MSVERSION;
355     const std::string file_path = realpath.value();
356     ChangeFileMode(file_path, S_IWUSR);
357     std::ofstream json_create(file_path);
358     if (!json_create.is_open()) {
359       MS_LOG(EXCEPTION) << "Json file " << file_path << "open failed!";
360     }
361     json_create << ms_info;
362     json_create.close();
363     ChangeFileMode(file_path, S_IRUSR);
364   }
365 }
366 
GetIterDumpFlag() const367 bool DumpJsonParser::GetIterDumpFlag() const { return e2e_dump_enabled_ && IsDumpIter(cur_dump_iter_); }
368 
DumpEnabledForIter() const369 bool DumpJsonParser::DumpEnabledForIter() const {
370   return ((e2e_dump_enabled_ || async_dump_enabled_) && IsDumpIter(cur_dump_iter_));
371 }
372 
373 /*
374  * Feature group: Dump.
375  * Target device group: Ascend, GPU and CPU.
376  * Runtime category: Old runtime, MindRT.
377  * Description: Dump data in the given address into npy file.
378  */
/*
 * Feature group: Dump.
 * Target device group: Ascend, GPU and CPU.
 * Runtime category: Old runtime, MindRT.
 * Description: Dump data in the given address into npy file.
 */
// Writes `len` bytes at `data` as "<filename>.npy" with a generated npy header.
// If the path is too long, Common::MappingName shortens it and the
// (short -> original) pair is appended to mapping.csv under lock_.
// Returns false on recoverable failures; raises on unwritable output.
bool DumpJsonParser::DumpToFile(const std::string &filename, const void *data, size_t len, const ShapeVector &shape,
                                TypeId type) {
  // Distinguish the three bad-argument cases for clearer diagnostics.
  if (filename.empty() && (data == nullptr || len == 0)) {
    MS_LOG(ERROR) << "Filename and data are empty or null.";
    return false;
  } else if (filename.empty()) {
    MS_LOG(ERROR) << "Filename is empty.";
    return false;
  } else if (data == nullptr || len == 0) {
    MS_LOG(WARNING) << "Data is empty or null for file: " << filename;
    return false;
  }
  std::string npy_header = GenerateNpyHeader(shape, type);
  if (npy_header.empty()) {
    MS_LOG(WARNING) << "Failed to generate npy_header for file: " << filename;
    return false;
  }
  std::string npy_suffix = ".npy";
  std::string origin_file_path = filename + npy_suffix;
  std::optional<std::string> prefix_path;
  std::optional<std::string> origin_name;
  std::optional<std::string> mapped_name;
  // need_map is true when the original name must be replaced by a shorter
  // mapped name (e.g. path-length limits); the outputs are filled either way.
  bool need_map = Common::MappingName(origin_file_path, &prefix_path, &origin_name, &mapped_name);
  if (!prefix_path.has_value() || !origin_name.has_value() || !mapped_name.has_value()) {
    MS_LOG(ERROR) << "Cannot get prefix_path or file_name from: " << origin_file_path;
    return false;
  }
  std::string final_file_path = origin_file_path;
  if (need_map) {
    std::string origin_name_str = origin_name.value();
    std::string mapped_name_str = mapped_name.value();
    // mapping.csv is shared by all dump threads; serialize the append.
    std::lock_guard<std::mutex> guard(lock_);
    auto mapping_file = Common::CreatePrefixPath(prefix_path.value() + "/mapping.csv");
    if (!mapping_file.has_value()) {
      MS_LOG(ERROR) << "CreatePrefixPath for mapping.csv failed.";
      return false;
    }
    const std::string mapping_file_str = mapping_file.value();
    // try to open file
    ChangeFileMode(mapping_file_str, S_IWUSR);
    std::ofstream fout(mapping_file_str, std::ofstream::app);
    if (!fout.is_open()) {
      MS_LOG(WARNING) << "Open file for mapping.csv failed.";
      return false;
    }
    fout << mapped_name_str << "," << origin_name_str << "\n";
    fout.close();
    ChangeFileMode(mapping_file_str, S_IRUSR);
    final_file_path = prefix_path.value() + "/" + mapped_name_str;
  }
  auto file_path = Common::CreatePrefixPath(final_file_path);
  if (!file_path.has_value()) {
    MS_LOG(ERROR) << "CreatePrefixPath failed.";
    return false;
  }
  const std::string file_path_str = file_path.value();
  MS_LOG(INFO) << "Dump path is " << file_path_str;
  // Output files are kept read-only between writes; open a write window.
  ChangeFileMode(file_path_str, S_IWUSR);

  MSLogTime msTime;
  msTime.Start();
  std::ofstream fd(file_path_str, std::ios::out | std::ios::trunc | std::ios::binary);
  if (!fd.is_open()) {
    MS_LOG(EXCEPTION) << "Open file " << file_path_str << " failed." << ErrnoToString(errno);
  }
  fd << npy_header;
  (void)fd.write(reinterpret_cast<const char *>(data), SizeToLong(len));
  if (fd.bad()) {
    fd.close();
    MS_LOG(EXCEPTION)
      << "Write mem to file " << file_path_str
      << " failed. This error may be caused by insufficient disk space. Please check the available disk space.";
  }
  fd.close();
  msTime.End();
  MS_LOG(DEBUG) << "Dump file costs time : " << msTime.GetRunTimeUS() << " microseconds.";

  // Restore read-only mode on the finished file.
  ChangeFileMode(file_path_str, S_IRUSR);
  return true;
}
459 
// Parse the "common_dump_settings" section. Must run after ParseE2eDumpSetting,
// since several fields are mandatory only when e2e dump is disabled.
void DumpJsonParser::ParseCommonDumpSetting(const nlohmann::json &content) {
  // async_dump is enabled by default, if e2e dump is enabled it will override this
  auto context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context);
  if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice) {
    async_dump_enabled_ = true;
  } else if (!e2e_dump_enabled_) {
    // Non-Ascend targets fall back to e2e dump with these defaults.
    e2e_dump_enabled_ = true;
    trans_flag_ = true;
    sample_mode_ = 0;
    sample_num_ = 100;
  }

  // Mandatory fields; each CheckJsonKeyExist raises when the key is missing.
  auto common_dump_settings = CheckJsonKeyExist(content, kCommonDumpSettings);
  auto dump_mode = CheckJsonKeyExist(*common_dump_settings, kDumpMode);
  auto net_name = CheckJsonKeyExist(*common_dump_settings, kNetName);
  auto iteration = CheckJsonKeyExist(*common_dump_settings, kIteration);
  auto input_output = CheckJsonKeyExist(*common_dump_settings, kInputOutput);
  auto kernels = CheckJsonKeyExist(*common_dump_settings, kKernels);
  auto support_device = CheckJsonKeyExist(*common_dump_settings, kSupportDevice);

  // op_debug_mode is mandatory for async dump but optional for e2e dump; the
  // iterator stays default-constructed (and is never dereferenced) when the
  // key is absent in the e2e case.
  nlohmann::detail::iter_impl<const nlohmann::json> op_debug_mode;
  if (!e2e_dump_enabled_) {
    op_debug_mode = CheckJsonKeyExist(*common_dump_settings, kOpDebugMode);
  } else {
    if (CheckSelectableKeyExist(*common_dump_settings, kOpDebugMode)) {
      op_debug_mode = CheckJsonKeyExist(*common_dump_settings, kOpDebugMode);
    }
  }

  ParseDumpMode(*dump_mode);
  ParseDumpPath(*common_dump_settings);  // Pass in the whole json string to parse because the path field is optional.
  ParseNetName(*net_name);
  ParseIteration(*iteration);
  ParseInputOutput(*input_output);
  ParseKernels(*kernels);
  ParseSupportDevice(*support_device);
  if (!e2e_dump_enabled_) {
    ParseOpDebugMode(*op_debug_mode);
    ParseFileFormat(
      *common_dump_settings);  // Pass in the whole json string to parse because file_format field is optional.
  } else {
    if (CheckSelectableKeyExist(*common_dump_settings, kOpDebugMode)) {
      ParseOpDebugMode(*op_debug_mode);
    }
  }
  ParseSavedData(*common_dump_settings);  // saved data optional
}
508 
ParseE2eDumpSetting(const nlohmann::json & content)509 void DumpJsonParser::ParseE2eDumpSetting(const nlohmann::json &content) {
510   auto e2e_dump_setting = content.find(kE2eDumpSettings);
511   auto context = MsContext::GetInstance();
512   MS_EXCEPTION_IF_NULL(context);
513   if (e2e_dump_setting == content.end()) {
514     MS_LOG(INFO) << "No e2e_dump_settings";
515     return;
516   }
517 
518   auto e2e_dump_enable = CheckJsonKeyExist(*e2e_dump_setting, kEnable);
519   auto trans_flag = CheckJsonKeyExist(*e2e_dump_setting, kTransFlag);
520 
521   if (CheckSelectableKeyExist(*e2e_dump_setting, kSaveArgs)) {
522     auto save_args_flag = CheckJsonKeyExist(*e2e_dump_setting, kSaveArgs);
523     save_args_flag_ = ParseEnable(*save_args_flag);
524   }
525   e2e_dump_enabled_ = ParseEnable(*e2e_dump_enable);
526   trans_flag_ = ParseEnable(*trans_flag);
527   ParseStatCalcMode(*e2e_dump_setting);
528   if (CheckSelectableKeyExist(*e2e_dump_setting, kSampleMode)) {
529     auto sample_mode = CheckJsonKeyExist(*e2e_dump_setting, kSampleMode);
530     ParseSampleMode(*sample_mode);
531     if (CheckSelectableKeyExist(*e2e_dump_setting, kSampleNum) &&
532         sample_mode_ == static_cast<uint32_t>(DUMP_HEAD_AND_TAIL)) {
533       auto sample_num = CheckJsonKeyExist(*e2e_dump_setting, kSampleNum);
534       ParseSampleNum(*sample_num);
535     }
536   }
537 }
538 
CheckJsonUnsignedType(const nlohmann::json & content,const std::string & key)539 void CheckJsonUnsignedType(const nlohmann::json &content, const std::string &key) {
540   if (!content.is_number_unsigned()) {
541     MS_LOG(EXCEPTION) << "Dump config parse failed, " << key << " should be unsigned int type";
542   }
543 }
544 
CheckJsonStringType(const nlohmann::json & content,const std::string & key)545 void CheckJsonStringType(const nlohmann::json &content, const std::string &key) {
546   if (!content.is_string()) {
547     MS_LOG(EXCEPTION) << "Dump config parse failed, " << key << " should be string type";
548   }
549 }
550 
CheckJsonArrayType(const nlohmann::json & content,const std::string & key)551 void CheckJsonArrayType(const nlohmann::json &content, const std::string &key) {
552   if (!content.is_array()) {
553     MS_LOG(EXCEPTION) << "Dump config parse failed, " << key << " should be array type";
554   }
555 }
556 
ParseDumpMode(const nlohmann::json & content)557 void DumpJsonParser::ParseDumpMode(const nlohmann::json &content) {
558   auto context = MsContext::GetInstance();
559   MS_EXCEPTION_IF_NULL(context);
560   CheckJsonUnsignedType(content, kDumpMode);
561   dump_mode_ = content;
562   if (dump_mode_ > static_cast<uint32_t>(DUMP_KERNELS_WITH_FLAG)) {
563     MS_LOG(EXCEPTION) << "Dump config parse failed, dump_mode should be 0, 1 or 2, but got " << dump_mode_;
564   }
565   if (dump_mode_ == static_cast<uint32_t>(DUMP_KERNELS_WITH_FLAG)) {
566     if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) != kAscendDevice) {
567       MS_LOG(EXCEPTION) << "Cell dump is only supported in Ascend async dump. Please set dump_mode to 0 or 1.";
568     }
569   }
570 }
571 
ParseDumpPath(const nlohmann::json & content)572 void DumpJsonParser::ParseDumpPath(const nlohmann::json &content) {
573   std::string dump_path;
574   auto json_iter = content.find(kPath);
575   // Check if `path` field exists in dump json file.
576   if (json_iter != content.end()) {
577     CheckJsonStringType(*json_iter, kPath);
578     dump_path = *json_iter;
579   }
580   if (dump_path.empty()) {
581     // If no path is found or path is set as empty in dump json file, use MS_DIAGNOSTIC_DATA_PATH/debug_dump as the dump
582     // path value if the env exists.
583     dump_path = common::GetEnv("MS_DIAGNOSTIC_DATA_PATH");
584     if (dump_path.empty()) {
585       MS_LOG(EXCEPTION)
586         << "Dump path is empty. Please set it in dump json file or environment variable `MS_DIAGNOSTIC_DATA_PATH`.";
587     } else {
588       dump_path += "/debug_dump";
589     }
590   }
591   path_ = dump_path;
592   if (!std::all_of(path_.begin(), path_.end(),
593                    [](char c) { return ::isalpha(c) || ::isdigit(c) || c == '-' || c == '_' || c == '/'; })) {
594     MS_LOG(EXCEPTION) << "Dump path only support alphabets, digit or {'-', '_', '/'}, but got:" << path_;
595   }
596   if (path_[0] != '/') {
597     MS_LOG(EXCEPTION) << "Dump path only support absolute path and should start with '/'";
598   }
599 }
600 
ParseNetName(const nlohmann::json & content)601 void DumpJsonParser::ParseNetName(const nlohmann::json &content) {
602   CheckJsonStringType(content, kNetName);
603   net_name_ = content;
604   if (net_name_.empty() || !std::all_of(net_name_.begin(), net_name_.end(),
605                                         [](char c) { return ::isalpha(c) || ::isdigit(c) || c == '-' || c == '_'; })) {
606     MS_LOG(EXCEPTION) << "net_name only supports alphabetic, digit, or {'-', '_'}, but got: " << net_name_;
607   }
608 }
609 
ParseSavedData(const nlohmann::json & content)610 void DumpJsonParser::ParseSavedData(const nlohmann::json &content) {
611   saved_data_ = kTensorDump;  // default to tensor data dump
612   auto json_iter = content.find(kSavedData);
613   if (json_iter != content.end()) {
614     CheckJsonStringType(*json_iter, kSavedData);
615     saved_data_ = *json_iter;
616   }
617   if (e2e_dump_enabled_ && op_debug_mode_ == static_cast<uint32_t>(DUMP_LITE_EXCEPTION) && saved_data_ != kTensorDump) {
618     MS_LOG(WARNING) << "E2e exception dump only support save tensor, saved_data is set to tensor";
619     saved_data_ = kTensorDump;
620   }
621   if (saved_data_ != kStatisticDump && saved_data_ != kTensorDump && saved_data_ != kFullDump) {
622     MS_LOG(EXCEPTION) << "Dump Json parse failed, saved_data only supports statistic, tensor, or full, but got: "
623                       << saved_data_ << ". Please set saved_data to either statistic, tensor, or full";
624   }
625   auto context = MsContext::GetInstance();
626   if (IsStatisticDump() && context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kCPUDevice) {
627     MS_LOG(EXCEPTION) << "Dump Json parse failed, storing statistic dump is only supported on GPU and Ascend, please "
628                          "set saved_data to tensor or use a GPU or Ascend device";
629   }
630   if (IsStatisticDump() && context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice) {
631     if (!IsNpyFormat() && !e2e_dump_enabled_) {
632       MS_LOG(EXCEPTION) << "Dump Json parse failed, storing statistic dump is only supported on Ascend when "
633                            "file_format is set to 'npy'.";
634     }
635   }
636 }
637 
ParseIteration(const nlohmann::json & content)638 void DumpJsonParser::ParseIteration(const nlohmann::json &content) {
639   CheckJsonStringType(content, kIteration);
640   auto context = MsContext::GetInstance();
641   MS_EXCEPTION_IF_NULL(context);
642   if (e2e_dump_enabled_ || async_dump_enabled_) {
643     iteration_ = content;
644     if (iteration_.empty() || (!std::all_of(iteration_.begin(), iteration_.end(), [](char c) {
645           return ::isdigit(c) || c == '-' || c == '|';
646         }) && iteration_ != "all")) {
647       MS_LOG(EXCEPTION) << "iteration only supports digits, {'-', '|'}, or just \"all\" but got: " << iteration_;
648     }
649   } else if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kCPUDevice) {
650     MS_LOG(WARNING) << "Dump is not enabled. ";
651   } else {
652     MS_LOG(EXCEPTION) << "Dump Json Parse Failed. Async or E2E should be enabled. ";
653   }
654 }
655 
IsIterInRange(uint32_t iteration,const std::string & range)656 bool IsIterInRange(uint32_t iteration, const std::string &range) {
657   if (range.empty()) {
658     return false;
659   }
660   const std::string dash = "-";
661   std::size_t range_idx = range.find(dash);
662   // no dash in range, compare the value directly
663   if (range_idx == std::string::npos) {
664     size_t range_d = 0;
665     if (!CheckStoul(&range_d, range)) {
666       MS_LOG(INFO) << "Failed to convert the single step range: " << range
667                    << " into an integer, so the iteration: " << iteration << " is regarded as not in dump range.";
668       return false;
669     }
670     return iteration == range_d;
671   }
672   // make sure there is only one dash in range
673   if (range.find(dash, range_idx + 1) != std::string::npos) {
674     return false;
675   }
676   auto low_range_str = range.substr(0, range_idx);
677   auto high_range_str = range.substr(range_idx + 1);
678   if (low_range_str.empty() || high_range_str.empty()) {
679     return false;
680   }
681   size_t low_range = 0;
682   if (!CheckStoul(&low_range, low_range_str)) {
683     MS_LOG(INFO) << "Failed to convert the low_range_str: " << low_range_str
684                  << " into an integer, so the iteration: " << iteration << " is regarded as not in dump range.";
685     return false;
686   }
687   size_t high_range = 0;
688   if (!CheckStoul(&high_range, high_range_str)) {
689     MS_LOG(INFO) << "Failed to convert the high_range_str: " << high_range_str
690                  << " into an integer, so the iteration: " << iteration << " is regarded as not in dump range.";
691     return false;
692   }
693   return (low_range <= iteration) && (iteration <= high_range);
694 }
695 
IsStatisticDump() const696 bool DumpJsonParser::IsStatisticDump() const { return saved_data_ == kStatisticDump || IsFullDump(); }
697 
IsTensorDump() const698 bool DumpJsonParser::IsTensorDump() const { return saved_data_ == kTensorDump || IsFullDump(); }
699 
IsFullDump() const700 bool DumpJsonParser::IsFullDump() const { return saved_data_ == kFullDump; }
701 
IsNpyFormat() const702 bool DumpJsonParser::IsNpyFormat() const { return file_format_ == JsonFileFormat::FORMAT_NPY; }
703 
IsDumpIter(uint32_t iteration) const704 bool DumpJsonParser::IsDumpIter(uint32_t iteration) const {
705   // bool DumpJsonParser::IsDumpIter(uint32_t iteration) --> checks if iteration should be dumped or not.
706   if (iteration_ == "all") {
707     return true;
708   }
709   const std::string vertical_bar = "|";
710   std::size_t start = 0;
711   std::size_t end = iteration_.find(vertical_bar);
712   while (end != std::string::npos) {
713     std::string temp = iteration_.substr(start, end - start);
714     auto found = IsIterInRange(iteration, temp);
715     if (found) {
716       return true;
717     }
718     start = end + 1;
719     end = iteration_.find(vertical_bar, start);
720   }
721   std::string temp = iteration_.substr(start);
722   return IsIterInRange(iteration, temp);
723 }
724 
ParseInputOutput(const nlohmann::json & content)725 void DumpJsonParser::ParseInputOutput(const nlohmann::json &content) {
726   CheckJsonUnsignedType(content, kInputOutput);
727   input_output_ = content;
728   const uint32_t max_inout_num = 2;
729   if (input_output_ > max_inout_num) {
730     MS_LOG(EXCEPTION) << "Dump Json Parse Failed. input_output should be 0, 1, 2";
731   }
732 }
733 
// Parse the "kernels" field of the dump config. Only effective when dump_mode_
// is 1 (DUMP_KERNEL); otherwise the field is ignored with an INFO note.
// Each array entry is classified as one of:
//   - "name-regex(<exp>)": a regular expression to match full kernel names;
//   - no '/' and no "-op": an operator type (on the "ge" backend a warning is
//     emitted because it may not take effect there);
//   - anything else: a full kernel name.
// Every accepted string is also recorded in kernel_strings_ with a zero match
// count so that unused entries can be reported later (see PrintUnusedKernel).
void DumpJsonParser::ParseKernels(const nlohmann::json &content) {
  CheckJsonArrayType(content, kKernels);
  if (dump_mode_ != static_cast<uint32_t>(DUMP_KERNEL)) {
    MS_LOG(INFO) << "Dump config field <" << kKernels << "> is not used as the dump mode is not 1.";
    return;
  }
  kernels_json_ = content;
  auto context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context);
  std::string backend = context->backend_policy();
  for (const auto &kernel : content) {
    bool ret;
    auto kernel_str = kernel.dump();
    MS_LOG(INFO) << "Need dump kernel:" << kernel_str;
    // kernel.dump() serializes the JSON value, so strip the surrounding quotes
    // and any embedded spaces before classifying the entry.
    kernel_str.erase(std::remove(kernel_str.begin(), kernel_str.end(), '\"'), kernel_str.end());
    kernel_str.erase(std::remove(kernel_str.begin(), kernel_str.end(), ' '), kernel_str.end());
    if (kernel_str == "") {
      continue;
    }
    // "name-regex(<exp>)" entries: extract <exp> and keep the compiled regex
    // keyed by the raw entry string.
    if (static_cast<int>(kernel_str.find("name-regex(")) == 0 &&
        static_cast<int>(kernel_str.rfind(")")) == static_cast<int>(kernel_str.length()) - kBracketsOffset) {
      std::string kernel_reg_exp = kernel_str.substr(
        kRegexPrefixLength, static_cast<int>(kernel_str.length()) - kRegexPrefixLength - kBracketsOffset);
      ret = kernel_regs_.try_emplace(kernel_str, std::regex(kernel_reg_exp)).second;
      dump_layer_ += kernel_str + " ";
    } else {
      if (static_cast<int>(kernel_str.rfind('/')) == -1 && static_cast<int>(kernel_str.rfind("-op")) == -1) {
        // No '/' and no "-op": treat the entry as an operator type.
        if (backend == "ge") {
          MS_LOG(WARNING) << "It is not supported to specify operator types on 1980B backend. " << kernel_str
                          << " maybe not take effect.";
          dump_layer_ += kernel_str + " ";
        }
        ret = kernel_types_.try_emplace({kernel_str, 0}).second;
      } else {
        // Full kernel name.
        ret = kernels_.try_emplace({kernel_str, 0}).second;
        dump_layer_ += kernel_str + " ";
      }
    }
    kernel_strings_.try_emplace({kernel_str, 0});
    if (!ret) {
      MS_LOG(WARNING) << "Duplicate dump kernel name:" << kernel_str;
    }
    // NOTE(review): kernel_strings_ was populated just above, so this branch
    // looks unreachable here — confirm whether it guards some other intent.
    if (kernel_strings_.empty()) {
      kernel_types_.try_emplace({"", 0});
      kernel_strings_.try_emplace({"", 0});
    }
  }
}
782 
ParseStatCalcMode(const nlohmann::json & content)783 void DumpJsonParser::ParseStatCalcMode(const nlohmann::json &content) {
784   auto iter = content.find(kStatCalcMode);
785   stat_calc_mode_ = kHost;
786   if (iter == content.end()) {
787     MS_LOG(INFO) << "'stat_calc_mode' is not set, default is " << stat_calc_mode_;
788     return;
789   }
790   CheckJsonStringType(*iter, kStatCalcMode);
791   std::string calc_mode = *iter;
792   if (calc_mode != kHost && calc_mode != kDevice) {
793     MS_LOG(EXCEPTION) << "Dump Json parse failed, 'stat_calc_mode' only supports 'host' or 'device', but got: "
794                       << calc_mode << ". Please set 'stat_cal_mode' to 'host' or 'device'";
795   }
796   stat_calc_mode_ = calc_mode;
797 }
798 
CheckStatCalcModeVaild()799 void DumpJsonParser::CheckStatCalcModeVaild() {
800   if (IsTensorDump() && stat_calc_mode_ == kDevice) {
801     MS_LOG(WARNING) << "When 'saved_data' is 'tensor' or 'full', the device cannot be used to calculate statistics and "
802                        "the 'stat_calc_mode' is forced to 'host'.";
803     stat_calc_mode_ = kHost;
804   }
805   auto context = MsContext::GetInstance();
806   MS_EXCEPTION_IF_NULL(context);
807   auto device_target = context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
808   if (device_target != kAscendDevice && stat_calc_mode_ == kDevice) {
809     MS_LOG(WARNING)
810       << "The 'device' option of 'stat_calc_mode' currently only supports the ascend platform. The current platform is "
811       << device_target << ", and the 'stat_calc_mode' option is forcibly set to 'host'.";
812     stat_calc_mode_ = kHost;
813   }
814   MS_LOG(INFO) << "stat_calc_mode is set to " << stat_calc_mode_;
815 }
816 
IsDeviceCalcStats() const817 bool DumpJsonParser::IsDeviceCalcStats() const { return stat_calc_mode_ == kDevice; }
818 
ParseSupportDevice(const nlohmann::json & content)819 void DumpJsonParser::ParseSupportDevice(const nlohmann::json &content) {
820   CheckJsonArrayType(content, kSupportDevice);
821   for (const auto &device : content) {
822     uint32_t device_id = device;
823     MS_LOG(INFO) << "Dump support device:" << device_id;
824     auto ret = support_devices_.emplace(device_id);
825     if (!ret.second) {
826       MS_LOG(WARNING) << "Duplicate support device:" << device_id;
827     }
828   }
829 }
830 
ParseEnable(const nlohmann::json & content) const831 bool DumpJsonParser::ParseEnable(const nlohmann::json &content) const {
832   if (!content.is_boolean()) {
833     MS_LOG(EXCEPTION) << "Dump Json Parse Failed. 'enable' should be boolean type";
834   }
835   return content;
836 }
837 
ParseSampleMode(const nlohmann::json & content)838 void DumpJsonParser::ParseSampleMode(const nlohmann::json &content) {
839   CheckJsonUnsignedType(content, kSampleMode);
840   sample_mode_ = content;
841   const uint32_t max_inout_num = 1;
842   if (sample_mode_ > max_inout_num) {
843     MS_LOG(EXCEPTION) << "Dump Json Parse Failed. sample_mode should be 0, 1";
844   }
845 }
846 
ParseSampleNum(const nlohmann::json & content)847 void DumpJsonParser::ParseSampleNum(const nlohmann::json &content) {
848   CheckJsonUnsignedType(content, kSampleMode);
849   sample_num_ = content;
850   const uint32_t min_inout_num = 1;
851   if (sample_num_ < min_inout_num) {
852     MS_LOG(EXCEPTION) << "Dump Json Parse Failed. sample_num should be greater than 0";
853   }
854 }
855 
// Parse and validate "op_debug_mode". The set of accepted values depends on
// the active dump method:
//   e2e dump          : 0 (whole), 3 (both-overflow; forces dump_mode to all),
//                       4 (exception dump, with normalization below)
//   acl dump          : 0, 1, 2, 3, 4
//   other async dump  : 0, 1, 2, 3
// Mode 4 additionally warns on CPU/GPU (unsupported there) and, for e2e dump,
// forces iteration to "all" and sample_mode to 0 so the whole tensor is saved
// when an exception occurs.
void DumpJsonParser::ParseOpDebugMode(const nlohmann::json &content) {
  CheckJsonUnsignedType(content, kOpDebugMode);
  op_debug_mode_ = content;
  switch (op_debug_mode_) {
    case static_cast<uint32_t>(DUMP_WHOLE):
      // Mode 0 is valid for every dump method.
      break;
    case static_cast<uint32_t>(DUMP_AICORE_OVERFLOW):
    case static_cast<uint32_t>(DUMP_ATOMIC_OVERFLOW):
      // Modes 1 and 2 are only meaningful for async dump, not e2e dump.
      if (e2e_dump_enabled_) {
        MS_LOG(EXCEPTION) << "Dump Json Parse Failed. op_debug_mode should be 0, 3, 4";
      }
      break;
    case static_cast<uint32_t>(DUMP_BOTH_OVERFLOW): {
      // Overflow dump under e2e implies dumping all kernels.
      if (e2e_dump_enabled_) {
        dump_mode_ = static_cast<uint32_t>(DUMP_ALL);
      }
      break;
    }
    case static_cast<uint32_t>(DUMP_LITE_EXCEPTION): {
      auto context = MsContext::GetInstance();
      MS_EXCEPTION_IF_NULL(context);
      auto device_target = context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
      if (device_target == "CPU" || device_target == "GPU") {
        MS_LOG(WARNING) << "Abnormal dump is not supported on " << device_target
                        << " backend, and none operator data would be saved when abnormal dump is enabled. ";
      }
      if (IsAclDump() || e2e_dump_enabled_) {
        // Exception dump cannot be combined with a step filter or sampling
        // under e2e dump; normalize both settings with a warning.
        if (e2e_dump_enabled_ && iteration_ != "all") {
          MS_LOG(WARNING) << "For e2e exception dump, it is not support to specify iteration, set iteration to all.";
          iteration_ = "all";
        }
        if (e2e_dump_enabled_ && sample_mode_ != 0) {
          MS_LOG(WARNING) << "For e2e exception dump, it is not support to sample dump, set sample_mode to 0, the "
                             "whole tensor would be saved when exception occur.";
          sample_mode_ = 0;
        }
        break;
      } else {
        // MS_LOG(EXCEPTION) throws, so control never falls through to default.
        MS_LOG(EXCEPTION) << "Dump Json Parse Failed. op_debug_mode should be 0, 1, 2, 3";
      }
    }
    default:
      // Out-of-range value: report the valid set for the active dump method.
      if (IsAclDump()) {
        MS_LOG(EXCEPTION) << "Dump Json Parse Failed. op_debug_mode should be 0, 1, 2, 3, 4";
      } else if (e2e_dump_enabled_) {
        MS_LOG(EXCEPTION) << "Dump Json Parse Failed. op_debug_mode should be 0, 3, 4";
      } else {
        MS_LOG(EXCEPTION) << "Dump Json Parse Failed. op_debug_mode should be 0, 1, 2, 3";
      }
  }
}
907 
ParseFileFormat(const nlohmann::json & content)908 void DumpJsonParser::ParseFileFormat(const nlohmann::json &content) {
909   auto iter = content.find(kFileFormat);
910   if (iter == content.end()) {
911     file_format_ = JsonFileFormat::FORMAT_BIN;
912   } else {
913     CheckJsonStringType(*iter, kFileFormat);
914     std::string file_format = *iter;
915     const std::map<std::string, JsonFileFormat> str_to_fmt_enum = {{"bin", JsonFileFormat::FORMAT_BIN},
916                                                                    {"npy", JsonFileFormat::FORMAT_NPY}};
917     if (str_to_fmt_enum.find(file_format) == str_to_fmt_enum.end()) {
918       MS_LOG(EXCEPTION) << "Dump Json Parse Failed. 'file_format' should be either 'npy' or 'bin', but got: "
919                         << file_format;
920     }
921     file_format_ = str_to_fmt_enum.at(file_format);
922   }
923 }
924 
JsonConfigToString()925 void DumpJsonParser::JsonConfigToString() {
926   std::string cur_config;
927   cur_config.append("dump_mode:");
928   cur_config.append(std::to_string(dump_mode_));
929   cur_config.append(" path:");
930   cur_config.append(path_);
931   cur_config.append(" net_name:");
932   cur_config.append(net_name_);
933   cur_config.append(" iteration:");
934   cur_config.append(iteration_);
935   cur_config.append(" input_output:");
936   cur_config.append(std::to_string(input_output_));
937   cur_config.append("e2e_enable:");
938   cur_config.append(std::to_string(static_cast<int>(e2e_dump_enabled_)));
939   cur_config.append(" async_dump_enable:");
940   cur_config.append(std::to_string(static_cast<int>(async_dump_enabled_)));
941   MS_LOG(INFO) << cur_config;
942 }
943 
JudgeDumpEnabled()944 void DumpJsonParser::JudgeDumpEnabled() {
945   auto context = MsContext::GetInstance();
946   MS_EXCEPTION_IF_NULL(context);
947   if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice) {
948     async_dump_enabled_ = false;
949   }
950 
951   if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice) {
952     if (async_dump_enabled_ && e2e_dump_enabled_) {
953       async_dump_enabled_ = false;
954       MS_LOG(INFO) << "Disable async dump";
955     }
956   }
957 
958   if (!async_dump_enabled_ && !e2e_dump_enabled_) {
959     MS_LOG(WARNING) << "Dump json parse failed. Dump is not enabled";
960   }
961   if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) != kCPUDevice) {
962     auto device_id = context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
963     if (support_devices_.find(device_id) == support_devices_.end()) {
964       async_dump_enabled_ = false;
965       e2e_dump_enabled_ = false;
966       MS_LOG(WARNING) << "Dump is not enabled. device_id:" << device_id << " not support";
967     }
968   }
969   if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice) {
970     if (async_dump_enabled_ && !IsAclDump()) {
971       if (context->IsKByKExecutorMode()) {
972         MS_LOG(WARNING)
973           << "When jit_level is set to 'o0' or 'o1', async_dump only support acl dump method, ie. set environment "
974              "MS_ACL_DUMP_CFG_PATH to the same path with MINDSPORE_DUMP_CONFIG. In fact, e2e dump is preferable.";
975       }
976     }
977   }
978   JsonConfigToString();
979 }
980 
981 /*
982  * Feature group: Dump.
983  * Target device group: Ascend, GPU and CPU.
984  * Runtime category: Old runtime, MindRT.
985  * Description: Check if the given op needs to be dumped based the configuration option.
986  */
NeedDump(const std::string & op_full_name)987 bool DumpJsonParser::NeedDump(const std::string &op_full_name) {
988   bool need_dump = false;
989 
990   switch (dump_mode_) {
991     case DUMP_ALL:
992       need_dump = true;
993       break;
994     case DUMP_KERNEL:
995       for (const auto &iter : kernel_regs_) {
996         if (regex_match(op_full_name, iter.second)) {
997           need_dump = true;
998           MatchKernel(iter.first);
999           break;
1000         }
1001       }
1002       if (need_dump) {
1003         break;
1004       }
1005       if (kernels_.find(op_full_name) != kernels_.end()) {
1006         need_dump = true;
1007         MatchKernel(op_full_name);
1008         break;
1009       }
1010       for (const auto &iter : kernel_types_) {
1011         int start_index = static_cast<int>(op_full_name.rfind('/')) + 1;
1012         int end_index = static_cast<int>(op_full_name.rfind('-'));
1013         if (end_index == -1) {
1014           end_index = static_cast<int>(op_full_name.length());
1015         }
1016         std::string op_name = op_full_name.substr(start_index, end_index - start_index);
1017         transform(op_name.begin(), op_name.end(), op_name.begin(), ::tolower);
1018         std::string kernel_type(iter.first);
1019         transform(kernel_type.begin(), kernel_type.end(), kernel_type.begin(), ::tolower);
1020         if (op_name.find(kernel_type) != std::string::npos) {
1021           need_dump = true;
1022           MatchKernel(kernel_type);
1023           break;
1024         }
1025       }
1026       break;
1027     case DUMP_KERNELS_WITH_FLAG:
1028       if (std::find(cell_dump_kernels_.begin(), cell_dump_kernels_.end(), op_full_name) != cell_dump_kernels_.end()) {
1029         need_dump = true;
1030       }
1031       break;
1032     default:
1033       break;
1034   }
1035   return need_dump;
1036 }
1037 
1038 /*
1039  * Feature group: Dump.
1040  * Target device group: Ascend, GPU and CPU.
1041  * Runtime category: Old runtime, MindRT.
1042  * Description: Increment the count of dumping for given kernel.
1043  */
MatchKernel(const std::string & kernel_name)1044 void DumpJsonParser::MatchKernel(const std::string &kernel_name) {
1045   auto iter = kernel_strings_.find(kernel_name);
1046   if (iter == kernel_strings_.end()) {
1047     return;
1048   }
1049   iter->second = iter->second + 1;
1050   MS_LOG(INFO) << "Match dump kernel:" << iter->first << " match times:" << iter->second;
1051 }
1052 
PrintUnusedKernel()1053 void DumpJsonParser::PrintUnusedKernel() {
1054   if ((!e2e_dump_enabled_ && !async_dump_enabled_) || dump_mode_ != static_cast<uint32_t>(DUMP_KERNEL)) {
1055     return;
1056   }
1057   for (const auto &iter : kernel_strings_) {
1058     if (iter.second == 0) {
1059       MS_LOG(WARNING) << "[DataDump] Unused Kernel in json: " << iter.first;
1060     }
1061   }
1062 }
1063 
1064 /*
1065  * Feature group: Online debugger.
1066  * Target device group: Ascend.
1067  * Runtime category: Old runtime, MindRT.
1068  * Description: Generate the directory path where overflow bin file locates.
1069  */
GetOpOverflowBinPath(uint32_t graph_id) const1070 std::string DumpJsonParser::GetOpOverflowBinPath(uint32_t graph_id) const {
1071   std::string bin_path;
1072   bin_path.append(path_);
1073   bin_path.append("/");
1074   bin_path.append("rank_");
1075 
1076   uint32_t rank_id = 0;
1077   auto ms_context = MsContext::GetInstance();
1078   MS_EXCEPTION_IF_NULL(ms_context);
1079   auto env_rank_id = common::GetEnv("RANK_ID");
1080   if (ms_context->get_param<bool>(MS_CTX_ENABLE_HCCL) && !env_rank_id.empty()) {
1081     // get actual rank id if it's distribution training case.
1082     if (!CommManager::GetInstance().GetRankID(kHcclWorldGroup, &rank_id)) {
1083       MS_LOG(INFO) << "Failed to get rank id.";
1084     }
1085   }
1086   bin_path.append(std::to_string(rank_id));
1087 
1088   bin_path.append("/");
1089   bin_path.append(net_name_);
1090   bin_path.append("/");
1091   bin_path.append(std::to_string(graph_id));
1092   bin_path.append("/");
1093   bin_path.append(std::to_string(cur_dump_iter_));
1094   bin_path.append("/");
1095 
1096   return bin_path;
1097 }
1098 
InputNeedDump() const1099 bool DumpJsonParser::InputNeedDump() const {
1100   return input_output_ == kDumpInputAndOutput || input_output_ == kDumpInputOnly;
1101 }
1102 
OutputNeedDump() const1103 bool DumpJsonParser::OutputNeedDump() const {
1104   return input_output_ == kDumpInputAndOutput || input_output_ == kDumpOutputOnly;
1105 }
1106 
1107 /*
1108  * Feature group: Dump.
1109  * Target device group: Ascend.
1110  * Runtime category: Old runtime, MindRT.
1111  * Description: Obtain the cell dump flag of each operators in the given kernel graph.
1112  */
GetCellDumpFlag(const session::KernelGraph & kernel_graph)1113 void DumpJsonParser::GetCellDumpFlag(const session::KernelGraph &kernel_graph) {
1114   if (dump_mode_ != static_cast<uint32_t>(DUMP_KERNELS_WITH_FLAG)) {
1115     return;
1116   }
1117   for (const auto &kernel : kernel_graph.execution_order()) {
1118     MS_EXCEPTION_IF_NULL(kernel);
1119     auto dump_flag = common::AnfAlgo::GetDumpFlag(kernel);
1120     if (dump_flag.has_value() && dump_flag.value().compare("true") == 0) {
1121       MS_LOG(DEBUG) << "Dump flag is true for " << GetKernelNodeName(kernel);
1122       cell_dump_kernels_.push_back(GetKernelNodeName(kernel));
1123     }
1124   }
1125 }
1126 
UpdateNeedDumpKernels(const session::KernelGraph & kernel_graph)1127 void DumpJsonParser::UpdateNeedDumpKernels(const session::KernelGraph &kernel_graph) {
1128   MS_LOG(INFO) << "Get kernel dump flag";
1129   GetCellDumpFlag(kernel_graph);
1130 
1131   if (!async_dump_enabled_) {
1132     return;
1133   }
1134 
1135   MS_LOG(INFO) << "Update async dump kernel list for hccl";
1136   for (const auto &kernel : kernel_graph.execution_order()) {
1137     MS_EXCEPTION_IF_NULL(kernel);
1138     if (AnfAlgo::GetKernelType(kernel) == HCCL_KERNEL &&
1139         DumpJsonParser::GetInstance().NeedDump(GetKernelNodeName(kernel)) &&
1140         DumpJsonParser::GetInstance().InputNeedDump()) {
1141       auto input_size = common::AnfAlgo::GetInputTensorNum(kernel);
1142       for (size_t i = 0; i < input_size; ++i) {
1143         auto input_with_index = common::AnfAlgo::GetPrevNodeOutput(kernel, i);
1144         auto input = input_with_index.first;
1145         MS_EXCEPTION_IF_NULL(input);
1146         if (input->isa<CNode>()) {
1147           MS_LOG(INFO) << "[AsyncDump] Match Hccl Node:" << GetKernelNodeName(kernel)
1148                        << " Input:" << GetKernelNodeName(input);
1149           hccl_input_kernels_.insert(GetKernelNodeName(input));
1150         }
1151       }
1152     }
1153   }
1154 }
1155 
IsHCCLKernelInput(const std::string & kernel_name) const1156 bool DumpJsonParser::IsHCCLKernelInput(const std::string &kernel_name) const {
1157   if (hccl_input_kernels_.empty()) {
1158     return false;
1159   }
1160   auto iter = std::find(hccl_input_kernels_.begin(), hccl_input_kernels_.end(), kernel_name);
1161   if (iter != hccl_input_kernels_.end()) {
1162     return true;
1163   }
1164   return false;
1165 }
1166 
IsAclDump()1167 bool DumpJsonParser::IsAclDump() {
1168   bool is_acl_dump = false;
1169   auto env_enable_kbk = common::GetEnv("MS_ACL_DUMP_CFG_PATH");
1170   auto dump_enable_kbk = common::GetEnv("MINDSPORE_DUMP_CONFIG");
1171   if (!env_enable_kbk.empty() && env_enable_kbk == dump_enable_kbk) {
1172     is_acl_dump = true;
1173   }
1174   return is_acl_dump;
1175 }
1176 }  // namespace mindspore
1177