1 /**
2 * Copyright 2020-2024 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "include/backend/debug/data_dump/dump_json_parser.h"
17 #include <algorithm>
18 #include <fstream>
19 #include "debug/data_dump/npy_header.h"
20 #include "debug/utils.h"
21 #include "include/backend/anf_runtime_algorithm.h"
22 #include "include/common/debug/anf_dump_utils.h"
23 #include "include/common/debug/common.h"
24 #include "include/common/utils/anfalgo.h"
25 #include "include/common/utils/comm_manager.h"
26 #include "mindspore/core/utils/file_utils.h"
27 #include "mindspore/core/utils/ms_utils.h"
28 #include "ops/ascend_op_name.h"
29 #include "utils/convert_utils_base.h"
30 #include "utils/log_adapter.h"
31 #include "utils/ms_context.h"
32
namespace {
// Top-level sections of the dump json config file.
constexpr auto kCommonDumpSettings = "common_dump_settings";
constexpr auto kE2eDumpSettings = "e2e_dump_settings";
// Keys under "common_dump_settings".
constexpr auto kDumpMode = "dump_mode";
constexpr auto kPath = "path";
constexpr auto kNetName = "net_name";
constexpr auto kSavedData = "saved_data";
constexpr auto kIteration = "iteration";
constexpr auto kInputOutput = "input_output";
constexpr auto kKernels = "kernels";
constexpr auto kSupportDevice = "support_device";
constexpr auto kEnable = "enable";
constexpr auto kOpDebugMode = "op_debug_mode";
// Keys under "e2e_dump_settings".
constexpr auto kTransFlag = "trans_flag";
constexpr auto kSaveArgs = "save_kernel_args";
constexpr auto kSampleMode = "sample_mode";
constexpr auto kSampleNum = "sample_num";
constexpr auto kStatCalcMode = "stat_calc_mode";
// Allowed values of "stat_calc_mode".
constexpr auto kHost = "host";
constexpr auto kDevice = "device";
// Allowed values of "saved_data".
constexpr auto kStatisticDump = "statistic";
constexpr auto kTensorDump = "tensor";
constexpr auto kFullDump = "full";
constexpr auto kFileFormat = "file_format";
constexpr auto kStatisticCategory = "statistic_category";
// Values of "input_output": dump both, inputs only, or outputs only.
constexpr auto kDumpInputAndOutput = 0;
constexpr auto kDumpInputOnly = 1;
constexpr auto kDumpOutputOnly = 2;
// Environment variable pointing at the dump json config file.
constexpr auto kMindsporeDumpConfig = "MINDSPORE_DUMP_CONFIG";
// "name-regex(<pattern>)" kernel entries: length of the "name-regex(" prefix
// and the offset of the trailing ')'.
constexpr auto kBracketsOffset = 1;
constexpr auto kRegexPrefixLength = 11;
// Statistics computed when "statistic_category" is not configured by the user.
const std::vector<std::string> kDefaultStatisticCategory = {"max", "min", "l2norm"};
// Statistics that can be computed on device vs. the full host-side set.
const std::set<std::string> kDeviceStatisticCategory = {"max", "min", "avg", "l2norm"};
const std::set<std::string> kHostStatisticCategory = {"max",
                                                      "min",
                                                      "avg",
                                                      "count",
                                                      "negative zero count",
                                                      "positive zero count",
                                                      "nan count",
                                                      "negative inf count",
                                                      "positive inf count",
                                                      "zero count",
                                                      "md5",
                                                      "l2norm"};
// Human-readable capability lists used only in log/error messages.
// NOTE(review): "Statisticsategory" is a typo of "StatisticCategory"; renaming
// would touch several call sites, so the names are kept as-is.
constexpr auto kDeviceStatisticsategory = "['max', 'min', 'avg', 'l2norm']";
constexpr auto kSupportedStatisticsategory =
  "['max', 'min', 'avg', 'count', 'negative zero count', 'positive zero count', 'nan count', 'negative inf count', "
  "'positive inf count', 'zero count', 'md5', 'l2norm']";
}  // namespace
83
84 namespace mindspore {
CheckJsonKeyExist(const nlohmann::json & content,const std::string & key)85 auto DumpJsonParser::CheckJsonKeyExist(const nlohmann::json &content, const std::string &key) {
86 nlohmann::json::const_iterator iter = content.find(key);
87 if (iter == content.end()) {
88 MS_LOG(EXCEPTION) << "Check dump json failed, " << key << " not found";
89 }
90 return iter;
91 }
92
CheckSelectableKeyExist(const nlohmann::json & content,const std::string & key)93 bool DumpJsonParser::CheckSelectableKeyExist(const nlohmann::json &content, const std::string &key) {
94 nlohmann::json::const_iterator iter = content.find(key);
95 if (iter == content.end()) {
96 return false;
97 }
98 return true;
99 }
100
// Drains the whole stream buffer into a string. Note: this consumes the
// stream's remaining content; the read position is not restored.
std::string GetIfstreamString(const std::ifstream &file_stream) {
  std::stringstream contents;
  contents << file_stream.rdbuf();
  return contents.str();
}
106
IsDumpEnabled()107 bool DumpJsonParser::IsDumpEnabled() {
108 auto config_path = common::GetEnv(kMindsporeDumpConfig);
109 if (config_path.empty()) {
110 return false;
111 }
112 MS_LOG(INFO) << "Dump config path is " << config_path;
113
114 auto context = MsContext::GetInstance();
115 MS_EXCEPTION_IF_NULL(context);
116 if (context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode &&
117 context->get_param<std::string>(MS_CTX_DEVICE_TARGET) != kAscendDevice) {
118 MS_LOG(EXCEPTION) << "In GPU or CPU, Dump is disabled in PyNative mode. Please set mode to GRAPH_MODE in context.";
119 }
120 if (context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode &&
121 context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice && e2e_dump_enabled_) {
122 MS_LOG(EXCEPTION) << "Dump is only support asynchronous for Ascend in PyNative mode.";
123 }
124 return true;
125 }
126
PyNativeModeCheck()127 void DumpJsonParser::PyNativeModeCheck() {
128 auto context = MsContext::GetInstance();
129 MS_EXCEPTION_IF_NULL(context);
130 if (context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode &&
131 dump_mode_ == static_cast<uint32_t>(DUMP_KERNELS_WITH_FLAG)) {
132 MS_LOG(EXCEPTION) << "Cell dump is only supported in GRAPH mode. Please set dump_mode to 0 or 1 in PyNative mode.";
133 }
134 }
135
CheckE2eSetting()136 void DumpJsonParser::CheckE2eSetting() {
137 auto context = MsContext::GetInstance();
138 MS_EXCEPTION_IF_NULL(context);
139 if (e2e_dump_enabled()) {
140 if (!context->IsKByKExecutorMode()) {
141 MS_LOG(WARNING) << "E2e dump only support kernel by kernel mode on Ascend platform.";
142 }
143 CheckStatCalcModeVaild();
144 } else {
145 if (dump_mode_ == static_cast<uint32_t>(DUMP_KERNELS_WITH_FLAG)) {
146 MS_LOG(EXCEPTION) << "Cell dump only support e2e dump mode. Please set dump_mode to 0 or 1.";
147 }
148 }
149 }
150
151 /*
152 * Feature group: Dump.
153 * Target device group: Ascend, GPU and CPU.
154 * Runtime category: Old runtime, MindRT.
155 * Description: Parse the configuration option in dump json file pointed by environment variable MINDSPORE_DUMP_CONFIG.
156 */
void DumpJsonParser::Parse() {
  std::lock_guard<std::mutex> guard(lock_);
  // Parse exactly once per process; subsequent calls are no-ops.
  if (already_parsed_) {
    return;
  }
  already_parsed_ = true;
  if (!IsDumpEnabled()) {
    return;
  }

  auto dump_config_file = Common::GetConfigFile(kMindsporeDumpConfig);
  if (!dump_config_file.has_value()) {
    MS_LOG(EXCEPTION) << "Get dump config file failed";
  }

  std::ifstream json_file(dump_config_file.value());
  if (!json_file.is_open()) {
    MS_LOG(EXCEPTION) << "Dump file:" << dump_config_file.value() << " open failed. Errno:" << errno;
  }

  nlohmann::json j;
  try {
    json_file >> j;
  } catch (nlohmann::json::parse_error &e) {
    // NOTE(review): the failed parse has already consumed the stream, so
    // GetIfstreamString likely yields an empty string here — confirm before
    // relying on this log line for diagnostics.
    MS_LOG(ERROR) << "Dump json contents:" << GetIfstreamString(json_file);
    json_file.close();
    MS_LOG(EXCEPTION) << "Parse dump json failed, error:" << e.what();
  }

  // convert json to string
  std::stringstream ss;
  ss << j;
  std::string cfg = ss.str();
  json_file.close();
  MS_LOG(INFO) << "Dump json:" << cfg;

  // E2e settings must be parsed first: ParseCommonDumpSetting branches on
  // e2e_dump_enabled_ set below.
  ParseE2eDumpSetting(j);
  ParseCommonDumpSetting(j);
  PyNativeModeCheck();
  CheckE2eSetting();
  JudgeDumpEnabled();
  // NOTE(review): CheckStatCalcModeVaild may already have run inside
  // CheckE2eSetting when e2e dump is enabled; this second call is redundant
  // but harmless (it is idempotent).
  CheckStatCalcModeVaild();
  ParseStatisticCategory(j);
}
201
// Selects which statistics are computed for statistic dump. User entries are
// validated against the device-side or host-side capability sets; entries
// unknown to both sets abort with an exception. Without a user selection the
// default category list is used.
void DumpJsonParser::ParseStatisticCategory(const nlohmann::json &content) {
  if (!IsStatisticDump()) {
    return;
  }
  auto common_dump_settings = CheckJsonKeyExist(content, kCommonDumpSettings);
  auto set_statistic_category = CheckSelectableKeyExist(*common_dump_settings, kStatisticCategory);
  if (set_statistic_category) {
    auto user_statistics = CheckJsonKeyExist(*common_dump_settings, kStatisticCategory);
    CheckJsonArrayType(*user_statistics, kStatisticCategory);
    std::string unsupported_items = "";
    if (IsDeviceCalcStats()) {
      // Device mode: keep only device-computable items. Host-only items are
      // collected separately and reported as a warning (they are skipped).
      std::string device_unsupported_items = "";
      for (const auto &statistic_item_json : *user_statistics) {
        std::string statistic_item = statistic_item_json;
        auto rt_find = kDeviceStatisticCategory.find(statistic_item);
        if (rt_find == kDeviceStatisticCategory.end()) {
          auto in_host_category = kHostStatisticCategory.find(statistic_item);
          if (in_host_category == kHostStatisticCategory.end()) {
            // Unknown in both sets: hard error, reported after the loop.
            unsupported_items += statistic_item + ", ";
          } else {
            device_unsupported_items += statistic_item + ", ";
          }
        } else {
          statistic_category_.push_back(statistic_item);
          MS_LOG(INFO) << "The item: " << statistic_item
                       << " is a valid statistic category, it will be computed on device.";
        }
      }
      if (!device_unsupported_items.empty()) {
        MS_LOG(WARNING) << "The following statistic_category only support to be compute on host:"
                        << device_unsupported_items
                        << "the valid statistic_category on device are as follows:" << kDeviceStatisticsategory;
      }
    } else {
      // Host mode: validate against the full host capability set.
      for (const auto &statistic_item_json : *user_statistics) {
        std::string statistic_item = statistic_item_json;
        auto rt_find = kHostStatisticCategory.find(statistic_item);
        if (rt_find == kHostStatisticCategory.end()) {
          unsupported_items += statistic_item + ", ";
        } else {
          statistic_category_.push_back(statistic_item);
          MS_LOG(INFO) << "The item: " << statistic_item
                       << " is a valid statistic category, it will be computed on host.";
        }
      }
    }
    if (!unsupported_items.empty()) {
      MS_LOG(EXCEPTION) << "The following statistic_category is invalid:" << unsupported_items
                        << "the valid statistic_category are as follows:" << kSupportedStatisticsategory;
    }
  } else {
    // No user selection: fall back to the default {max, min, l2norm}.
    statistic_category_ = kDefaultStatisticCategory;
    MS_LOG(INFO) << "Statistic category is not set, use the default items as follows:";
    for (auto &itm : kDefaultStatisticCategory) {
      MS_LOG(INFO) << itm;
    }
  }
  // Publish the chosen categories as the CSV header for statistic dump files.
  CsvHeaderUtil::GetInstance().SetStatCsvHeader(statistic_category_);
}
261
WriteJsonFile(const std::string & file_path,const std::ifstream & json_file)262 void WriteJsonFile(const std::string &file_path, const std::ifstream &json_file) {
263 ChangeFileMode(file_path, S_IWUSR);
264 std::ofstream json_copy(file_path);
265 if (!json_copy.is_open()) {
266 MS_LOG(EXCEPTION) << "Json file " << file_path << "open failed!";
267 }
268 json_copy << json_file.rdbuf();
269 json_copy.close();
270 ChangeFileMode(file_path, S_IRUSR);
271 }
272
273 /*
274 * Feature group: Dump.
275 * Target device group: Ascend, GPU and CPU.
276 * Runtime category: Old runtime, MindRT.
277 * Description: Copy the dump configuration file to the root directory of dump path.
278 */
CopyDumpJsonToDir(uint32_t rank_id)279 void DumpJsonParser::CopyDumpJsonToDir(uint32_t rank_id) {
280 this->Parse();
281 if (!IsDumpEnabled()) {
282 return;
283 }
284 auto dump_config_file = Common::GetConfigFile(kMindsporeDumpConfig);
285 if (!dump_config_file.has_value()) {
286 MS_LOG(EXCEPTION) << "Get dump config file failed.";
287 }
288 std::ifstream json_file(dump_config_file.value());
289 if (async_dump_enabled_ || e2e_dump_enabled_) {
290 auto realpath =
291 Common::CreatePrefixPath(path_ + "/rank_" + std::to_string(rank_id) + "/.dump_metadata/data_dump.json");
292 if (!realpath.has_value()) {
293 MS_LOG(ERROR) << "Get real path failed in CopyDumpJsonToDir.";
294 } else {
295 if (!Common::FileExists(realpath.value())) {
296 WriteJsonFile(realpath.value(), json_file);
297 } else {
298 MS_LOG(WARNING) << "The file: " << realpath.value() << " is already exist, skip copy it.";
299 }
300 }
301 }
302 }
303
304 /*
305 * Feature group: Dump.
306 * Target device group: Ascend.
307 * Runtime category: Old runtime, MindRT.
308 * Description: Copy the hccl configuration file to the root directory of dump path.
309 */
CopyHcclJsonToDir(uint32_t rank_id)310 void DumpJsonParser::CopyHcclJsonToDir(uint32_t rank_id) {
311 if (!IsDumpEnabled()) {
312 return;
313 }
314 std::string config_path = common::GetEnv("MINDSPORE_HCCL_CONFIG_PATH");
315 if (config_path.empty()) {
316 config_path = common::GetEnv("RANK_TABLE_FILE");
317 if (config_path.empty()) {
318 MS_LOG(INFO) << "Get hccl json config failed.";
319 return;
320 }
321 }
322 std::ifstream json_file(config_path);
323 auto realpath = Common::CreatePrefixPath(path_ + "/rank_" + std::to_string(rank_id) + "/.dump_metadata/hccl.json");
324 if (!realpath.has_value()) {
325 MS_LOG(ERROR) << "Get real path failed in CopyHcclJsonToDir.";
326 } else {
327 WriteJsonFile(realpath.value(), json_file);
328 }
329 }
330
331 /*
332 * Feature group: Dump.
333 * Target device group: Ascend, GPU and CPU.
334 * Runtime category: Old runtime, MindRT.
335 * Description: Copy the mindspore configuration file to the root directory of dump path. It provides the device and
336 * ms_version information.
337 */
CopyMSCfgJsonToDir(uint32_t rank_id)338 void DumpJsonParser::CopyMSCfgJsonToDir(uint32_t rank_id) {
339 if (!IsDumpEnabled()) {
340 return;
341 }
342 auto realpath = Common::CreatePrefixPath(path_ + "/rank_" + std::to_string(rank_id) + "/.dump_metadata/config.json");
343 if (!realpath.has_value()) {
344 MS_LOG(ERROR) << "Get real path failed in CopyMSConfigJsonToDir.";
345 } else {
346 if (Common::FileExists(realpath.value())) {
347 MS_LOG(WARNING) << "The file: " << realpath.value() << " is already exist, skip copy it.";
348 return;
349 }
350 nlohmann::json ms_info;
351 auto context = MsContext::GetInstance();
352 MS_EXCEPTION_IF_NULL(context);
353 ms_info["device_target"] = context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
354 ms_info["ms_version"] = MSVERSION;
355 const std::string file_path = realpath.value();
356 ChangeFileMode(file_path, S_IWUSR);
357 std::ofstream json_create(file_path);
358 if (!json_create.is_open()) {
359 MS_LOG(EXCEPTION) << "Json file " << file_path << "open failed!";
360 }
361 json_create << ms_info;
362 json_create.close();
363 ChangeFileMode(file_path, S_IRUSR);
364 }
365 }
366
// True when e2e dump is enabled and the current step is inside the configured iteration set.
bool DumpJsonParser::GetIterDumpFlag() const { return e2e_dump_enabled_ && IsDumpIter(cur_dump_iter_); }
368
// True when any dump flavor (e2e or async) is enabled and the current step should be dumped.
bool DumpJsonParser::DumpEnabledForIter() const {
  return ((e2e_dump_enabled_ || async_dump_enabled_) && IsDumpIter(cur_dump_iter_));
}
372
373 /*
374 * Feature group: Dump.
375 * Target device group: Ascend, GPU and CPU.
376 * Runtime category: Old runtime, MindRT.
377 * Description: Dump data in the given address into npy file.
378 */
// Writes `len` bytes at `data` as "<filename>.npy" with an npy header derived
// from `shape` and `type`. Returns false on recoverable problems (bad inputs,
// path failures); throws when the target file cannot be opened or written.
bool DumpJsonParser::DumpToFile(const std::string &filename, const void *data, size_t len, const ShapeVector &shape,
                                TypeId type) {
  if (filename.empty() && (data == nullptr || len == 0)) {
    MS_LOG(ERROR) << "Filename and data are empty or null.";
    return false;
  } else if (filename.empty()) {
    MS_LOG(ERROR) << "Filename is empty.";
    return false;
  } else if (data == nullptr || len == 0) {
    MS_LOG(WARNING) << "Data is empty or null for file: " << filename;
    return false;
  }
  std::string npy_header = GenerateNpyHeader(shape, type);
  if (npy_header.empty()) {
    MS_LOG(WARNING) << "Failed to generate npy_header for file: " << filename;
    return false;
  }
  std::string npy_suffix = ".npy";
  std::string origin_file_path = filename + npy_suffix;
  // File names can exceed filesystem limits; MappingName may shorten the name
  // and report the original/mapped pair for recording in mapping.csv.
  std::optional<std::string> prefix_path;
  std::optional<std::string> origin_name;
  std::optional<std::string> mapped_name;
  bool need_map = Common::MappingName(origin_file_path, &prefix_path, &origin_name, &mapped_name);
  if (!prefix_path.has_value() || !origin_name.has_value() || !mapped_name.has_value()) {
    MS_LOG(ERROR) << "Cannot get prefix_path or file_name from: " << origin_file_path;
    return false;
  }
  std::string final_file_path = origin_file_path;
  if (need_map) {
    std::string origin_name_str = origin_name.value();
    std::string mapped_name_str = mapped_name.value();
    // mapping.csv is shared across dump calls; serialize appends under lock_.
    std::lock_guard<std::mutex> guard(lock_);
    auto mapping_file = Common::CreatePrefixPath(prefix_path.value() + "/mapping.csv");
    if (!mapping_file.has_value()) {
      MS_LOG(ERROR) << "CreatePrefixPath for mapping.csv failed.";
      return false;
    }
    const std::string mapping_file_str = mapping_file.value();
    // try to open file
    ChangeFileMode(mapping_file_str, S_IWUSR);
    std::ofstream fout(mapping_file_str, std::ofstream::app);
    if (!fout.is_open()) {
      MS_LOG(WARNING) << "Open file for mapping.csv failed.";
      return false;
    }
    fout << mapped_name_str << "," << origin_name_str << "\n";
    fout.close();
    ChangeFileMode(mapping_file_str, S_IRUSR);
    final_file_path = prefix_path.value() + "/" + mapped_name_str;
  }
  auto file_path = Common::CreatePrefixPath(final_file_path);
  if (!file_path.has_value()) {
    MS_LOG(ERROR) << "CreatePrefixPath failed.";
    return false;
  }
  const std::string file_path_str = file_path.value();
  MS_LOG(INFO) << "Dump path is " << file_path_str;
  // Make the target writable for the write; restored to read-only at the end.
  ChangeFileMode(file_path_str, S_IWUSR);

  MSLogTime msTime;
  msTime.Start();
  std::ofstream fd(file_path_str, std::ios::out | std::ios::trunc | std::ios::binary);
  if (!fd.is_open()) {
    MS_LOG(EXCEPTION) << "Open file " << file_path_str << " failed." << ErrnoToString(errno);
  }
  fd << npy_header;
  (void)fd.write(reinterpret_cast<const char *>(data), SizeToLong(len));
  if (fd.bad()) {
    fd.close();
    MS_LOG(EXCEPTION)
      << "Write mem to file " << file_path_str
      << " failed. This error may be caused by insufficient disk space. Please check the available disk space.";
  }
  fd.close();
  msTime.End();
  MS_LOG(DEBUG) << "Dump file costs time : " << msTime.GetRunTimeUS() << " microseconds.";

  ChangeFileMode(file_path_str, S_IRUSR);
  return true;
}
459
// Parses the mandatory "common_dump_settings" section. Must run after
// ParseE2eDumpSetting, since several fields are mandatory only when e2e dump
// is not enabled.
void DumpJsonParser::ParseCommonDumpSetting(const nlohmann::json &content) {
  // async_dump is enabled by default, if e2e dump is enabled it will override this
  auto context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context);
  if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice) {
    async_dump_enabled_ = true;
  } else if (!e2e_dump_enabled_) {
    // GPU/CPU only support e2e dump: force-enable it with default settings.
    e2e_dump_enabled_ = true;
    trans_flag_ = true;
    sample_mode_ = 0;
    sample_num_ = 100;
  }

  // Mandatory fields; CheckJsonKeyExist throws when one is missing.
  auto common_dump_settings = CheckJsonKeyExist(content, kCommonDumpSettings);
  auto dump_mode = CheckJsonKeyExist(*common_dump_settings, kDumpMode);
  auto net_name = CheckJsonKeyExist(*common_dump_settings, kNetName);
  auto iteration = CheckJsonKeyExist(*common_dump_settings, kIteration);
  auto input_output = CheckJsonKeyExist(*common_dump_settings, kInputOutput);
  auto kernels = CheckJsonKeyExist(*common_dump_settings, kKernels);
  auto support_device = CheckJsonKeyExist(*common_dump_settings, kSupportDevice);

  // op_debug_mode is mandatory for async dump but optional for e2e dump. The
  // iterator starts default-constructed (singular) and is only dereferenced
  // below after the same existence check is repeated.
  nlohmann::detail::iter_impl<const nlohmann::json> op_debug_mode;
  if (!e2e_dump_enabled_) {
    op_debug_mode = CheckJsonKeyExist(*common_dump_settings, kOpDebugMode);
  } else {
    if (CheckSelectableKeyExist(*common_dump_settings, kOpDebugMode)) {
      op_debug_mode = CheckJsonKeyExist(*common_dump_settings, kOpDebugMode);
    }
  }

  ParseDumpMode(*dump_mode);
  ParseDumpPath(*common_dump_settings);  // Pass in the whole json string to parse because the path field is optional.
  ParseNetName(*net_name);
  ParseIteration(*iteration);
  ParseInputOutput(*input_output);
  ParseKernels(*kernels);
  ParseSupportDevice(*support_device);
  if (!e2e_dump_enabled_) {
    ParseOpDebugMode(*op_debug_mode);
    ParseFileFormat(
      *common_dump_settings);  // Pass in the whole json string to parse because file_format field is optional.
  } else {
    if (CheckSelectableKeyExist(*common_dump_settings, kOpDebugMode)) {
      ParseOpDebugMode(*op_debug_mode);
    }
  }
  ParseSavedData(*common_dump_settings);  // saved data optional
}
508
// Parses the optional "e2e_dump_settings" section. Must run before
// ParseCommonDumpSetting, which branches on e2e_dump_enabled_ set here.
void DumpJsonParser::ParseE2eDumpSetting(const nlohmann::json &content) {
  auto e2e_dump_setting = content.find(kE2eDumpSettings);
  auto context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context);
  if (e2e_dump_setting == content.end()) {
    MS_LOG(INFO) << "No e2e_dump_settings";
    return;
  }

  // "enable" and "trans_flag" are mandatory once the section exists.
  auto e2e_dump_enable = CheckJsonKeyExist(*e2e_dump_setting, kEnable);
  auto trans_flag = CheckJsonKeyExist(*e2e_dump_setting, kTransFlag);

  // Optional: whether kernel launch arguments are saved alongside the data.
  if (CheckSelectableKeyExist(*e2e_dump_setting, kSaveArgs)) {
    auto save_args_flag = CheckJsonKeyExist(*e2e_dump_setting, kSaveArgs);
    save_args_flag_ = ParseEnable(*save_args_flag);
  }
  e2e_dump_enabled_ = ParseEnable(*e2e_dump_enable);
  trans_flag_ = ParseEnable(*trans_flag);
  ParseStatCalcMode(*e2e_dump_setting);
  // Optional sampling: sample_num only applies in DUMP_HEAD_AND_TAIL mode.
  if (CheckSelectableKeyExist(*e2e_dump_setting, kSampleMode)) {
    auto sample_mode = CheckJsonKeyExist(*e2e_dump_setting, kSampleMode);
    ParseSampleMode(*sample_mode);
    if (CheckSelectableKeyExist(*e2e_dump_setting, kSampleNum) &&
        sample_mode_ == static_cast<uint32_t>(DUMP_HEAD_AND_TAIL)) {
      auto sample_num = CheckJsonKeyExist(*e2e_dump_setting, kSampleNum);
      ParseSampleNum(*sample_num);
    }
  }
}
538
CheckJsonUnsignedType(const nlohmann::json & content,const std::string & key)539 void CheckJsonUnsignedType(const nlohmann::json &content, const std::string &key) {
540 if (!content.is_number_unsigned()) {
541 MS_LOG(EXCEPTION) << "Dump config parse failed, " << key << " should be unsigned int type";
542 }
543 }
544
CheckJsonStringType(const nlohmann::json & content,const std::string & key)545 void CheckJsonStringType(const nlohmann::json &content, const std::string &key) {
546 if (!content.is_string()) {
547 MS_LOG(EXCEPTION) << "Dump config parse failed, " << key << " should be string type";
548 }
549 }
550
CheckJsonArrayType(const nlohmann::json & content,const std::string & key)551 void CheckJsonArrayType(const nlohmann::json &content, const std::string &key) {
552 if (!content.is_array()) {
553 MS_LOG(EXCEPTION) << "Dump config parse failed, " << key << " should be array type";
554 }
555 }
556
ParseDumpMode(const nlohmann::json & content)557 void DumpJsonParser::ParseDumpMode(const nlohmann::json &content) {
558 auto context = MsContext::GetInstance();
559 MS_EXCEPTION_IF_NULL(context);
560 CheckJsonUnsignedType(content, kDumpMode);
561 dump_mode_ = content;
562 if (dump_mode_ > static_cast<uint32_t>(DUMP_KERNELS_WITH_FLAG)) {
563 MS_LOG(EXCEPTION) << "Dump config parse failed, dump_mode should be 0, 1 or 2, but got " << dump_mode_;
564 }
565 if (dump_mode_ == static_cast<uint32_t>(DUMP_KERNELS_WITH_FLAG)) {
566 if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) != kAscendDevice) {
567 MS_LOG(EXCEPTION) << "Cell dump is only supported in Ascend async dump. Please set dump_mode to 0 or 1.";
568 }
569 }
570 }
571
ParseDumpPath(const nlohmann::json & content)572 void DumpJsonParser::ParseDumpPath(const nlohmann::json &content) {
573 std::string dump_path;
574 auto json_iter = content.find(kPath);
575 // Check if `path` field exists in dump json file.
576 if (json_iter != content.end()) {
577 CheckJsonStringType(*json_iter, kPath);
578 dump_path = *json_iter;
579 }
580 if (dump_path.empty()) {
581 // If no path is found or path is set as empty in dump json file, use MS_DIAGNOSTIC_DATA_PATH/debug_dump as the dump
582 // path value if the env exists.
583 dump_path = common::GetEnv("MS_DIAGNOSTIC_DATA_PATH");
584 if (dump_path.empty()) {
585 MS_LOG(EXCEPTION)
586 << "Dump path is empty. Please set it in dump json file or environment variable `MS_DIAGNOSTIC_DATA_PATH`.";
587 } else {
588 dump_path += "/debug_dump";
589 }
590 }
591 path_ = dump_path;
592 if (!std::all_of(path_.begin(), path_.end(),
593 [](char c) { return ::isalpha(c) || ::isdigit(c) || c == '-' || c == '_' || c == '/'; })) {
594 MS_LOG(EXCEPTION) << "Dump path only support alphabets, digit or {'-', '_', '/'}, but got:" << path_;
595 }
596 if (path_[0] != '/') {
597 MS_LOG(EXCEPTION) << "Dump path only support absolute path and should start with '/'";
598 }
599 }
600
ParseNetName(const nlohmann::json & content)601 void DumpJsonParser::ParseNetName(const nlohmann::json &content) {
602 CheckJsonStringType(content, kNetName);
603 net_name_ = content;
604 if (net_name_.empty() || !std::all_of(net_name_.begin(), net_name_.end(),
605 [](char c) { return ::isalpha(c) || ::isdigit(c) || c == '-' || c == '_'; })) {
606 MS_LOG(EXCEPTION) << "net_name only supports alphabetic, digit, or {'-', '_'}, but got: " << net_name_;
607 }
608 }
609
ParseSavedData(const nlohmann::json & content)610 void DumpJsonParser::ParseSavedData(const nlohmann::json &content) {
611 saved_data_ = kTensorDump; // default to tensor data dump
612 auto json_iter = content.find(kSavedData);
613 if (json_iter != content.end()) {
614 CheckJsonStringType(*json_iter, kSavedData);
615 saved_data_ = *json_iter;
616 }
617 if (e2e_dump_enabled_ && op_debug_mode_ == static_cast<uint32_t>(DUMP_LITE_EXCEPTION) && saved_data_ != kTensorDump) {
618 MS_LOG(WARNING) << "E2e exception dump only support save tensor, saved_data is set to tensor";
619 saved_data_ = kTensorDump;
620 }
621 if (saved_data_ != kStatisticDump && saved_data_ != kTensorDump && saved_data_ != kFullDump) {
622 MS_LOG(EXCEPTION) << "Dump Json parse failed, saved_data only supports statistic, tensor, or full, but got: "
623 << saved_data_ << ". Please set saved_data to either statistic, tensor, or full";
624 }
625 auto context = MsContext::GetInstance();
626 if (IsStatisticDump() && context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kCPUDevice) {
627 MS_LOG(EXCEPTION) << "Dump Json parse failed, storing statistic dump is only supported on GPU and Ascend, please "
628 "set saved_data to tensor or use a GPU or Ascend device";
629 }
630 if (IsStatisticDump() && context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice) {
631 if (!IsNpyFormat() && !e2e_dump_enabled_) {
632 MS_LOG(EXCEPTION) << "Dump Json parse failed, storing statistic dump is only supported on Ascend when "
633 "file_format is set to 'npy'.";
634 }
635 }
636 }
637
// Parses "iteration": either "all" or a '|'-separated list of steps and
// "low-high" ranges, e.g. "0|5-8|12". Only the character set is validated
// here; actual matching against a step happens in IsDumpIter/IsIterInRange.
void DumpJsonParser::ParseIteration(const nlohmann::json &content) {
  CheckJsonStringType(content, kIteration);
  auto context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context);
  if (e2e_dump_enabled_ || async_dump_enabled_) {
    iteration_ = content;
    if (iteration_.empty() || (!std::all_of(iteration_.begin(), iteration_.end(), [](char c) {
          return ::isdigit(c) || c == '-' || c == '|';
        }) && iteration_ != "all")) {
      MS_LOG(EXCEPTION) << "iteration only supports digits, {'-', '|'}, or just \"all\" but got: " << iteration_;
    }
  } else if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kCPUDevice) {
    // CPU with no dump flavor enabled is tolerated with a warning only.
    MS_LOG(WARNING) << "Dump is not enabled. ";
  } else {
    MS_LOG(EXCEPTION) << "Dump Json Parse Failed. Async or E2E should be enabled. ";
  }
}
655
IsIterInRange(uint32_t iteration,const std::string & range)656 bool IsIterInRange(uint32_t iteration, const std::string &range) {
657 if (range.empty()) {
658 return false;
659 }
660 const std::string dash = "-";
661 std::size_t range_idx = range.find(dash);
662 // no dash in range, compare the value directly
663 if (range_idx == std::string::npos) {
664 size_t range_d = 0;
665 if (!CheckStoul(&range_d, range)) {
666 MS_LOG(INFO) << "Failed to convert the single step range: " << range
667 << " into an integer, so the iteration: " << iteration << " is regarded as not in dump range.";
668 return false;
669 }
670 return iteration == range_d;
671 }
672 // make sure there is only one dash in range
673 if (range.find(dash, range_idx + 1) != std::string::npos) {
674 return false;
675 }
676 auto low_range_str = range.substr(0, range_idx);
677 auto high_range_str = range.substr(range_idx + 1);
678 if (low_range_str.empty() || high_range_str.empty()) {
679 return false;
680 }
681 size_t low_range = 0;
682 if (!CheckStoul(&low_range, low_range_str)) {
683 MS_LOG(INFO) << "Failed to convert the low_range_str: " << low_range_str
684 << " into an integer, so the iteration: " << iteration << " is regarded as not in dump range.";
685 return false;
686 }
687 size_t high_range = 0;
688 if (!CheckStoul(&high_range, high_range_str)) {
689 MS_LOG(INFO) << "Failed to convert the high_range_str: " << high_range_str
690 << " into an integer, so the iteration: " << iteration << " is regarded as not in dump range.";
691 return false;
692 }
693 return (low_range <= iteration) && (iteration <= high_range);
694 }
695
// Statistic dump is requested when saved_data is "statistic" or "full".
bool DumpJsonParser::IsStatisticDump() const { return saved_data_ == kStatisticDump || IsFullDump(); }
697
// Tensor dump is requested when saved_data is "tensor" or "full".
bool DumpJsonParser::IsTensorDump() const { return saved_data_ == kTensorDump || IsFullDump(); }
699
// "full" requests both tensor and statistic dump.
bool DumpJsonParser::IsFullDump() const { return saved_data_ == kFullDump; }
701
// True when the configured output file format is npy.
bool DumpJsonParser::IsNpyFormat() const { return file_format_ == JsonFileFormat::FORMAT_NPY; }
703
IsDumpIter(uint32_t iteration) const704 bool DumpJsonParser::IsDumpIter(uint32_t iteration) const {
705 // bool DumpJsonParser::IsDumpIter(uint32_t iteration) --> checks if iteration should be dumped or not.
706 if (iteration_ == "all") {
707 return true;
708 }
709 const std::string vertical_bar = "|";
710 std::size_t start = 0;
711 std::size_t end = iteration_.find(vertical_bar);
712 while (end != std::string::npos) {
713 std::string temp = iteration_.substr(start, end - start);
714 auto found = IsIterInRange(iteration, temp);
715 if (found) {
716 return true;
717 }
718 start = end + 1;
719 end = iteration_.find(vertical_bar, start);
720 }
721 std::string temp = iteration_.substr(start);
722 return IsIterInRange(iteration, temp);
723 }
724
ParseInputOutput(const nlohmann::json & content)725 void DumpJsonParser::ParseInputOutput(const nlohmann::json &content) {
726 CheckJsonUnsignedType(content, kInputOutput);
727 input_output_ = content;
728 const uint32_t max_inout_num = 2;
729 if (input_output_ > max_inout_num) {
730 MS_LOG(EXCEPTION) << "Dump Json Parse Failed. input_output should be 0, 1, 2";
731 }
732 }
733
// Parses the "kernels" list (used only when dump_mode is 1). Entries fall
// into three classes:
//   * "name-regex(<pattern>)"       -> compiled regex, stored in kernel_regs_;
//   * names without '/' and "-op"   -> operator types, stored in kernel_types_;
//   * anything else                 -> exact kernel names, stored in kernels_.
void DumpJsonParser::ParseKernels(const nlohmann::json &content) {
  CheckJsonArrayType(content, kKernels);
  if (dump_mode_ != static_cast<uint32_t>(DUMP_KERNEL)) {
    MS_LOG(INFO) << "Dump config field <" << kKernels << "> is not used as the dump mode is not 1.";
    return;
  }
  kernels_json_ = content;
  auto context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context);
  std::string backend = context->backend_policy();
  for (const auto &kernel : content) {
    bool ret;
    auto kernel_str = kernel.dump();
    MS_LOG(INFO) << "Need dump kernel:" << kernel_str;
    // Strip the quotes added by json::dump() and any embedded spaces.
    kernel_str.erase(std::remove(kernel_str.begin(), kernel_str.end(), '\"'), kernel_str.end());
    kernel_str.erase(std::remove(kernel_str.begin(), kernel_str.end(), ' '), kernel_str.end());
    if (kernel_str == "") {
      continue;
    }
    // kRegexPrefixLength (11) is the length of "name-regex("; kBracketsOffset
    // accounts for the trailing ')'.
    if (static_cast<int>(kernel_str.find("name-regex(")) == 0 &&
        static_cast<int>(kernel_str.rfind(")")) == static_cast<int>(kernel_str.length()) - kBracketsOffset) {
      std::string kernel_reg_exp = kernel_str.substr(
        kRegexPrefixLength, static_cast<int>(kernel_str.length()) - kRegexPrefixLength - kBracketsOffset);
      ret = kernel_regs_.try_emplace(kernel_str, std::regex(kernel_reg_exp)).second;
      dump_layer_ += kernel_str + " ";
    } else {
      if (static_cast<int>(kernel_str.rfind('/')) == -1 && static_cast<int>(kernel_str.rfind("-op")) == -1) {
        if (backend == "ge") {
          MS_LOG(WARNING) << "It is not supported to specify operator types on 1980B backend. " << kernel_str
                          << " maybe not take effect.";
          dump_layer_ += kernel_str + " ";
        }
        ret = kernel_types_.try_emplace({kernel_str, 0}).second;
      } else {
        ret = kernels_.try_emplace({kernel_str, 0}).second;
        dump_layer_ += kernel_str + " ";
      }
    }
    kernel_strings_.try_emplace({kernel_str, 0});
    if (!ret) {
      MS_LOG(WARNING) << "Duplicate dump kernel name:" << kernel_str;
    }
    // NOTE(review): kernel_strings_ cannot be empty immediately after the
    // try_emplace above, so this branch looks unreachable — confirm whether
    // it can be removed.
    if (kernel_strings_.empty()) {
      kernel_types_.try_emplace({"", 0});
      kernel_strings_.try_emplace({"", 0});
    }
  }
}
782
ParseStatCalcMode(const nlohmann::json & content)783 void DumpJsonParser::ParseStatCalcMode(const nlohmann::json &content) {
784 auto iter = content.find(kStatCalcMode);
785 stat_calc_mode_ = kHost;
786 if (iter == content.end()) {
787 MS_LOG(INFO) << "'stat_calc_mode' is not set, default is " << stat_calc_mode_;
788 return;
789 }
790 CheckJsonStringType(*iter, kStatCalcMode);
791 std::string calc_mode = *iter;
792 if (calc_mode != kHost && calc_mode != kDevice) {
793 MS_LOG(EXCEPTION) << "Dump Json parse failed, 'stat_calc_mode' only supports 'host' or 'device', but got: "
794 << calc_mode << ". Please set 'stat_cal_mode' to 'host' or 'device'";
795 }
796 stat_calc_mode_ = calc_mode;
797 }
798
CheckStatCalcModeVaild()799 void DumpJsonParser::CheckStatCalcModeVaild() {
800 if (IsTensorDump() && stat_calc_mode_ == kDevice) {
801 MS_LOG(WARNING) << "When 'saved_data' is 'tensor' or 'full', the device cannot be used to calculate statistics and "
802 "the 'stat_calc_mode' is forced to 'host'.";
803 stat_calc_mode_ = kHost;
804 }
805 auto context = MsContext::GetInstance();
806 MS_EXCEPTION_IF_NULL(context);
807 auto device_target = context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
808 if (device_target != kAscendDevice && stat_calc_mode_ == kDevice) {
809 MS_LOG(WARNING)
810 << "The 'device' option of 'stat_calc_mode' currently only supports the ascend platform. The current platform is "
811 << device_target << ", and the 'stat_calc_mode' option is forcibly set to 'host'.";
812 stat_calc_mode_ = kHost;
813 }
814 MS_LOG(INFO) << "stat_calc_mode is set to " << stat_calc_mode_;
815 }
816
IsDeviceCalcStats() const817 bool DumpJsonParser::IsDeviceCalcStats() const { return stat_calc_mode_ == kDevice; }
818
ParseSupportDevice(const nlohmann::json & content)819 void DumpJsonParser::ParseSupportDevice(const nlohmann::json &content) {
820 CheckJsonArrayType(content, kSupportDevice);
821 for (const auto &device : content) {
822 uint32_t device_id = device;
823 MS_LOG(INFO) << "Dump support device:" << device_id;
824 auto ret = support_devices_.emplace(device_id);
825 if (!ret.second) {
826 MS_LOG(WARNING) << "Duplicate support device:" << device_id;
827 }
828 }
829 }
830
ParseEnable(const nlohmann::json & content) const831 bool DumpJsonParser::ParseEnable(const nlohmann::json &content) const {
832 if (!content.is_boolean()) {
833 MS_LOG(EXCEPTION) << "Dump Json Parse Failed. 'enable' should be boolean type";
834 }
835 return content;
836 }
837
ParseSampleMode(const nlohmann::json & content)838 void DumpJsonParser::ParseSampleMode(const nlohmann::json &content) {
839 CheckJsonUnsignedType(content, kSampleMode);
840 sample_mode_ = content;
841 const uint32_t max_inout_num = 1;
842 if (sample_mode_ > max_inout_num) {
843 MS_LOG(EXCEPTION) << "Dump Json Parse Failed. sample_mode should be 0, 1";
844 }
845 }
846
ParseSampleNum(const nlohmann::json & content)847 void DumpJsonParser::ParseSampleNum(const nlohmann::json &content) {
848 CheckJsonUnsignedType(content, kSampleMode);
849 sample_num_ = content;
850 const uint32_t min_inout_num = 1;
851 if (sample_num_ < min_inout_num) {
852 MS_LOG(EXCEPTION) << "Dump Json Parse Failed. sample_num should be greater than 0";
853 }
854 }
855
/*
 * Parse the 'op_debug_mode' field and validate it against the active dump
 * method (e2e dump, acl dump or async dump); the set of accepted values
 * differs per method, as spelled out in the exception messages below.
 */
void DumpJsonParser::ParseOpDebugMode(const nlohmann::json &content) {
  CheckJsonUnsignedType(content, kOpDebugMode);
  op_debug_mode_ = content;
  switch (op_debug_mode_) {
    case static_cast<uint32_t>(DUMP_WHOLE):
      // Always accepted.
      break;
    case static_cast<uint32_t>(DUMP_AICORE_OVERFLOW):
    case static_cast<uint32_t>(DUMP_ATOMIC_OVERFLOW):
      // These two overflow modes are rejected when e2e dump is enabled.
      if (e2e_dump_enabled_) {
        MS_LOG(EXCEPTION) << "Dump Json Parse Failed. op_debug_mode should be 0, 3, 4";
      }
      break;
    case static_cast<uint32_t>(DUMP_BOTH_OVERFLOW): {
      // For e2e overflow dump, force dump_mode to DUMP_ALL.
      if (e2e_dump_enabled_) {
        dump_mode_ = static_cast<uint32_t>(DUMP_ALL);
      }
      break;
    }
    case static_cast<uint32_t>(DUMP_LITE_EXCEPTION): {
      auto context = MsContext::GetInstance();
      MS_EXCEPTION_IF_NULL(context);
      auto device_target = context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
      if (device_target == "CPU" || device_target == "GPU") {
        MS_LOG(WARNING) << "Abnormal dump is not supported on " << device_target
                        << " backend, and none operator data would be saved when abnormal dump is enabled. ";
      }
      // Exception dump is only meaningful for acl dump or e2e dump; for e2e it
      // also overrides iteration selection and sampling.
      if (IsAclDump() || e2e_dump_enabled_) {
        if (e2e_dump_enabled_ && iteration_ != "all") {
          MS_LOG(WARNING) << "For e2e exception dump, it is not support to specify iteration, set iteration to all.";
          iteration_ = "all";
        }
        if (e2e_dump_enabled_ && sample_mode_ != 0) {
          MS_LOG(WARNING) << "For e2e exception dump, it is not support to sample dump, set sample_mode to 0, the "
                             "whole tensor would be saved when exception occur.";
          sample_mode_ = 0;
        }
        break;
      } else {
        MS_LOG(EXCEPTION) << "Dump Json Parse Failed. op_debug_mode should be 0, 1, 2, 3";
      }
    }
    default:
      // Unknown value: report the value set that is valid for the active dump method.
      if (IsAclDump()) {
        MS_LOG(EXCEPTION) << "Dump Json Parse Failed. op_debug_mode should be 0, 1, 2, 3, 4";
      } else if (e2e_dump_enabled_) {
        MS_LOG(EXCEPTION) << "Dump Json Parse Failed. op_debug_mode should be 0, 3, 4";
      } else {
        MS_LOG(EXCEPTION) << "Dump Json Parse Failed. op_debug_mode should be 0, 1, 2, 3";
      }
  }
}
907
ParseFileFormat(const nlohmann::json & content)908 void DumpJsonParser::ParseFileFormat(const nlohmann::json &content) {
909 auto iter = content.find(kFileFormat);
910 if (iter == content.end()) {
911 file_format_ = JsonFileFormat::FORMAT_BIN;
912 } else {
913 CheckJsonStringType(*iter, kFileFormat);
914 std::string file_format = *iter;
915 const std::map<std::string, JsonFileFormat> str_to_fmt_enum = {{"bin", JsonFileFormat::FORMAT_BIN},
916 {"npy", JsonFileFormat::FORMAT_NPY}};
917 if (str_to_fmt_enum.find(file_format) == str_to_fmt_enum.end()) {
918 MS_LOG(EXCEPTION) << "Dump Json Parse Failed. 'file_format' should be either 'npy' or 'bin', but got: "
919 << file_format;
920 }
921 file_format_ = str_to_fmt_enum.at(file_format);
922 }
923 }
924
JsonConfigToString()925 void DumpJsonParser::JsonConfigToString() {
926 std::string cur_config;
927 cur_config.append("dump_mode:");
928 cur_config.append(std::to_string(dump_mode_));
929 cur_config.append(" path:");
930 cur_config.append(path_);
931 cur_config.append(" net_name:");
932 cur_config.append(net_name_);
933 cur_config.append(" iteration:");
934 cur_config.append(iteration_);
935 cur_config.append(" input_output:");
936 cur_config.append(std::to_string(input_output_));
937 cur_config.append("e2e_enable:");
938 cur_config.append(std::to_string(static_cast<int>(e2e_dump_enabled_)));
939 cur_config.append(" async_dump_enable:");
940 cur_config.append(std::to_string(static_cast<int>(async_dump_enabled_)));
941 MS_LOG(INFO) << cur_config;
942 }
943
JudgeDumpEnabled()944 void DumpJsonParser::JudgeDumpEnabled() {
945 auto context = MsContext::GetInstance();
946 MS_EXCEPTION_IF_NULL(context);
947 if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice) {
948 async_dump_enabled_ = false;
949 }
950
951 if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice) {
952 if (async_dump_enabled_ && e2e_dump_enabled_) {
953 async_dump_enabled_ = false;
954 MS_LOG(INFO) << "Disable async dump";
955 }
956 }
957
958 if (!async_dump_enabled_ && !e2e_dump_enabled_) {
959 MS_LOG(WARNING) << "Dump json parse failed. Dump is not enabled";
960 }
961 if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) != kCPUDevice) {
962 auto device_id = context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
963 if (support_devices_.find(device_id) == support_devices_.end()) {
964 async_dump_enabled_ = false;
965 e2e_dump_enabled_ = false;
966 MS_LOG(WARNING) << "Dump is not enabled. device_id:" << device_id << " not support";
967 }
968 }
969 if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice) {
970 if (async_dump_enabled_ && !IsAclDump()) {
971 if (context->IsKByKExecutorMode()) {
972 MS_LOG(WARNING)
973 << "When jit_level is set to 'o0' or 'o1', async_dump only support acl dump method, ie. set environment "
974 "MS_ACL_DUMP_CFG_PATH to the same path with MINDSPORE_DUMP_CONFIG. In fact, e2e dump is preferable.";
975 }
976 }
977 }
978 JsonConfigToString();
979 }
980
981 /*
982 * Feature group: Dump.
983 * Target device group: Ascend, GPU and CPU.
984 * Runtime category: Old runtime, MindRT.
985 * Description: Check if the given op needs to be dumped based the configuration option.
986 */
bool DumpJsonParser::NeedDump(const std::string &op_full_name) {
  bool need_dump = false;

  switch (dump_mode_) {
    case DUMP_ALL:
      // Dump every operator.
      need_dump = true;
      break;
    case DUMP_KERNEL:
      // Stage 1: regex patterns registered from "name-regex(...)" entries.
      for (const auto &iter : kernel_regs_) {
        if (regex_match(op_full_name, iter.second)) {
          need_dump = true;
          MatchKernel(iter.first);
          break;
        }
      }
      if (need_dump) {
        break;
      }
      // Stage 2: exact full-name match.
      if (kernels_.find(op_full_name) != kernels_.end()) {
        need_dump = true;
        MatchKernel(op_full_name);
        break;
      }
      // Stage 3: operator-type match. The op name is the substring between
      // the last '/' and the last '-' (or the end of the string when there is
      // no '-'), compared case-insensitively against each configured type.
      for (const auto &iter : kernel_types_) {
        int start_index = static_cast<int>(op_full_name.rfind('/')) + 1;
        int end_index = static_cast<int>(op_full_name.rfind('-'));
        if (end_index == -1) {
          end_index = static_cast<int>(op_full_name.length());
        }
        std::string op_name = op_full_name.substr(start_index, end_index - start_index);
        transform(op_name.begin(), op_name.end(), op_name.begin(), ::tolower);
        std::string kernel_type(iter.first);
        transform(kernel_type.begin(), kernel_type.end(), kernel_type.begin(), ::tolower);
        // Substring containment, not equality: a type entry matches any op
        // name that contains it.
        if (op_name.find(kernel_type) != std::string::npos) {
          need_dump = true;
          MatchKernel(kernel_type);
          break;
        }
      }
      break;
    case DUMP_KERNELS_WITH_FLAG:
      // Only kernels collected by GetCellDumpFlag are dumped.
      if (std::find(cell_dump_kernels_.begin(), cell_dump_kernels_.end(), op_full_name) != cell_dump_kernels_.end()) {
        need_dump = true;
      }
      break;
    default:
      break;
  }
  return need_dump;
}
1037
1038 /*
1039 * Feature group: Dump.
1040 * Target device group: Ascend, GPU and CPU.
1041 * Runtime category: Old runtime, MindRT.
1042 * Description: Increment the count of dumping for given kernel.
1043 */
MatchKernel(const std::string & kernel_name)1044 void DumpJsonParser::MatchKernel(const std::string &kernel_name) {
1045 auto iter = kernel_strings_.find(kernel_name);
1046 if (iter == kernel_strings_.end()) {
1047 return;
1048 }
1049 iter->second = iter->second + 1;
1050 MS_LOG(INFO) << "Match dump kernel:" << iter->first << " match times:" << iter->second;
1051 }
1052
PrintUnusedKernel()1053 void DumpJsonParser::PrintUnusedKernel() {
1054 if ((!e2e_dump_enabled_ && !async_dump_enabled_) || dump_mode_ != static_cast<uint32_t>(DUMP_KERNEL)) {
1055 return;
1056 }
1057 for (const auto &iter : kernel_strings_) {
1058 if (iter.second == 0) {
1059 MS_LOG(WARNING) << "[DataDump] Unused Kernel in json: " << iter.first;
1060 }
1061 }
1062 }
1063
1064 /*
1065 * Feature group: Online debugger.
1066 * Target device group: Ascend.
1067 * Runtime category: Old runtime, MindRT.
1068 * Description: Generate the directory path where overflow bin file locates.
1069 */
GetOpOverflowBinPath(uint32_t graph_id) const1070 std::string DumpJsonParser::GetOpOverflowBinPath(uint32_t graph_id) const {
1071 std::string bin_path;
1072 bin_path.append(path_);
1073 bin_path.append("/");
1074 bin_path.append("rank_");
1075
1076 uint32_t rank_id = 0;
1077 auto ms_context = MsContext::GetInstance();
1078 MS_EXCEPTION_IF_NULL(ms_context);
1079 auto env_rank_id = common::GetEnv("RANK_ID");
1080 if (ms_context->get_param<bool>(MS_CTX_ENABLE_HCCL) && !env_rank_id.empty()) {
1081 // get actual rank id if it's distribution training case.
1082 if (!CommManager::GetInstance().GetRankID(kHcclWorldGroup, &rank_id)) {
1083 MS_LOG(INFO) << "Failed to get rank id.";
1084 }
1085 }
1086 bin_path.append(std::to_string(rank_id));
1087
1088 bin_path.append("/");
1089 bin_path.append(net_name_);
1090 bin_path.append("/");
1091 bin_path.append(std::to_string(graph_id));
1092 bin_path.append("/");
1093 bin_path.append(std::to_string(cur_dump_iter_));
1094 bin_path.append("/");
1095
1096 return bin_path;
1097 }
1098
InputNeedDump() const1099 bool DumpJsonParser::InputNeedDump() const {
1100 return input_output_ == kDumpInputAndOutput || input_output_ == kDumpInputOnly;
1101 }
1102
OutputNeedDump() const1103 bool DumpJsonParser::OutputNeedDump() const {
1104 return input_output_ == kDumpInputAndOutput || input_output_ == kDumpOutputOnly;
1105 }
1106
1107 /*
1108 * Feature group: Dump.
1109 * Target device group: Ascend.
1110 * Runtime category: Old runtime, MindRT.
1111 * Description: Obtain the cell dump flag of each operators in the given kernel graph.
1112 */
GetCellDumpFlag(const session::KernelGraph & kernel_graph)1113 void DumpJsonParser::GetCellDumpFlag(const session::KernelGraph &kernel_graph) {
1114 if (dump_mode_ != static_cast<uint32_t>(DUMP_KERNELS_WITH_FLAG)) {
1115 return;
1116 }
1117 for (const auto &kernel : kernel_graph.execution_order()) {
1118 MS_EXCEPTION_IF_NULL(kernel);
1119 auto dump_flag = common::AnfAlgo::GetDumpFlag(kernel);
1120 if (dump_flag.has_value() && dump_flag.value().compare("true") == 0) {
1121 MS_LOG(DEBUG) << "Dump flag is true for " << GetKernelNodeName(kernel);
1122 cell_dump_kernels_.push_back(GetKernelNodeName(kernel));
1123 }
1124 }
1125 }
1126
/*
 * Refresh per-graph dump bookkeeping: collect cell dump flags for the graph
 * and, when async dump is enabled, record the CNode inputs of HCCL kernels
 * that are selected for input dumping (stored in hccl_input_kernels_).
 */
void DumpJsonParser::UpdateNeedDumpKernels(const session::KernelGraph &kernel_graph) {
  MS_LOG(INFO) << "Get kernel dump flag";
  GetCellDumpFlag(kernel_graph);

  // The HCCL-input bookkeeping below only applies to async dump.
  if (!async_dump_enabled_) {
    return;
  }

  MS_LOG(INFO) << "Update async dump kernel list for hccl";
  for (const auto &kernel : kernel_graph.execution_order()) {
    MS_EXCEPTION_IF_NULL(kernel);
    // Only HCCL kernels that are themselves selected for dumping, with input
    // dumping requested, contribute entries.
    if (AnfAlgo::GetKernelType(kernel) == HCCL_KERNEL &&
        DumpJsonParser::GetInstance().NeedDump(GetKernelNodeName(kernel)) &&
        DumpJsonParser::GetInstance().InputNeedDump()) {
      auto input_size = common::AnfAlgo::GetInputTensorNum(kernel);
      for (size_t i = 0; i < input_size; ++i) {
        auto input_with_index = common::AnfAlgo::GetPrevNodeOutput(kernel, i);
        auto input = input_with_index.first;
        MS_EXCEPTION_IF_NULL(input);
        // Only CNode producers are recorded (parameters/values are skipped).
        if (input->isa<CNode>()) {
          MS_LOG(INFO) << "[AsyncDump] Match Hccl Node:" << GetKernelNodeName(kernel)
                       << " Input:" << GetKernelNodeName(input);
          hccl_input_kernels_.insert(GetKernelNodeName(input));
        }
      }
    }
  }
}
1155
IsHCCLKernelInput(const std::string & kernel_name) const1156 bool DumpJsonParser::IsHCCLKernelInput(const std::string &kernel_name) const {
1157 if (hccl_input_kernels_.empty()) {
1158 return false;
1159 }
1160 auto iter = std::find(hccl_input_kernels_.begin(), hccl_input_kernels_.end(), kernel_name);
1161 if (iter != hccl_input_kernels_.end()) {
1162 return true;
1163 }
1164 return false;
1165 }
1166
IsAclDump()1167 bool DumpJsonParser::IsAclDump() {
1168 bool is_acl_dump = false;
1169 auto env_enable_kbk = common::GetEnv("MS_ACL_DUMP_CFG_PATH");
1170 auto dump_enable_kbk = common::GetEnv("MINDSPORE_DUMP_CONFIG");
1171 if (!env_enable_kbk.empty() && env_enable_kbk == dump_enable_kbk) {
1172 is_acl_dump = true;
1173 }
1174 return is_acl_dump;
1175 }
1176 } // namespace mindspore
1177