1 /**
2 * Copyright 2020-2021 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "debug/data_dump/dump_json_parser.h"
17 #include <fstream>
18 #include "utils/log_adapter.h"
19 #include "debug/common.h"
20 #include "utils/ms_context.h"
21 #include "utils/convert_utils_base.h"
22 #include "backend/session/anf_runtime_algorithm.h"
23 #include "debug/data_dump/npy_header.h"
24 #include "debug/anf_ir_utils.h"
25 #include "utils/comm_manager.h"
26
namespace {
// Names of the top-level sections of the dump configuration json.
// (kAsyncDumpSettings is not referenced in the visible part of this file.)
constexpr auto kCommonDumpSettings = "common_dump_settings";
constexpr auto kAsyncDumpSettings = "async_dump_settings";
constexpr auto kE2eDumpSettings = "e2e_dump_settings";

// Keys that are looked up inside the "common_dump_settings" section.
constexpr auto kDumpMode = "dump_mode";
constexpr auto kPath = "path";
constexpr auto kNetName = "net_name";
constexpr auto kIteration = "iteration";
constexpr auto kInputOutput = "input_output";
constexpr auto kKernels = "kernels";
constexpr auto kSupportDevice = "support_device";
constexpr auto kOpDebugMode = "op_debug_mode";

// Keys that are looked up inside the "e2e_dump_settings" section.
constexpr auto kEnable = "enable";
constexpr auto kTransFlag = "trans_flag";

// Values of the "input_output" setting that InputNeedDump/OutputNeedDump compare against.
constexpr auto kDumpInputAndOutput = 0;
constexpr auto kDumpInputOnly = 1;
constexpr auto kDumpOutputOnly = 2;

// Environment variable that holds the location of the dump configuration file.
constexpr auto kMindsporeDumpConfig = "MINDSPORE_DUMP_CONFIG";
}  // namespace
46
47 namespace mindspore {
CheckJsonKeyExist(const nlohmann::json & content,const std::string & key)48 auto DumpJsonParser::CheckJsonKeyExist(const nlohmann::json &content, const std::string &key) {
49 auto iter = content.find(key);
50 if (iter == content.end()) {
51 MS_LOG(EXCEPTION) << "Check dump json failed, " << key << " not found";
52 }
53 return iter;
54 }
55
// Returns everything that can still be read from `ifstream`, starting at the current position of its buffer.
std::string GetIfstreamString(const std::ifstream &ifstream) {
  std::ostringstream collected;
  collected << ifstream.rdbuf();
  return collected.str();
}
61
IsDumpEnabled()62 bool DumpJsonParser::IsDumpEnabled() {
63 auto single_op = common::GetEnv(kGraphOpRun);
64 auto config_path = common::GetEnv(kMindsporeDumpConfig);
65 if (config_path.empty()) {
66 return false;
67 }
68 if (!single_op.empty() && single_op == "1") {
69 MS_LOG(WARNING) << "Dump is not supported when task is not sink. Please set env GRAPH_OP_RUN to 0 to enable task "
70 "sink, so that the data can be dumped.";
71 return false;
72 }
73 MS_LOG(INFO) << "Dump config path is " << config_path;
74
75 auto context = MsContext::GetInstance();
76 MS_EXCEPTION_IF_NULL(context);
77 if (context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode) {
78 MS_LOG(WARNING) << "Dump is disabled in PyNative mode";
79 return false;
80 }
81 return true;
82 }
83
Parse()84 void DumpJsonParser::Parse() {
85 std::lock_guard<std::mutex> guard(lock_);
86 if (already_parsed_) {
87 return;
88 }
89 already_parsed_ = true;
90 if (!IsDumpEnabled()) {
91 return;
92 }
93
94 auto dump_config_file = Common::GetConfigFile(kMindsporeDumpConfig);
95 if (!dump_config_file.has_value()) {
96 MS_LOG(EXCEPTION) << "Get dump config file failed";
97 }
98
99 std::ifstream json_file(dump_config_file.value());
100 if (!json_file.is_open()) {
101 MS_LOG(EXCEPTION) << "Dump file:" << dump_config_file.value() << " open failed."
102 << " Errno:" << errno;
103 }
104
105 nlohmann::json j;
106 try {
107 json_file >> j;
108 } catch (nlohmann::json::parse_error &e) {
109 MS_LOG(ERROR) << "Dump json contents:" << GetIfstreamString(json_file);
110 json_file.close();
111 MS_LOG(EXCEPTION) << "Parse dump json failed, error:" << e.what();
112 }
113
114 // convert json to string
115 std::stringstream ss;
116 ss << j;
117 std::string cfg = ss.str();
118 json_file.close();
119 MS_LOG(INFO) << "Dump json:" << cfg;
120
121 ParseE2eDumpSetting(j);
122 ParseCommonDumpSetting(j);
123 JudgeDumpEnabled();
124 }
125
WriteJsonFile(const std::string & file_path,const std::ifstream & json_file)126 void WriteJsonFile(const std::string &file_path, const std::ifstream &json_file) {
127 ChangeFileMode(file_path, S_IWUSR);
128 std::ofstream json_copy(file_path);
129 if (!json_copy.is_open()) {
130 MS_LOG(EXCEPTION) << "Json file " << file_path << "open failed!";
131 }
132 json_copy << json_file.rdbuf();
133 json_copy.close();
134 ChangeFileMode(file_path, S_IRUSR);
135 }
136
CopyDumpJsonToDir(uint32_t rank_id)137 void DumpJsonParser::CopyDumpJsonToDir(uint32_t rank_id) {
138 this->Parse();
139 if (!IsDumpEnabled()) {
140 return;
141 }
142 auto dump_config_file = Common::GetConfigFile(kMindsporeDumpConfig);
143 if (!dump_config_file.has_value()) {
144 MS_LOG(EXCEPTION) << "Get dump config file failed.";
145 }
146 std::ifstream json_file(dump_config_file.value());
147 if (async_dump_enabled_ || e2e_dump_enabled_) {
148 auto realpath =
149 Common::CreatePrefixPath(path_ + "/rank_" + std::to_string(rank_id) + "/.dump_metadata/data_dump.json");
150 if (!realpath.has_value()) {
151 MS_LOG(ERROR) << "Get real path failed in CopyDumpJsonToDir.";
152 } else {
153 WriteJsonFile(realpath.value(), json_file);
154 }
155 }
156 }
157
CopyHcclJsonToDir(uint32_t rank_id)158 void DumpJsonParser::CopyHcclJsonToDir(uint32_t rank_id) {
159 if (!IsDumpEnabled()) {
160 return;
161 }
162 std::string config_path = common::GetEnv("MINDSPORE_HCCL_CONFIG_PATH");
163 if (config_path.empty()) {
164 config_path = common::GetEnv("RANK_TABLE_FILE");
165 if (config_path.empty()) {
166 MS_LOG(INFO) << "Get hccl json config failed.";
167 return;
168 }
169 }
170 std::ifstream json_file(config_path);
171 auto realpath = Common::CreatePrefixPath(path_ + "/rank_" + std::to_string(rank_id) + "/.dump_metadata/hccl.json");
172 if (!realpath.has_value()) {
173 MS_LOG(ERROR) << "Get real path failed in CopyHcclJsonToDir.";
174 } else {
175 WriteJsonFile(realpath.value(), json_file);
176 }
177 }
178
CopyMSCfgJsonToDir(uint32_t rank_id)179 void DumpJsonParser::CopyMSCfgJsonToDir(uint32_t rank_id) {
180 if (!IsDumpEnabled()) {
181 return;
182 }
183 auto realpath = Common::CreatePrefixPath(path_ + "/rank_" + std::to_string(rank_id) + "/.dump_metadata/config.json");
184 if (!realpath.has_value()) {
185 MS_LOG(ERROR) << "Get real path failed in CopyMSConfigJsonToDir.";
186 } else {
187 nlohmann::json ms_info;
188 auto context = MsContext::GetInstance();
189 MS_EXCEPTION_IF_NULL(context);
190 ms_info["device_target"] = context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
191 ms_info["ms_version"] = "1.5.0";
192 const std::string file_path = realpath.value();
193 ChangeFileMode(file_path, S_IWUSR);
194 std::ofstream json_create(file_path);
195 if (!json_create.is_open()) {
196 MS_LOG(EXCEPTION) << "Json file " << file_path << "open failed!";
197 }
198 json_create << ms_info;
199 json_create.close();
200 ChangeFileMode(file_path, S_IRUSR);
201 }
202 }
203
GetIterDumpFlag() const204 bool DumpJsonParser::GetIterDumpFlag() const { return e2e_dump_enabled_ && IsDumpIter(cur_dump_iter_); }
205
DumpToFile(const std::string & filename,const void * data,size_t len,const ShapeVector & shape,TypeId type)206 bool DumpJsonParser::DumpToFile(const std::string &filename, const void *data, size_t len, const ShapeVector &shape,
207 TypeId type) {
208 if (filename.empty() || data == nullptr || len == 0) {
209 MS_LOG(ERROR) << "Incorrect parameter.";
210 return false;
211 }
212
213 auto file_path = Common::CreatePrefixPath(filename + ".npy");
214 if (!file_path.has_value()) {
215 MS_LOG(ERROR) << "CreatePrefixPath failed.";
216 return false;
217 }
218 const std::string file_path_str = file_path.value();
219 ChangeFileMode(file_path_str, S_IWUSR);
220 std::ofstream fd(file_path_str, std::ios::out | std::ios::trunc | std::ios::binary);
221 if (!fd.is_open()) {
222 MS_LOG(EXCEPTION) << "Open file " << file_path_str << " failed." << ErrnoToString(errno);
223 }
224 std::string npy_header = GenerateNpyHeader(shape, type);
225 if (!npy_header.empty()) {
226 fd << npy_header;
227 (void)fd.write(reinterpret_cast<const char *>(data), SizeToLong(len));
228 if (fd.bad()) {
229 fd.close();
230 MS_LOG(EXCEPTION) << "Write mem to file " << file_path_str << " failed.";
231 }
232 fd.close();
233 ChangeFileMode(file_path_str, S_IRUSR);
234 }
235 return true;
236 }
237
ParseCommonDumpSetting(const nlohmann::json & content)238 void DumpJsonParser::ParseCommonDumpSetting(const nlohmann::json &content) {
239 // async_dump is enabled by default, if e2e dump is enabled it will override this
240 auto context = MsContext::GetInstance();
241 MS_EXCEPTION_IF_NULL(context);
242 if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice) {
243 async_dump_enabled_ = true;
244 } else if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice) {
245 if (!e2e_dump_enabled_) {
246 e2e_dump_enabled_ = true;
247 trans_flag_ = true;
248 }
249 }
250
251 auto common_dump_settings = CheckJsonKeyExist(content, kCommonDumpSettings);
252 auto dump_mode = CheckJsonKeyExist(*common_dump_settings, kDumpMode);
253 auto net_name = CheckJsonKeyExist(*common_dump_settings, kNetName);
254 auto iteration = CheckJsonKeyExist(*common_dump_settings, kIteration);
255 auto input_output = CheckJsonKeyExist(*common_dump_settings, kInputOutput);
256 auto kernels = CheckJsonKeyExist(*common_dump_settings, kKernels);
257 auto support_device = CheckJsonKeyExist(*common_dump_settings, kSupportDevice);
258
259 nlohmann::detail::iter_impl<const nlohmann::json> op_debug_mode;
260 if (!e2e_dump_enabled_) {
261 op_debug_mode = CheckJsonKeyExist(*common_dump_settings, kOpDebugMode);
262 }
263
264 ParseDumpMode(*dump_mode);
265 ParseDumpPath(*common_dump_settings); // Pass in the whole json string to parse because the path field is optional.
266 ParseNetName(*net_name);
267 ParseIteration(*iteration);
268 ParseInputOutput(*input_output);
269 ParseKernels(*kernels);
270 ParseSupportDevice(*support_device);
271 if (!e2e_dump_enabled_) {
272 ParseOpDebugMode(*op_debug_mode);
273 }
274 }
275
ParseE2eDumpSetting(const nlohmann::json & content)276 void DumpJsonParser::ParseE2eDumpSetting(const nlohmann::json &content) {
277 auto e2e_dump_setting = content.find(kE2eDumpSettings);
278 auto context = MsContext::GetInstance();
279 MS_EXCEPTION_IF_NULL(context);
280 if (e2e_dump_setting == content.end()) {
281 MS_LOG(INFO) << "No e2e_dump_settings";
282 return;
283 }
284
285 auto e2e_dump_enable = CheckJsonKeyExist(*e2e_dump_setting, kEnable);
286 auto trans_flag = CheckJsonKeyExist(*e2e_dump_setting, kTransFlag);
287
288 e2e_dump_enabled_ = ParseEnable(*e2e_dump_enable);
289 if (e2e_dump_enabled_ && context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice) {
290 MS_LOG(WARNING) << "Deprecated: Synchronous dump mode is deprecated and will be removed in a future release";
291 }
292 trans_flag_ = ParseEnable(*trans_flag);
293 }
294
CheckJsonUnsignedType(const nlohmann::json & content,const std::string & key)295 void CheckJsonUnsignedType(const nlohmann::json &content, const std::string &key) {
296 if (!content.is_number_unsigned()) {
297 MS_LOG(EXCEPTION) << "Dump config parse failed, " << key << " should be unsigned int type";
298 }
299 }
300
CheckJsonStringType(const nlohmann::json & content,const std::string & key)301 void CheckJsonStringType(const nlohmann::json &content, const std::string &key) {
302 if (!content.is_string()) {
303 MS_LOG(EXCEPTION) << "Dump config parse failed, " << key << " should be string type";
304 }
305 }
306
CheckJsonArrayType(const nlohmann::json & content,const std::string & key)307 void CheckJsonArrayType(const nlohmann::json &content, const std::string &key) {
308 if (!content.is_array()) {
309 MS_LOG(EXCEPTION) << "Dump config parse failed, " << key << " should be array type";
310 }
311 }
312
ParseDumpMode(const nlohmann::json & content)313 void DumpJsonParser::ParseDumpMode(const nlohmann::json &content) {
314 CheckJsonUnsignedType(content, kDumpMode);
315 dump_mode_ = content;
316 if (dump_mode_ != 0 && dump_mode_ != 1) {
317 MS_LOG(EXCEPTION) << "Dump config parse failed, dump_mode should be 0 or 1, but got " << dump_mode_;
318 }
319 }
320
ParseDumpPath(const nlohmann::json & content)321 void DumpJsonParser::ParseDumpPath(const nlohmann::json &content) {
322 std::string dump_path;
323 auto json_iter = content.find(kPath);
324 // Check if `path` field exists in dump json file.
325 if (json_iter != content.end()) {
326 CheckJsonStringType(*json_iter, kPath);
327 dump_path = *json_iter;
328 }
329 if (dump_path.empty()) {
330 // If no path is found or path is set as empty in dump json file, use MS_DIAGNOSTIC_DATA_PATH/debug_dump as the dump
331 // path value if the env exists.
332 dump_path = common::GetEnv("MS_DIAGNOSTIC_DATA_PATH");
333 if (dump_path.empty()) {
334 MS_LOG(EXCEPTION)
335 << "Dump path is empty. Please set it in dump json file or environment variable `MS_DIAGNOSTIC_DATA_PATH`.";
336 } else {
337 dump_path += "/debug_dump";
338 }
339 }
340 path_ = dump_path;
341 if (!std::all_of(path_.begin(), path_.end(),
342 [](char c) { return ::isalpha(c) || ::isdigit(c) || c == '-' || c == '_' || c == '/'; })) {
343 MS_LOG(EXCEPTION) << "Dump path only support alphabets, digit or {'-', '_', '/'}, but got:" << path_;
344 }
345 if (path_[0] != '/') {
346 MS_LOG(EXCEPTION) << "Dump path only support absolute path and should start with '/'";
347 }
348 }
349
ParseNetName(const nlohmann::json & content)350 void DumpJsonParser::ParseNetName(const nlohmann::json &content) {
351 CheckJsonStringType(content, kNetName);
352 net_name_ = content;
353 if (net_name_.empty() || !std::all_of(net_name_.begin(), net_name_.end(),
354 [](char c) { return ::isalpha(c) || ::isdigit(c) || c == '-' || c == '_'; })) {
355 MS_LOG(EXCEPTION) << "net_name only supports alphabetic, digit, or {'-', '_'}, but got: " << net_name_;
356 }
357 }
358
ParseIteration(const nlohmann::json & content)359 void DumpJsonParser::ParseIteration(const nlohmann::json &content) {
360 CheckJsonStringType(content, kIteration);
361 auto context = MsContext::GetInstance();
362 MS_EXCEPTION_IF_NULL(context);
363 if (e2e_dump_enabled_ || async_dump_enabled_) {
364 iteration_ = content;
365 if (iteration_.empty() || (!std::all_of(iteration_.begin(), iteration_.end(), [](char c) {
366 return ::isdigit(c) || c == '-' || c == '|';
367 }) && iteration_ != "all")) {
368 MS_LOG(EXCEPTION) << "iteration only supports digits, {'-', '|'}, or just \"all\" but got: " << iteration_;
369 }
370 } else if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kCPUDevice) {
371 MS_LOG(WARNING) << "Dump is not enabled. ";
372 } else {
373 MS_LOG(EXCEPTION) << "Dump Json Parse Failed. Async or E2E should be enabled. ";
374 }
375 }
376
// Parses `text` as a decimal iteration bound made of digits only and stores it in `*bound`.
// Returns false for an empty string or when any character is not a digit. Values larger than every uint32_t are
// saturated to 2^32 instead of overflowing, which keeps comparisons against an iteration number correct.
static bool ParseIterationBound(const std::string &text, uint64_t *bound) {
  if (text.empty()) {
    return false;
  }
  const uint64_t saturated = static_cast<uint64_t>(1) << 32;  // one past the largest uint32_t value
  uint64_t value = 0;
  for (const char c : text) {
    if (c < '0' || c > '9') {
      return false;
    }
    // Below the saturation point value * 10 + 9 always fits in 64 bits, so this cannot overflow.
    if (value < saturated) {
      value = value * 10 + static_cast<uint64_t>(c - '0');
      if (value > saturated) {
        value = saturated;
      }
    }
  }
  *bound = value;
  return true;
}

// Checks whether `iteration` is selected by `range`, which is either a single number ("5") or an inclusive
// interval written as "low-high" ("3-7").
// Returns false for an empty or malformed range (missing bound, more than one dash, non-digit characters).
// Bounds are compared in 64 bits, so a bound beyond the uint32_t range neither throws nor gets truncated.
bool IsIterInRange(uint32_t iteration, const std::string &range) {
  if (range.empty()) {
    return false;
  }
  const std::size_t dash_pos = range.find('-');
  uint64_t low_bound = 0;
  // No dash in range: compare against the single value.
  if (dash_pos == std::string::npos) {
    return ParseIterationBound(range, &low_bound) && iteration == low_bound;
  }
  // Only one dash is allowed in a range.
  if (range.find('-', dash_pos + 1) != std::string::npos) {
    return false;
  }
  uint64_t high_bound = 0;
  if (!ParseIterationBound(range.substr(0, dash_pos), &low_bound) ||
      !ParseIterationBound(range.substr(dash_pos + 1), &high_bound)) {
    return false;
  }
  return (low_bound <= iteration) && (iteration <= high_bound);
}
400
IsDumpIter(uint32_t iteration) const401 bool DumpJsonParser::IsDumpIter(uint32_t iteration) const {
402 // bool DumpJsonParser::IsDumpIter(uint32_t iteration) --> checks if iteration should be dumped or not.
403 if (iteration_ == "all") {
404 return true;
405 }
406 const std::string vertical_bar = "|";
407 std::size_t start = 0;
408 std::size_t end = iteration_.find(vertical_bar);
409 while (end != std::string::npos) {
410 std::string temp = iteration_.substr(start, end - start);
411 auto found = IsIterInRange(iteration, temp);
412 if (found) {
413 return true;
414 }
415 start = end + 1;
416 end = iteration_.find(vertical_bar, start);
417 }
418 std::string temp = iteration_.substr(start);
419 return IsIterInRange(iteration, temp);
420 }
421
ParseInputOutput(const nlohmann::json & content)422 void DumpJsonParser::ParseInputOutput(const nlohmann::json &content) {
423 CheckJsonUnsignedType(content, kInputOutput);
424 input_output_ = content;
425 const uint32_t max_inout_num = 2;
426 if (input_output_ < 0 || input_output_ > max_inout_num) {
427 MS_LOG(EXCEPTION) << "Dump Json Parse Failed. input_output should be 0, 1, 2";
428 }
429 }
430
ParseKernels(const nlohmann::json & content)431 void DumpJsonParser::ParseKernels(const nlohmann::json &content) {
432 CheckJsonArrayType(content, kKernels);
433
434 for (const auto &kernel : content) {
435 auto kernel_str = kernel.dump();
436 kernel_str.erase(std::remove(kernel_str.begin(), kernel_str.end(), '\"'), kernel_str.end());
437 MS_LOG(INFO) << "Need dump kernel:" << kernel_str;
438 auto ret = kernels_.try_emplace({kernel_str, 0});
439 if (!ret.second) {
440 MS_LOG(WARNING) << "Duplicate dump kernel name:" << kernel_str;
441 }
442 }
443 }
444
ParseSupportDevice(const nlohmann::json & content)445 void DumpJsonParser::ParseSupportDevice(const nlohmann::json &content) {
446 CheckJsonArrayType(content, kSupportDevice);
447 for (const auto &device : content) {
448 uint32_t device_id = device;
449 MS_LOG(INFO) << "Dump support device:" << device_id;
450 auto ret = support_devices_.emplace(device_id);
451 if (!ret.second) {
452 MS_LOG(WARNING) << "Duplicate support device:" << device_id;
453 }
454 }
455 }
456
ParseEnable(const nlohmann::json & content)457 bool DumpJsonParser::ParseEnable(const nlohmann::json &content) {
458 if (!content.is_boolean()) {
459 MS_LOG(EXCEPTION) << "Dump Json Parse Failed. 'enable' should be boolean type";
460 }
461 return content;
462 }
463
ParseOpDebugMode(const nlohmann::json & content)464 void DumpJsonParser::ParseOpDebugMode(const nlohmann::json &content) {
465 CheckJsonUnsignedType(content, kOpDebugMode);
466 op_debug_mode_ = content;
467 const size_t max_mode = 3;
468 if (op_debug_mode_ < 0 || op_debug_mode_ > max_mode) {
469 MS_LOG(EXCEPTION) << "Dump Json Parse Failed. op_debug_mode should be 0, 1, 2, 3";
470 }
471 }
472
JsonConfigToString()473 void DumpJsonParser::JsonConfigToString() {
474 std::string cur_config;
475 cur_config.append("dump_mode:");
476 cur_config.append(std::to_string(dump_mode_));
477 cur_config.append(" path:");
478 cur_config.append(path_);
479 cur_config.append(" net_name:");
480 cur_config.append(net_name_);
481 cur_config.append(" iteration:");
482 cur_config.append(iteration_);
483 cur_config.append(" input_output:");
484 cur_config.append(std::to_string(input_output_));
485 cur_config.append("e2e_enable:");
486 cur_config.append(std::to_string(static_cast<int>(e2e_dump_enabled_)));
487 cur_config.append(" async_dump_enable:");
488 cur_config.append(std::to_string(static_cast<int>(async_dump_enabled_)));
489 MS_LOG(INFO) << cur_config;
490 }
491
JudgeDumpEnabled()492 void DumpJsonParser::JudgeDumpEnabled() {
493 auto context = MsContext::GetInstance();
494 MS_EXCEPTION_IF_NULL(context);
495 if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice) {
496 async_dump_enabled_ = false;
497 }
498
499 if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice) {
500 if (async_dump_enabled_ && e2e_dump_enabled_) {
501 async_dump_enabled_ = false;
502 MS_LOG(INFO) << "Disable async dump";
503 }
504 }
505
506 if (!async_dump_enabled_ && !e2e_dump_enabled_) {
507 MS_LOG(WARNING) << "Dump json parse failed. Dump is not enabled";
508 }
509 if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) != kCPUDevice) {
510 auto device_id = context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
511 if (support_devices_.find(device_id) == support_devices_.end()) {
512 async_dump_enabled_ = false;
513 e2e_dump_enabled_ = false;
514 MS_LOG(WARNING) << "Dump is not enabled. device_id:" << device_id << " not support";
515 }
516 }
517 JsonConfigToString();
518 }
519
NeedDump(const std::string & op_full_name) const520 bool DumpJsonParser::NeedDump(const std::string &op_full_name) const {
521 if (dump_mode_ == 0) {
522 return true;
523 }
524 auto iter = kernels_.find(op_full_name);
525 return iter != kernels_.end();
526 }
527
MatchKernel(const std::string & kernel_name)528 void DumpJsonParser::MatchKernel(const std::string &kernel_name) {
529 auto iter = kernels_.find(kernel_name);
530 if (iter == kernels_.end()) {
531 return;
532 }
533 iter->second = iter->second + 1;
534 MS_LOG(INFO) << "Match dump kernel:" << iter->first << " match times:" << iter->second;
535 }
536
PrintUnusedKernel()537 void DumpJsonParser::PrintUnusedKernel() {
538 if (!e2e_dump_enabled_ && !async_dump_enabled_) {
539 return;
540 }
541 for (const auto &iter : kernels_) {
542 if (iter.second == 0) {
543 MS_LOG(WARNING) << "[DataDump] Unused Kernel in json:" << iter.first;
544 }
545 }
546 }
547
GetOpOverflowBinPath(uint32_t graph_id) const548 std::string DumpJsonParser::GetOpOverflowBinPath(uint32_t graph_id) const {
549 std::string bin_path;
550 bin_path.append(path_);
551 bin_path.append("/");
552 bin_path.append("rank_");
553
554 uint32_t rank_id = 0;
555 auto ms_context = MsContext::GetInstance();
556 MS_EXCEPTION_IF_NULL(ms_context);
557 auto env_rank_id = common::GetEnv("RANK_ID");
558 if (ms_context->get_param<bool>(MS_CTX_ENABLE_HCCL) && !env_rank_id.empty()) {
559 // get actual rank id if it's distribution training case.
560 if (!CommManager::GetInstance().GetRankID(kHcclWorldGroup, &rank_id)) {
561 MS_LOG(INFO) << "Failed to get rank id.";
562 }
563 }
564 bin_path.append(std::to_string(rank_id));
565
566 bin_path.append("/");
567 bin_path.append(net_name_);
568 bin_path.append("/");
569 bin_path.append(std::to_string(graph_id));
570 bin_path.append("/");
571 bin_path.append(std::to_string(cur_dump_iter_));
572 bin_path.append("/");
573
574 return bin_path;
575 }
576
InputNeedDump() const577 bool DumpJsonParser::InputNeedDump() const {
578 return input_output_ == kDumpInputAndOutput || input_output_ == kDumpInputOnly;
579 }
580
OutputNeedDump() const581 bool DumpJsonParser::OutputNeedDump() const {
582 return input_output_ == kDumpInputAndOutput || input_output_ == kDumpOutputOnly;
583 }
584
UpdateNeedDumpKernels(const session::KernelGraph & kernel_graph)585 void DumpJsonParser::UpdateNeedDumpKernels(const session::KernelGraph &kernel_graph) {
586 if (!async_dump_enabled_) {
587 return;
588 }
589 MS_LOG(INFO) << "Update async dump kernel list for hccl";
590 std::map<std::string, uint32_t> update_kernels;
591 for (const auto &kernel : kernel_graph.execution_order()) {
592 MS_EXCEPTION_IF_NULL(kernel);
593 if (AnfAlgo::GetKernelType(kernel) == HCCL_KERNEL &&
594 DumpJsonParser::GetInstance().NeedDump(GetKernelNodeName(kernel))) {
595 auto input_size = AnfAlgo::GetInputTensorNum(kernel);
596 for (size_t i = 0; i < input_size; ++i) {
597 auto input_with_index = AnfAlgo::GetPrevNodeOutput(kernel, i);
598 auto input = input_with_index.first;
599 MS_EXCEPTION_IF_NULL(input);
600 if (input->isa<CNode>()) {
601 MS_LOG(INFO) << "[AsyncDump] Match Hccl Node:" << GetKernelNodeName(kernel)
602 << " Input:" << GetKernelNodeName(input);
603 update_kernels.try_emplace(GetKernelNodeName(input), 0);
604 }
605 }
606 }
607 }
608 kernels_.insert(update_kernels.begin(), update_kernels.end());
609 }
610 } // namespace mindspore
611