1 /**
2 * Copyright 2019-2022 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "debug/debug_services.h"
17 #include <dirent.h>
18 #include <algorithm>
19 #include <functional>
20 #include <fstream>
21 #include <future>
22 #include <thread>
23 #include <iterator>
24 #include <map>
25 #include <numeric>
26 #include <limits>
27 #include <unordered_set>
28 #include <utility>
29 #include <regex>
30 #include <iomanip>
31 #include "openssl/md5.h"
32 #include "pybind11/stl.h"
33 #ifndef OFFLINE_DBG_MODE
34 #include "include/common/debug/common.h"
35 #include "include/backend/debug/debugger/debugger.h"
36 #include "include/common/debug/anf_dump_utils.h"
37 #include "include/common/utils/anfalgo.h"
38 #endif
39 #include "debug/utils.h"
40 #include "nlohmann/json.hpp"
41 #include "debug/debugger/tensor_summary.h"
42 #include "utils/file_utils.h"
43 #include "include/backend/anf_runtime_algorithm.h"
44 #include "mindspore/core/utils/ms_utils.h"
45 #include "include/backend/debug/data_dump/dump_json_parser.h"
46
47 namespace mindspore {
48 namespace {
49 static constexpr const char constant_prefix[] = "Default--data-";
50 static constexpr const char kNpyExt[] = ".npy";
51 constexpr float ms_to_s = 1000.0;
52 constexpr int precision = 2;
53 #ifndef OFFLINE_DBG_MODE
54 constexpr int md5_bit_wide = 2;
55 constexpr int md5_len = 32;
56 #endif
57 static constexpr int32_t wp_progress_period = 300;
58 #ifdef __APPLE__
59 constexpr int kStrErrorNone = 0;
60 #else
61 constexpr char *kStrErrorNone = nullptr;
62 #endif
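// Note: strerror_r has two variants. The XSI version (used on macOS) returns an int, with 0 meaning
// success, while glibc typically provides the GNU version returning a char * message. kStrErrorNone
// is the per-platform value that the error-logging code below treats as "nothing to report".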
63 } // namespace
64
65 bool IsRegFile(const std::string &file_path) {
66 struct stat st;
67 int ret = stat(file_path.c_str(), &st);
68 if (ret != 0) {
69 MS_LOG(ERROR) << "stat error for " << file_path << ", ret is: " << ret;
70 return false;
71 }
72 return S_ISREG(st.st_mode);
73 }
74
75 #ifndef OFFLINE_DBG_MODE
76 void openssl_md5(char *input, char *output, int64_t len) {
77 unsigned char digest[MD5_DIGEST_LENGTH];
78 MD5(reinterpret_cast<unsigned char *>(input), len, reinterpret_cast<unsigned char *>(digest));
79 for (int i = 0; i < MD5_DIGEST_LENGTH; i++) {
80 int rest_len = md5_len + 1 - i * md5_bit_wide;
81 auto ret =
82 snprintf_s(&output[i * md5_bit_wide], rest_len, md5_bit_wide, "%02x", static_cast<unsigned int>(digest[i]));
83 if (ret < 0) {
84       MS_LOG(ERROR) << "snprintf_s encountered an error when recording md5, which may lead to an incorrect MD5 value "
85                        "in the statistic.csv file.";
86 } else if (ret >= rest_len) {
87       MS_LOG(ERROR) << "snprintf_s output is truncated when recording md5, which may lead to an incorrect MD5 value "
88                        "in the statistic.csv file.";
89 }
90 }
91 }
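// Illustrative usage (mirrors the call in GetTensorStatistics below): the caller supplies an output
// buffer of at least md5_len + 1 bytes, which is filled with 32 lowercase hex characters.
//   char md5_hex[md5_len + 1] = {'\0'};
//   openssl_md5(const_cast<char *>(tensor->GetDataPtr()), md5_hex, tensor->GetByteSize());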
92 #endif
93
94 DebugServices::DebugServices() { tensor_loader_ = std::make_shared<TensorLoader>(); }
95
96 DebugServices::DebugServices(const DebugServices &other) {
97 wp_id_cache_ = other.wp_id_cache_;
98 net_name_ = other.net_name_;
99 dump_dir_ = other.dump_dir_;
100 is_sync_mode_ = other.is_sync_mode_;
101 tensor_loader_ = other.tensor_loader_;
102 watchpoint_table_ = other.watchpoint_table_;
103 }
104
105 DebugServices &DebugServices::operator=(const DebugServices &other) {
106 if (this != &other) {
107 tensor_loader_ = other.tensor_loader_;
108 watchpoint_table_ = other.watchpoint_table_;
109 }
110 return *this;
111 }
112
113 /*
114 * Feature group: Online debugger, Offline debugger.
115 * Target device group: Ascend, GPU.
116 * Runtime category: Old runtime, MindRT.
117  * Description: Creates a watchpoint_t object, sets the watchpoint's variables, and adds the watchpoint to the
118  * watchpoint_table.
119 */
120 void DebugServices::AddWatchpoint(
121 int id, int watch_condition, float parameter, const std::vector<std::tuple<std::string, bool>> &check_node_list,
122 const std::vector<parameter_t> ¶meter_list,
123 const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_device_list,
124 const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_graph_list) {
125 std::lock_guard<std::mutex> lg(lock_);
126
127 watchpoint_t watchpoint_item;
128 if (id < 0) {
129     MS_LOG(EXCEPTION) << "The watchpoint id should be an integer greater than or equal to 0, but got " << id;
130 }
131 watchpoint_item.id = static_cast<unsigned int>(id);
132 watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition);
133 watchpoint_item.condition.parameter = parameter;
134 watchpoint_item.check_node_list = check_node_list;
135 // For offline debugger check_node_device_list is not nullptr.
136 if (check_node_device_list != nullptr) {
137 watchpoint_item.check_node_device_list = *check_node_device_list;
138 }
139 // For offline debugger check_node_graph_list is not nullptr.
140 if (check_node_graph_list != nullptr) {
141 watchpoint_item.check_node_graph_list = *check_node_graph_list;
142 }
143 watchpoint_item.parameter_list = parameter_list;
144 watchpoint_table_[id] = watchpoint_item;
145 }
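// A minimal illustrative call (hypothetical values): the online debugger does not need the
// per-device or per-graph node lists, so nullptr is passed for the last two arguments.
//   debug_services->AddWatchpoint(1, /*watch_condition=*/6, /*parameter=*/0.0,
//                                 {{"Default/Conv2D-op1", true}}, parameter_list, nullptr, nullptr);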
146
147 void DebugServices::RemoveWatchpoint(unsigned int id) {
148 std::lock_guard<std::mutex> lg(lock_);
149 (void)watchpoint_table_.erase(id);
150 }
151
152 /*
153 * Feature group: Online debugger, Offline debugger.
154 * Target device group: Ascend, GPU.
155 * Runtime category: Old runtime, MindRT.
156  * Description: Returns a tensor summary unique pointer based on the given tensor_dtype, or nullptr if the type is
157  * not supported.
158 */
159 std::unique_ptr<ITensorSummary> GetSummaryPtr(const std::shared_ptr<TensorData> &tensor,
160 const void *const previous_tensor_ptr, uint64_t num_elements,
161 uint64_t prev_num_elements, int tensor_dtype) {
162 MS_EXCEPTION_IF_NULL(tensor);
163 switch (tensor_dtype) {
164 case DbgDataType::DT_UINT8: {
165 return std::make_unique<TensorSummary<uint8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
166 prev_num_elements);
167 }
168 case DbgDataType::DT_INT8: {
169 return std::make_unique<TensorSummary<int8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
170 prev_num_elements);
171 }
172 case DbgDataType::DT_UINT16: {
173 return std::make_unique<TensorSummary<uint16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
174 prev_num_elements);
175 }
176 case DbgDataType::DT_INT16: {
177 return std::make_unique<TensorSummary<int16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
178 prev_num_elements);
179 }
180 case DbgDataType::DT_UINT32: {
181 return std::make_unique<TensorSummary<uint32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
182 prev_num_elements);
183 }
184 case DbgDataType::DT_INT32:
185 case DbgDataType::DT_BASE_INT: {
186 return std::make_unique<TensorSummary<int32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
187 prev_num_elements);
188 }
189 case DbgDataType::DT_UINT64: {
190 return std::make_unique<TensorSummary<uint64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
191 prev_num_elements);
192 }
193 case DbgDataType::DT_INT64: {
194 return std::make_unique<TensorSummary<int64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
195 prev_num_elements);
196 }
197 case DbgDataType::DT_FLOAT16: {
198 return std::make_unique<TensorSummary<float16>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
199 prev_num_elements);
200 }
201 case DbgDataType::DT_BFLOAT16: {
202 return std::make_unique<TensorSummary<bfloat16>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
203 prev_num_elements);
204 }
205 case DbgDataType::DT_FLOAT32:
206 case DbgDataType::DT_BASE_FLOAT: {
207 return std::make_unique<TensorSummary<float>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
208 prev_num_elements);
209 }
210 case DbgDataType::DT_FLOAT64: {
211 return std::make_unique<TensorSummary<double>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
212 prev_num_elements);
213 }
214 case DbgDataType::DT_BOOL: {
215 return std::make_unique<TensorSummary<bool>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
216 prev_num_elements);
217 }
218 default:
219 MS_LOG(INFO) << "Unsupported tensor type";
220 // return a null pointer
221 return std::unique_ptr<TensorSummary<int32_t>>{};
222 }
223 }
224
225 /*
226 * Feature group: Online debugger, Offline debugger.
227 * Target device group: Ascend, GPU.
228 * Runtime category: Old runtime, MindRT.
229 * Description: Returns TensorStat for the given tensor based on the base_summary_ptr.
230 */
231 DebugServices::TensorStat DebugServices::GetTensorStatistics(const std::shared_ptr<TensorData> &tensor) {
232 if (tensor == nullptr) {
233 MS_LOG(WARNING) << "Tensor is nullptr, returning empty tensor statistics.";
234 TensorStat empty_tensor_stat_data;
235 return empty_tensor_stat_data;
236 }
237 std::unique_ptr<ITensorSummary> base_summary_ptr;
238 void *previous_tensor_ptr = nullptr;
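  // Statistics here are computed on the current tensor only: a null previous-tensor pointer and a
  // zero previous element count are passed, so no cross-iteration comparison is involved.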
239 base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, tensor->GetNumElements(), 0, tensor->GetType());
240 if (base_summary_ptr == nullptr) {
241 MS_LOG(WARNING) << "base_summary_ptr is nullptr, returning empty tensor statistics.";
242 TensorStat empty_tensor_stat_data;
243 return empty_tensor_stat_data;
244 }
245 std::string md5 = "";
246 MSLogTime msTime;
247 #ifndef OFFLINE_DBG_MODE
248 auto statistic_category = DumpJsonParser::GetInstance().statistic_category();
249 if (std::find(statistic_category.begin(), statistic_category.end(), "md5") != statistic_category.end()) {
250 msTime.Start();
251 char md5str[33];
252 auto ret = memset_s(md5str, sizeof(md5str), '\0', sizeof(md5str));
253 if (ret != EOK) {
254 MS_LOG(ERROR) << "Failed to call memset_s, skip record MD5.";
255 } else {
256 openssl_md5(const_cast<char *>(tensor->GetDataPtr()), md5str, tensor->GetByteSize());
257 md5 = std::string(md5str);
258 }
259 msTime.End();
260 MS_LOG(DEBUG) << "Calc md5 costs time : " << msTime.GetRunTimeUS() << " microseconds.";
261 }
262 #endif
263 msTime.Start();
264 base_summary_ptr->TensorStatistics(tensor->GetType());
265 msTime.End();
266 MS_LOG(DEBUG) << "Calc statistic costs time : " << msTime.GetRunTimeUS() << " microseconds.";
267 TensorStat tensor_stat_data(
268 tensor->GetByteSize(), tensor->GetType(), tensor->GetShape(), base_summary_ptr->is_bool(),
269 base_summary_ptr->max_value(), base_summary_ptr->min_value(), base_summary_ptr->avg_value(),
270 base_summary_ptr->count(), base_summary_ptr->neg_zero_count(), base_summary_ptr->pos_zero_count(),
271 base_summary_ptr->nan_count(), base_summary_ptr->neg_inf_count(), base_summary_ptr->pos_inf_count(),
272 base_summary_ptr->zero_count(), base_summary_ptr->l2_value(), md5);
273
274 return tensor_stat_data;
275 }
276
277 #ifdef OFFLINE_DBG_MODE
278 /*
279 * Feature group: Offline debugger.
280 * Target device group: Ascend, GPU.
281 * Runtime category: Old runtime, MindRT.
282  * Description: Returns previous_tensor_ptr if the graph history file is found and the current iteration is not the first
283 * run iteration for tensor's graph.
284 */
285 const void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed,
286 uint64_t *prev_num_elements, bool *history_not_found) {
287 MS_EXCEPTION_IF_NULL(tensor);
288 const void *previous_tensor_ptr = nullptr;
289 std::shared_ptr<TensorData> tensor_prev;
290 std::tuple<uint32_t, uint32_t> rank_and_graph = std::make_tuple(tensor->GetDeviceId(), tensor->GetRootGraphId());
291 if (graphs_run_history_.find(rank_and_graph) == graphs_run_history_.end()) {
292     *history_not_found = true;
293 MS_LOG(DEBUG) << "Graph run history is not available for graph: " << tensor->GetRootGraphId();
294 } else if (previous_iter_tensor_needed && GetPrevIteration(tensor) != UINT32_MAX) {
295 // when prev_tensor is not available, the prev iteration is set to UINT32_MAX
296 // read data in offline mode
297 NPYFilePool file_paths;
298 ProcessedNPYFiles processed_npy_files;
299 if (!is_sync_mode_) {
300 ConvertReadTensors(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
301 std::vector<unsigned int>{tensor->GetDeviceId()},
302 std::vector<unsigned int>{tensor->GetPrevIteration()},
303 std::vector<unsigned int>{tensor->GetRootGraphId()}, &file_paths);
304 processed_npy_files = ProcessNPYFilePool(file_paths);
305 }
306 std::vector<std::shared_ptr<TensorData>> result_list_prev;
307 ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
308 std::vector<unsigned int>{tensor->GetDeviceId()},
309 std::vector<unsigned int>{tensor->GetPrevIteration()},
310 std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
311 &processed_npy_files, &result_list_prev, false);
312 tensor_prev = result_list_prev[0];
313 if (tensor_prev->GetByteSize() == 0) {
314 tensor_prev.reset();
315 } else {
316 previous_tensor_ptr = tensor_prev->GetDataPtr();
317 *prev_num_elements = tensor_prev->GetNumElements();
318 }
319 }
320 return previous_tensor_ptr;
321 }
322 #endif
323
324 /*
325 * Feature group: Offline debugger, Online debugger.
326 * Target device group: Ascend, GPU.
327 * Runtime category: Old runtime, MindRT.
328 * Description: Goes through all the watchpoints in the watchpoint table. If the current tensor is in the list of
329  * check_nodes, that watchpoint is added to the vector of watchpoints_to_check (the vector of watchpoints that should
330  * be checked for the current tensor).
331 */
332 void DebugServices::AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck,
333 const std::shared_ptr<TensorData> &tensor, bool *previous_iter_tensor_needed,
334 std::string *const qualified_tensor_name,
335 std::vector<watchpoint_t> *const watchpoints_to_check) {
336 if (tensor == nullptr) {
337 MS_LOG(DEBUG) << "tensor is nullptr.";
338 return;
339 }
340 const auto tensor_name = tensor->GetName();
341 const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
342 const auto tensor_device_id = tensor->GetDeviceId();
343 const auto tensor_root_graph_id = tensor->GetRootGraphId();
344 for (auto w_table_item : watchpoint_table_) {
345 auto wp = std::get<1>(w_table_item);
346 // check ONLY init conditions on initial suspended state.
347 // skip other conditions on initial suspended state
348 if (init_dbg_suspend && (wp.condition.type != INIT)) {
349 continue;
350 }
351 // skip init condition if not init suspend
352 if ((wp.condition.type == INIT) && !init_dbg_suspend) {
353 continue;
354 }
355 // check change conditions only on step end.
356 if (wp.change_condition() && !step_end) {
357 continue;
358 }
359 // if recheck, ignore the cache results and reanalyze everything.
360 // if not a recheck, check only unanalyzed tensors
361 if (!recheck) {
362 std::lock_guard<std::mutex> lg(wp_lock_);
363 bool wp_cache_hit = wp_id_cache_[tensor_name].count(wp.id);
364 if (wp_cache_hit) {
365 continue;
366 }
367 }
368 std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot, tensor_device_id, tensor_root_graph_id);
369 if (!found.empty()) {
370 *qualified_tensor_name = found;
371 watchpoints_to_check->push_back(w_table_item.second);
372 #ifdef OFFLINE_DBG_MODE
373 if (wp.change_condition()) {
374 *previous_iter_tensor_needed = true;
375 }
376 #endif
377 }
378 }
379 }
380
381 void DebugServices::AddAnalyzedTensorToCache(const bool recheck, const unsigned int id,
382 const std::string &tensor_name) {
383 // add analyzed tensor to cache
384 if (!recheck) {
385 std::lock_guard<std::mutex> lg(wp_lock_);
386 (void)wp_id_cache_[tensor_name].insert(id);
387 }
388 }
389
390 void DebugServices::SetCheckWatchpointsResult(const int chunk_id, ChunkData *chunk_data,
391 std::vector<unsigned int> *const device_id,
392 std::vector<unsigned int> *const root_graph_id, const int exec_order,
393 const std::string time_stamp, const std::string &qualified_tensor_name,
394 const std::string &tensor_slot, const watchpoint_t &wp,
395 const unsigned int device_id_val, const unsigned int root_graph_id_val,
396 const std::vector<parameter_t> ¶meter_list,
397 const int32_t error_code) const {
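  // ChunkData holds one vector per chunk for each field, indexed by chunk_id; each worker thread
  // writes only to its own chunk's vectors, which is presumably why no locking is needed here.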
398 (void)(chunk_data->chunk_exec_orders)[chunk_id].emplace_back(exec_order);
399 (void)(chunk_data->chunk_names)[chunk_id].emplace_back(qualified_tensor_name);
400 (void)(chunk_data->chunk_slots)[chunk_id].emplace_back(tensor_slot);
401 (void)(chunk_data->chunk_conditions)[chunk_id].emplace_back(wp.condition.type);
402 (void)(chunk_data->chunk_watchpoint_id)[chunk_id].emplace_back(wp.id);
403 if (device_id != nullptr) {
404 (void)(chunk_data->chunk_device_id)[chunk_id].emplace_back(device_id_val);
405 }
406 if (root_graph_id != nullptr) {
407 (void)(chunk_data->chunk_root_graph_id)[chunk_id].emplace_back(root_graph_id_val);
408 }
409 (void)(chunk_data->chunk_parameters)[chunk_id].emplace_back(parameter_list);
410 (void)(chunk_data->chunk_error_codes)[chunk_id].emplace_back(error_code);
411 (void)(chunk_data->chunk_time_stamp)[chunk_id].emplace_back(time_stamp);
412 }
413
414 #ifdef OFFLINE_DBG_MODE
415 /*
416 * Feature group: Offline debugger.
417 * Target device group: Ascend, GPU.
418 * Runtime category: Old runtime, MindRT.
419  * Description: Sets the OUT_OF_MEMORY error_code (for the memory limit feature) or the NO_VALUE error_code (for the
420  * new python API feature) and records the check-watchpoint results.
421 */
422 void DebugServices::CheckOutofMemoryandNoValue(const bool no_mem_to_read, const bool error_on_no_value,
423 const std::vector<watchpoint_t> watchpoints_to_check, int chunk_id,
424 ChunkData *chunk_data, std::vector<unsigned int> *const device_id,
425 std::vector<unsigned int> *const root_graph_id, const int exec_order,
426 const std::string time_stamp, const std::string &qualified_tensor_name,
427 const std::string &tensor_slot, const unsigned int device_id_val,
428 const unsigned int root_graph_id_val,
429 const std::vector<parameter_t> ¶meter_list) const {
430 bool set_is_needed = no_mem_to_read || error_on_no_value;
431 int32_t error_code_to_set = 0;
432 if (no_mem_to_read) {
433     // bit 3 denotes that the tensor failed to load because it is oversized and there is not enough memory to fit it in
434 error_code_to_set = ITensorSummary::OUT_OF_MEMORY;
435 } else if (error_on_no_value) {
436 error_code_to_set = ITensorSummary::NO_VALUE;
437 }
438 if (set_is_needed) {
439 for (auto &wp : watchpoints_to_check) {
440 SetCheckWatchpointsResult(chunk_id, chunk_data, device_id, root_graph_id, exec_order, time_stamp,
441 qualified_tensor_name, tensor_slot, wp, device_id_val, root_graph_id_val,
442 parameter_list, error_code_to_set);
443 }
444 }
445 }
446
447 /*
448 * Feature group: Offline debugger.
449 * Target device group: Ascend, GPU.
450 * Runtime category: Old runtime, MindRT.
451  * Description: After the watchpoint check finishes, set the tensor to not-in-use status (for the memory control
452  * feature) by pushing it to the eviction candidate queue, so it can be evicted from memory at any time if the memory
453  * is required for checking other nodes. If previous_tensor exists, change both statuses as a pair.
454 */
455 void DebugServices::SetTensorToNotInUse(const std::shared_ptr<TensorData> &tensor, const void *previous_tensor_ptr) {
456 // set the tensor into not-in-use status in tensor_loader.
457 auto tensor_name = tensor->GetName();
458 std::string key_name_in_cache = tensor_name + ":" + std::to_string(tensor->GetDeviceId()) + ":" +
459 std::to_string(tensor->GetRootGraphId()) + ":" +
460 std::to_string(tensor->GetIsOutput()) + ":" + std::to_string(tensor->GetSlot());
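  // The key has the form "<name>:<device_id>:<root_graph_id>:<is_output>:<slot>" and is assumed to
  // match the key under which the tensor was cached by the tensor loader.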
461 AppendToCacheEvictQueue(key_name_in_cache);
462 if (previous_tensor_ptr != nullptr) {
463 AppendToCacheEvictQueue(key_name_in_cache + ":prev");
464 }
465 }
466 #endif
467
468 #ifndef OFFLINE_DBG_MODE
469 /*
470 * Feature group: Online debugger.
471 * Target device group: Ascend, GPU.
472 * Runtime category: Old runtime, MindRT.
473 * Description: Compares the current root graph id with the given graph id and returns false if they are not equal
474 * for GPU mindRT and Ascend. Otherwise, it returns true. The objectives of this function are: 1) Check if tensor's
475 * root_graph_id is different from current_root_graph_id and skip checkwatchpoint for the tensor if these values are
476 * different. 2) Set prev_tensor_ptr to nullptr if current_root_graph_id is different from prev_root_graph_id. 3) Skip
477 * reading tensor if tensor's root_graph_id is different from current_root_graph_id.
478 */
479 bool DebugServices::CompareCurrentRootGraph(uint32_t id) const {
480 auto debugger = Debugger::GetInstance();
481 MS_EXCEPTION_IF_NULL(debugger);
482 auto ms_context = MsContext::GetInstance();
483 MS_EXCEPTION_IF_NULL(ms_context);
484 std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
485 auto cur_root_graph_id = debugger->GetCurrentRootGraphId();
486 if ((device_target == kGPUDevice && MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) ||
487 device_target == kAscendDevice) {
488 if (cur_root_graph_id != id) {
489 return false;
490 }
491 }
492 return true;
493 }
494
495 /*
496 * Feature group: Online debugger.
497 * Target device group: Ascend, GPU.
498 * Runtime category: Old runtime, MindRT.
499 * Description: Returns the previous tensor pointer if the current root graph id is equal to previous root graph id and
500 * prev_tensor_data is not nullptr.
501 */
502 const void *DebugServices::PreparePrevTensor(uint64_t *prev_num_elements, const std::string &tensor_name) {
503 std::shared_ptr<TensorData> prev_tensor_data;
504 auto debugger = Debugger::GetInstance();
505 MS_EXCEPTION_IF_NULL(debugger);
506 if (!CompareCurrentRootGraph(debugger->GetPrevRootGraphId())) {
507 // not supporting watchpoints that need prev tensor for multi root graph networks.
508 MS_LOG(DEBUG) << "Previous root graph is different from current root graph, setting prev_tensor to nullptr.";
509 prev_tensor_data = nullptr;
510 } else {
511 prev_tensor_data = tensor_loader_->GetPrevTensor(tensor_name);
512 }
513 if (prev_tensor_data) {
514 *prev_num_elements = prev_tensor_data->GetNumElements();
515 return prev_tensor_data->GetDataPtr();
516 }
517 return nullptr;
518 }
519 #endif
520
521 void DebugServices::CheckHistoryErrorCode(int *error_code, bool history_not_found) const {
522 // check history error_code only for offline debugger
523 if (history_not_found) {
524 *error_code = ITensorSummary::HISTORY_NOT_FOUND; // error code for history not found
525 }
526 }
527
528 /*
529 * Feature group: Offline debugger, Online debugger.
530 * Target device group: Ascend, GPU.
531 * Runtime category: Old runtime, MindRT.
532 * Description: For all the tensors in the given chunk, reads the tensors, checks all the watchpoints and sets the
533  * watchpoint hit result. The check-watchpoint process might be affected by the memory limit, by whether the tensor was
534  * read successfully, and by whether we have a multi-root-graph scenario. All of the aforementioned checks are done in this function.
535 */
536 void DebugServices::CheckWatchpointsForTensor(ChunkData *chunk_data, ProcessedNPYFiles *const processed_npy_files,
537 std::vector<std::shared_ptr<TensorData>> *const tensor_list, int begin,
538 int end, int chunk_id, const bool init_dbg_suspend, const bool step_end,
539 const bool recheck, std::vector<unsigned int> *const device_id,
540 std::vector<unsigned int> *const root_graph_id, bool error_on_no_value) {
541 int list_size = tensor_list->size();
542 if (end > list_size) {
543 end = list_size;
544 }
545 for (int i = begin; i < end; i++) {
546 auto &tensor = (*tensor_list)[i];
547 const auto tensor_name = tensor->GetName();
548 const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
549 const auto tensor_slot = std::to_string(tensor->GetSlot());
550 std::vector<watchpoint_t> watchpoints_to_check;
551 std::string qualified_tensor_name;
552 bool previous_iter_tensor_needed = false;
553 AddWatchPointsToCheck(init_dbg_suspend, step_end, recheck, tensor, &previous_iter_tensor_needed,
554 &qualified_tensor_name, &watchpoints_to_check);
555 // no wp set on current tensor
556 if (watchpoints_to_check.empty()) {
557 continue;
558 }
559 #ifdef OFFLINE_DBG_MODE
560 // read data in offline mode
561 bool no_mem_to_read = false;
562 std::vector<std::shared_ptr<TensorData>> result_list;
563 ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
564 std::vector<unsigned int>{tensor->GetDeviceId()},
565 std::vector<unsigned int>{tensor->GetIteration()},
566 std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
567 processed_npy_files, &result_list, false, &no_mem_to_read);
568 tensor = result_list[0];
569 if (tensor->GetByteSize() == 0) {
570 CheckOutofMemoryandNoValue(no_mem_to_read, error_on_no_value, watchpoints_to_check, chunk_id, chunk_data,
571 device_id, root_graph_id, tensor->GetExecutionOrder(), tensor->GetTimeStamp(),
572 qualified_tensor_name, tensor_slot, tensor->GetDeviceId(), tensor->GetRootGraphId(),
573 std::vector<parameter_t>());
574 tensor.reset();
575 continue;
576 }
577 #endif
578 // no elements to analyze
579 if (tensor->GetByteSize() == 0) {
580 continue;
581 }
582 (chunk_data->chunk_tensor_byte_size)[chunk_id] += tensor->GetByteSize();
583 int tensor_dtype = tensor->GetType();
584 uint64_t num_elements = tensor->GetNumElements();
585 uint64_t prev_num_elements = 0;
586 const void *previous_tensor_ptr = nullptr;
587 #ifdef OFFLINE_DBG_MODE
588     bool history_not_found = false;
589 previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed, &prev_num_elements, &history_not_found);
590 #else
591 if (!CompareCurrentRootGraph(tensor->GetRootGraphId())) {
592 MS_LOG(DEBUG)
593 << "Current root_graph_id is different from tensor's root_graph_id, skipping checkwatchpoints for tensor: "
594 << tensor->GetName();
595 continue;
596 }
597 previous_tensor_ptr = PreparePrevTensor(&prev_num_elements, tensor_name);
598 #endif
599 std::unique_ptr<ITensorSummary> base_summary_ptr;
600 if (!(watchpoints_to_check.size() == 1 && watchpoints_to_check[0].condition.type == IS_OVERFLOW)) {
601 base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, num_elements, prev_num_elements, tensor_dtype);
602 if (base_summary_ptr != nullptr) {
603 base_summary_ptr->SummarizeTensor(watchpoints_to_check);
604 }
605 }
606 for (auto &wp : watchpoints_to_check) {
607 bool is_hit = false;
608 int error_code = 0;
609 std::vector<parameter_t> parameter_list = {};
610 if (wp.condition.type == IS_OVERFLOW) {
611 is_hit =
612 CheckOpOverflow(tensor_name_no_slot, tensor->GetDeviceId(), tensor->GetRootGraphId(), tensor->GetIteration());
613 } else if (base_summary_ptr != nullptr) {
614 auto item = base_summary_ptr->IsWatchpointHit(wp);
615 is_hit = std::get<ITensorSummary::eHitPos>(item);
616 error_code = std::get<ITensorSummary::eErrorCodePos>(item);
617 #ifdef OFFLINE_DBG_MODE
618 CheckHistoryErrorCode(&error_code, history_not_found);
619 #endif
620 parameter_list = std::get<ITensorSummary::eParamListPos>(item);
621 }
622 AddAnalyzedTensorToCache(recheck, wp.id, tensor_name);
623 if (is_hit || error_code != 0) {
624 SetCheckWatchpointsResult(chunk_id, chunk_data, device_id, root_graph_id, tensor->GetExecutionOrder(),
625 tensor->GetTimeStamp(), qualified_tensor_name, tensor_slot, wp, tensor->GetDeviceId(),
626 tensor->GetRootGraphId(), parameter_list, error_code);
627 }
628 }
629 #ifdef OFFLINE_DBG_MODE
630 SetTensorToNotInUse(tensor, previous_tensor_ptr);
631 // in offline mode remove the need for the data
632 tensor.reset();
633 #endif
634 (void)tensor_processed_count_.fetch_add(1, std::memory_order_relaxed);
635 }
636 }
637
638 /*
639 * Feature group: Offline debugger, Online debugger.
640 * Target device group: Ascend, GPU.
641 * Runtime category: Old runtime, MindRT.
642 * Description: This function checks the watchpoints for the given tensor list by dividing the tensor list into chunks.
643  * Each chunk is handled by a separate thread and then the result of the watchpoint check for each thread is gathered and
644 * sorted. In the end, the time for checking the watchpoint in the current step is reported.
645 */
646 void DebugServices::CheckWatchpoints(std::vector<std::string> *const name, std::vector<std::string> *const slot,
647 std::vector<int> *const condition, std::vector<unsigned int> *const watchpoint_id,
648 std::vector<std::vector<parameter_t>> *const parameters,
649 std::vector<int32_t> *const error_codes,
650 ProcessedNPYFiles *const processed_npy_files,
651 std::vector<std::shared_ptr<TensorData>> *const tensor_list,
652 const bool init_dbg_suspend, const bool step_end, const bool recheck,
653 std::vector<unsigned int> *const device_id,
654 std::vector<unsigned int> *const root_graph_id, bool error_on_no_value) {
655 std::lock_guard<std::mutex> lg(lock_);
656 auto t1 = std::chrono::high_resolution_clock::now();
657 if (watchpoint_table_.empty()) {
658 return;
659 }
660 // vector to store execution order of tensors hit
661 std::vector<int> exec_order;
662 std::vector<std::string> time_stamps;
663 size_t tensor_list_size = tensor_list->size();
664 uint64_t tensor_list_byte_size = 0;
665 MS_LOG(INFO) << "tensor list size: " << tensor_list_size;
666 if (tensor_list_size == 0) {
667 return;
668 }
669 if (IS_OUTPUT_ON(mindspore::kInfo)) {
670 wp_progress_enabled_ = true;
671 wp_progress_thread_ =
672 std::make_unique<std::thread>([this, tensor_list_size]() { CheckWatchpointProgress(tensor_list_size); });
673 }
674 const size_t thread_num_with_mem = 16;
675 const size_t thread_num_without_mem = 32;
676 // default value for number of threads
677 const size_t default_thread_num =
678 tensor_loader_->EnableMemoryControl() ? thread_num_with_mem : thread_num_without_mem;
679 size_t max_thread_num = default_thread_num;
680 if (max_thread_num > tensor_list_size) {
681 max_thread_num = tensor_list_size;
682 }
683 MS_LOG(INFO) << "Number of threads used for checkwatchpoint: " << max_thread_num;
684 size_t chunk_size = tensor_list_size / max_thread_num;
685 size_t remainder = tensor_list_size % max_thread_num;
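  // For example, 10 tensors across 4 threads gives chunk_size = 2 and remainder = 2, so the chunks
  // receive 3, 3, 2 and 2 tensors (the remainder is spread over the first chunks in the loop below).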
686 ChunkData chunk_data;
687 chunk_data.chunk_exec_orders.resize(max_thread_num);
688 chunk_data.chunk_names.resize(max_thread_num);
689 chunk_data.chunk_slots.resize(max_thread_num);
690 chunk_data.chunk_conditions.resize(max_thread_num);
691 chunk_data.chunk_watchpoint_id.resize(max_thread_num);
692 chunk_data.chunk_parameters.resize(max_thread_num);
693 chunk_data.chunk_error_codes.resize(max_thread_num);
694 chunk_data.chunk_device_id.resize(max_thread_num);
695 chunk_data.chunk_root_graph_id.resize(max_thread_num);
696 chunk_data.chunk_tensor_byte_size.resize(max_thread_num);
697 std::fill(chunk_data.chunk_tensor_byte_size.begin(), chunk_data.chunk_tensor_byte_size.end(), 0);
698 chunk_data.chunk_time_stamp.resize(max_thread_num);
699
700 std::vector<std::future<void>> tensor_future_vec;
701 size_t begin = 0;
702 size_t end = begin;
703 for (size_t i = 0; i < max_thread_num; i++) {
704 end += chunk_size;
705 if (remainder > 0) {
706 end++;
707 remainder--;
708 }
709 (void)tensor_future_vec.emplace_back(std::async(
710 std::launch::async, &DebugServices::CheckWatchpointsForTensor, this, &chunk_data, processed_npy_files,
711 tensor_list, begin, end, i, init_dbg_suspend, step_end, recheck, device_id, root_graph_id, error_on_no_value));
712 begin = end;
713 }
714
715 SortWatchpointsInfo(&tensor_future_vec, &exec_order, &time_stamps, &tensor_list_byte_size, name, slot,
717 condition, watchpoint_id, parameters, error_codes, &chunk_data, device_id, root_graph_id);
718
719 auto t2 = std::chrono::high_resolution_clock::now();
720 std::chrono::duration<double, std::milli> ms_double = t2 - t1;
721 MS_LOG(INFO) << "tensor_list byte size is " << tensor_list_byte_size / pow(10.0, 6.0) << " MB";
722 MS_LOG(INFO) << "CheckWatchpoints Took: " << std::fixed << std::setprecision(precision)
723 << (ms_double.count()) / ms_to_s << "s";
724 if (IS_OUTPUT_ON(mindspore::kInfo) && wp_progress_thread_ && wp_progress_thread_->joinable()) {
725 wp_progress_enabled_ = false;
726 wp_progress_thread_->join();
727 MS_LOG(INFO) << "Join wp_progress_thread_.";
728 }
729 }
730
731 void DebugServices::CheckWatchpointProgress(size_t tensor_list_size) {
732 while (wp_progress_enabled_ && (tensor_processed_count_ != tensor_list_size)) {
733 MS_LOG(INFO) << "CheckWatchpoint progress: " << tensor_processed_count_ << " tensor processed out of "
734 << tensor_list_size;
735 std::this_thread::sleep_for(std::chrono::milliseconds(wp_progress_period));
736 }
737 }
738
739 /*
740 * Feature group: Offline debugger, Online debugger.
741 * Target device group: Ascend, GPU.
742 * Runtime category: Old runtime, MindRT.
743  * Description: Sorts the watchpoint hit results for the online and offline debugger. The sorting is based on the
744  * execution order for the online debugger and on the timestamp for the offline debugger.
745 */
746 void DebugServices::SortWatchpointsInfo(std::vector<std::future<void>> *const tensor_future_vec,
747 std::vector<int> *const exec_order, std::vector<std::string> *const time_stamps,
748 uint64_t *const tensor_list_byte_size, std::vector<std::string> *const name,
749 std::vector<std::string> *const slot, std::vector<int> *const condition,
750 std::vector<unsigned int> *const watchpoint_id,
751 std::vector<std::vector<parameter_t>> *const parameters,
752 std::vector<int32_t> *const error_codes, ChunkData *chunk_data,
753 std::vector<unsigned int> *const device_id,
754 std::vector<unsigned int> *const root_graph_id) const {
755 for (unsigned int i = 0; i < (*tensor_future_vec).size(); i++) {
756 (*tensor_future_vec)[i].wait();
757 (*tensor_future_vec)[i].get();
758 for (unsigned int j = 0; j < (chunk_data->chunk_exec_orders)[i].size(); j++) {
759 #ifndef OFFLINE_DBG_MODE
760       // if the execution order is repeated, insert the new one before the others with the same execution order.
761 std::vector<int>::iterator iter =
762 std::lower_bound(exec_order->begin(), exec_order->end(), (chunk_data->chunk_exec_orders)[i][j]);
763 int position = iter - exec_order->begin();
764 (void)exec_order->emplace(iter, (chunk_data->chunk_exec_orders)[i][j]);
765 #endif
766 #ifdef OFFLINE_DBG_MODE
767 std::vector<std::string>::iterator iter =
768 std::lower_bound(time_stamps->begin(), time_stamps->end(), (chunk_data->chunk_time_stamp)[i][j]);
769 int position = iter - time_stamps->begin();
770 (void)time_stamps->emplace(iter, (chunk_data->chunk_time_stamp)[i][j]);
771 #endif
772 (void)name->emplace(name->begin() + position, (chunk_data->chunk_names)[i][j]);
773 (void)slot->emplace(slot->begin() + position, (chunk_data->chunk_slots)[i][j]);
774 (void)condition->emplace(condition->begin() + position, (chunk_data->chunk_conditions)[i][j]);
775 (void)watchpoint_id->emplace(watchpoint_id->begin() + position, (chunk_data->chunk_watchpoint_id)[i][j]);
776 if (device_id != nullptr) {
777 (void)device_id->emplace(device_id->begin() + position, (chunk_data->chunk_device_id)[i][j]);
778 }
779 if (root_graph_id != nullptr) {
780 (void)root_graph_id->emplace(root_graph_id->begin() + position, (chunk_data->chunk_root_graph_id)[i][j]);
781 }
782 (void)parameters->emplace(parameters->begin() + position, (chunk_data->chunk_parameters)[i][j]);
783 (void)error_codes->emplace(error_codes->begin() + position, (chunk_data->chunk_error_codes)[i][j]);
784 }
785 // free the memory for used vectors
786 std::vector<int>().swap((chunk_data->chunk_exec_orders)[i]);
787 std::vector<std::string>().swap((chunk_data->chunk_time_stamp)[i]);
788 std::vector<std::string>().swap((chunk_data->chunk_names)[i]);
789 std::vector<std::string>().swap((chunk_data->chunk_slots)[i]);
790 std::vector<int>().swap((chunk_data->chunk_conditions)[i]);
791 std::vector<unsigned int>().swap((chunk_data->chunk_watchpoint_id)[i]);
792 std::vector<std::vector<parameter_t>>().swap((chunk_data->chunk_parameters)[i]);
793 std::vector<int32_t>().swap((chunk_data->chunk_error_codes)[i]);
794 std::vector<unsigned int>().swap((chunk_data->chunk_device_id)[i]);
795 std::vector<unsigned int>().swap((chunk_data->chunk_root_graph_id)[i]);
796 if ((*tensor_list_byte_size) > UINT64_MAX - (chunk_data->chunk_tensor_byte_size)[i]) {
797 MS_LOG(WARNING) << (*tensor_list_byte_size) << " + " << (chunk_data->chunk_tensor_byte_size)[i]
798 << " would lead to integer overflow!";
799 (*tensor_list_byte_size) = UINT64_MAX;
800 } else {
801 (*tensor_list_byte_size) += (chunk_data->chunk_tensor_byte_size)[i];
802 }
803 }
804 }
805
806 #ifdef OFFLINE_DBG_MODE
807 /*
808 * Feature group: Offline debugger.
809 * Target device group: Ascend, GPU.
810 * Runtime category: Old runtime, MindRT.
811  * Description: Read tensor info from the given file. If the memory control feature is enabled, it checks whether the
812  * tensor can fit in memory before reading. Reading fails in two situations: 1) the tensor size is greater
813  * than the total preset memory limit; 2) evicting all not-in-use tensors from tensor_list_map_ cannot make enough room
814 * for the tensor.
815 */
816 void DebugServices::ReadTensorFromNpy(const std::string &tensor_name, const std::string &file_name,
817 std::string *const tensor_type, std::size_t *const size,
818 std::vector<int64_t> *const shape, char **const data_buffer, bool *no_mem_to_read,
819 bool is_base_request) {
820 std::ifstream infile;
821 std::string file_path = file_name;
822 MS_LOG(INFO) << "Reading in file: " << file_path;
823 infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
824 if (!infile.is_open()) {
825 MS_LOG(ERROR) << "Failed to open file (In ReadTensorFromNpy) " << file_path << " Errno:" << errno;
826 const int kMaxFilenameLength = 128;
827 char err_info[kMaxFilenameLength];
828 auto ret = strerror_r(errno, err_info, sizeof(err_info));
829 if (ret != kStrErrorNone) {
830 MS_LOG(ERROR) << " ErrInfo:" << ret;
831 }
832 return;
833 }
834 const int substr_len = 2;
835 const int header_len_offset = 8;
836 const int header_offset = 9;
837 const int header_len_buffer_size = 2;
838 const int type_offset = 10;
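  // These offsets follow the npy v1.0 layout: a 6-byte magic string "\x93NUMPY", two version bytes,
  // a 2-byte little-endian header length at offset 8, and then a Python-dict header such as
  // "{'descr': '<f4', 'fortran_order': False, 'shape': (2, 3), }" preceding the raw tensor data.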
839 // get header length
840 (void)infile.seekg(0, std::ios::beg);
841 auto header_len_buffer = std::make_unique<std::vector<char>>(header_len_offset + header_len_buffer_size);
842 if (!infile.read(header_len_buffer->data(), header_len_offset + header_len_buffer_size)) {
843 MS_LOG(ERROR) << "Failed to parse header length from " << file_path;
844 return;
845 }
846 uint16_t header_len = *reinterpret_cast<uint16_t *>(header_len_buffer->data() + header_len_offset);
847 header_len_buffer.reset();
848 // read in header
849 (void)infile.seekg(0, std::ios::beg);
850 auto header_buffer = std::make_unique<std::vector<char>>(header_offset + header_len);
851 if (!infile.read(header_buffer->data(), header_offset + header_len)) {
852 MS_LOG(ERROR) << "Failed to read header from " << file_path;
853 return;
854 }
855 std::string header(header_buffer->data() + header_offset, header_len);
856 header_buffer.reset();
857 std::size_t type_i = header.find("descr") + type_offset;
858 if (header.length() < type_i + substr_len) {
859 MS_LOG(ERROR) << "Cannot get tensor_type, header length is " << header.length();
860 return;
861 }
862 *tensor_type = header.substr(type_i, substr_len);
863 std::size_t shape_i_open = header.find("(");
864 std::size_t shape_i_close = header.find(")");
865 std::string shape_str = header.substr(shape_i_open + 1, shape_i_close - shape_i_open - 1);
866 std::string intermediate;
867 std::stringstream check_shape(shape_str);
868 MS_LOG(INFO) << "Shape of " << file_name << " is: [" << shape_str << "]";
869 while (getline(check_shape, intermediate, ',')) {
870 int64_t shape_d = 0;
871 if (!CheckStoi(&shape_d, intermediate)) {
872 MS_LOG(INFO) << "Failed to get the shape from file: " << file_name << ", error in convert the string "
873 << intermediate << " into an integer.";
874 return;
875 }
876 shape->push_back(shape_d);
877 }
878 std::size_t word_size = 0;
879 if (!CheckStoul(&word_size, std::string(1, (*tensor_type)[1]))) {
880 MS_LOG(INFO) << "Failed to get the word_size from file: " << file_name << ", error in convert the string "
881 << (*tensor_type)[1] << " into an integer.";
882 return;
883 }
884 std::size_t data_len = std::accumulate(shape->begin(), shape->end(), 1, std::multiplies<uint64_t>());
885 std::size_t data_size = data_len * word_size;
886 *size = data_size;
887 if (data_size == 0 || is_base_request) {
888 // for base request, reading the header is enough.
889 return;
890 }
891 // Check memory available before loading tensor into host.
892 bool has_enough_memory = true;
893 if (tensor_loader_->EnableMemoryControl()) {
894 has_enough_memory = tensor_loader_->CheckMemoryAvailable(tensor_name, data_size);
895 }
896 if (!has_enough_memory) {
897     MS_LOG(ERROR) << "Not enough memory available for loading " << tensor_name << " into host memory.";
898 *no_mem_to_read = true;
899 } else {
900 (void)infile.seekg(header_len + type_offset);
901 *data_buffer = new char[data_size];
902 if ((*data_buffer) == nullptr || !infile.read(*data_buffer, data_size)) {
903 MS_LOG(ERROR) << "Unable to get tensor data from npy";
904 }
905 }
906 }
907
908 /*
909 * Feature group: Offline debugger.
910 * Target device group: Ascend.
911 * Runtime category: Old runtime, MindRT.
912  * Description: This function converts the files in each directory from device format to host format and appends the
913  * converted npy file names to NPYFilePool. It's for Ascend async dump only.
914 */
915 void DebugServices::ConvertToHostFormat(const DirMap &dir_to_files_map, NPYFilePool *const result_list) const {
916 for (auto const &d : dir_to_files_map) {
917 std::vector<std::string> files_to_convert_in_dir;
918 std::vector<std::string> files_after_convert_in_dir;
919 std::string dump_key = d.first;
920 for (auto const &item : d.second) {
921 std::string file_name = std::get<0>(item);
922 std::string file_name_without_scope = std::get<1>(item);
923
924 // skip the file that was converted to npy already.
925 if (std::all_of(result_list->begin(), result_list->end(), [&file_name_without_scope](std::string file_found) {
926 return file_found.find(file_name_without_scope) == std::string::npos;
927 })) {
928 // Full path for conversion.
929 (void)files_to_convert_in_dir.emplace_back(dump_key + "/" + file_name);
930 (void)files_after_convert_in_dir.emplace_back(file_name_without_scope);
931 }
932 }
933 MS_LOG(INFO) << "Number of files to convert: " << files_to_convert_in_dir.size();
934 if (!files_to_convert_in_dir.empty()) {
935 // Look for the installation path to the convert_async package. If not found, throw exception and terminate the
936 // later task.
937 auto t1 = std::chrono::high_resolution_clock::now();
938 {
939 pybind11::gil_scoped_acquire acquire;
940 try {
941 auto pkg = pybind11::module::import("mindspore.offline_debug.convert_async");
942 auto convert_obj = pkg.attr("AsyncDumpConverter")(pybind11::cast(files_to_convert_in_dir), dump_key);
943 (void)convert_obj.attr("convert_files")();
944 } catch (pybind11::error_already_set &e) {
945 MS_LOG(EXCEPTION) << "Failed to convert async dump data: " << e.what();
946 }
947 }
948 auto t2 = std::chrono::high_resolution_clock::now();
949 std::chrono::duration<double, std::milli> ms_double = t2 - t1;
950 MS_LOG(INFO) << "convert files Took: " << std::fixed << std::setprecision(precision)
951 << (ms_double.count()) / ms_to_s << "s";
952 ProcessConvertToHostFormat(files_after_convert_in_dir, dump_key, result_list);
953 }
954 }
955 }
956
957 /*
958 * Feature group: Offline debugger.
959 * Target device group: Ascend.
960 * Runtime category: Old runtime, MindRT.
961  * Description: This function iterates through the dump directory (dump_key), searches for all the converted npy files,
962  * and appends them to NPYFilePool. It's for Ascend async dump only.
963 */
964 void DebugServices::ProcessConvertToHostFormat(const std::vector<std::string> &files_after_convert_in_dir,
965 const std::string &dump_key, NPYFilePool *const result_list) const {
966 std::string real_dump_iter_dir = RealPath(dump_key);
967 DIR *d_handle = opendir(real_dump_iter_dir.c_str());
968 if (d_handle == nullptr) {
969 MS_LOG(INFO) << "Directory " << real_dump_iter_dir << " does not exist in ConvertToHostFormat.";
970 return;
971 }
972 struct dirent *dir = nullptr;
973 while ((dir = readdir(d_handle)) != nullptr) {
974 std::string name = real_dump_iter_dir + std::string("/") + std::string(dir->d_name);
975 if (!IsRegFile(name)) {
976 continue;
977 }
978 std::string candidate = dir->d_name;
979 for (const std::string &file_to_find : files_after_convert_in_dir) {
980 if (candidate.find(file_to_find + ".") != std::string::npos && candidate.rfind(kNpyExt) != std::string::npos) {
981 // we found a converted file for this op
982 std::string found_file = dump_key + "/" + candidate;
983 (void)result_list->insert(found_file);
984 }
985 }
986 }
987 (void)closedir(d_handle);
988 }
989
990 /*
991 * Feature group: Offline debugger.
992 * Target device group: Ascend, GPU.
993 * Runtime category: Old runtime, MindRT.
994  * Description: Node names are prefixed with their scope, separated by slashes "/", while the npy files in the tensor
995  * dump path do not include the scope in their names. The objective of this function is to remove the scope from the
996  * node name to match the file.
997 */
998 std::string GetNodeNameWithoutScope(const std::string &dump_style_name) {
999 if (dump_style_name.empty()) {
1000 return "";
1001 }
1002 std::size_t last_scope_marker;
1003 std::string delim = "/";
1004 last_scope_marker = dump_style_name.rfind(delim);
1005 if (last_scope_marker == std::string::npos) {
1006 return dump_style_name;
1007 }
1008 return dump_style_name.substr(last_scope_marker + delim.size());
1009 }
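// Example (hypothetical name): GetNodeNameWithoutScope("Default/network/conv1/Conv2D-op1") returns "Conv2D-op1".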
1010
1011 /*
1012 * Feature group: Offline debugger.
1013 * Target device group: Ascend.
1014 * Runtime category: Old runtime, MindRT.
1015  * Description: This function searches for and prepares the target npy file to be read for each node. If the found file
1016  * is already in npy format, it is pushed to NPYFilePool; otherwise, the conversion tool in convert_async.py is used to
1017  * convert it to npy format beforehand.
1018 */
1019 void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, std::vector<size_t> slot,
1020 std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
1021 std::vector<unsigned int> root_graph_id, NPYFilePool *const result_list) {
1022 DirMap dir_to_files_map;
1023 for (unsigned int i = 0; i < backend_name.size(); i++) {
1024 // form prefix of the tensor file to read from graph pb node name
1025 std::string dump_style_kernel_name = backend_name[i];
1026
1027 // remove slot from name
1028 std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
1029 dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
1030
1031 MS_LOG(INFO) << "Dump style kernel_name: " << dump_style_kernel_name << ", slot is: " << slot[i];
1032 std::string prefix_dump_file_name = GetNodeNameWithoutScope(dump_style_kernel_name);
1033
1034 std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
1035 std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
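    // i.e. "<dump_dir>/rank_<device_id>/<net_name>/<root_graph_id>/<iteration>", for example
    // (hypothetically) "/tmp/dump/rank_0/Net/0/2".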
1036
1037 // if node name is constant, skip
1038 if (prefix_dump_file_name.length() > strlen(constant_prefix) &&
1039 prefix_dump_file_name.substr(0, strlen(constant_prefix)).compare(constant_prefix) == 0) {
1040 continue;
1041 }
1042 // search files in dir for the one that meets the filename prefix and read the file into memory
1043 std::string abspath = RealPath(specific_dump_dir);
1044 auto preprocess_async_result = PreProcessDumpDirAsync(abspath);
1045 bool is_success = std::get<0>(preprocess_async_result);
1046 if (!is_success) {
1047 // directory does not exist
1048 return;
1049 }
1050 ProcessConvertList(std::get<1>(preprocess_async_result), prefix_dump_file_name, specific_dump_dir,
1051 &dir_to_files_map, result_list);
1052 }
1053 ConvertToHostFormat(dir_to_files_map, result_list);
1054 }
1055
1056 void DebugServices::ConvertWatchPointNodes(const DumpFileMap &dump_dir_mapped_files,
1057 const std::vector<ProtoDump> &proto_dump,
1058 const std::string &specific_dump_dir, NPYFilePool *const result_list) const {
1059 DirMap dir_to_files_map;
1060 for (const auto &node : proto_dump) {
1061 std::string dump_name = node.dump_name;
1062 // search files in dir for the one that meets the filename prefix and read the file into memory
1063 std::string abspath = RealPath(specific_dump_dir);
1064 DIR *d = opendir(abspath.c_str());
1065 if (d == nullptr) {
1066 MS_LOG(INFO) << "Directory " << specific_dump_dir.c_str() << " does not exist in ConvertWatchPointNodes.";
1067 return;
1068 }
1069 ProcessConvertList(dump_dir_mapped_files, dump_name, specific_dump_dir, &dir_to_files_map, result_list);
1070 (void)closedir(d);
1071 }
1072 ConvertToHostFormat(dir_to_files_map, result_list);
1073 }
1074
1075 /*
1076 * Feature group: Offline debugger.
1077 * Target device group: Ascend.
1078 * Runtime category: Old runtime, MindRT.
1079  * Description: This function searches the dump dir and separates the npy files from the bin files in the async dump dir.
1080 */
1081 DebugServices::AsyncPreProcessResult DebugServices::PreProcessDumpDirAsync(const std::string &specific_dump_dir) const {
1082 // DumpFileMap for each specific dump dir (including rank, graph_id and iteration)
1083 DumpFileMap dump_dir_mapped_files;
1084 AsyncPreProcessResult async_result;
1085 DIR *d = opendir(specific_dump_dir.c_str());
1086 if (d == nullptr) {
1087     MS_LOG(ERROR) << "Specific dump dir does not exist for preprocessing: " << specific_dump_dir;
1088 std::get<0>(async_result) = false;
1089 std::get<1>(async_result) = dump_dir_mapped_files;
1090 return async_result;
1091 }
1092 struct dirent *dir = nullptr;
1093 while ((dir = readdir(d)) != nullptr) {
1094 std::string file_name = dir->d_name;
1095 std::string file_path = specific_dump_dir + std::string("/") + file_name;
1096 if (!IsRegFile(file_path)) {
1097 continue;
1098 }
1099 bool is_txt = file_name.rfind(".txt") != std::string::npos;
1100 if (is_txt) {
1101       // txt files in the dump dir contain the list of files that failed npy conversion.
1102 MS_LOG(DEBUG) << "Skipping txt file: " << file_name;
1103 continue;
1104 }
1105 std::string op_name;
1106 bool is_npy = file_name.rfind(kNpyExt) != std::string::npos;
1107 auto first_dot = file_name.find('.');
1108
1109 const int kSeventhFromRight = 7;
1110 size_t pos = file_name.rfind(".");
1111 for (int cnt = 1; cnt < kSeventhFromRight; cnt++) {
1112 pos = file_name.rfind(".", pos - 1);
1113 }
1114 size_t seventh_last_dot = pos;
1115
1116 if (seventh_last_dot != std::string::npos && first_dot != std::string::npos && seventh_last_dot > first_dot) {
1117 // name_to_match is between first dot and seventh last dot.
1118 // if op_type is parameter, the op_name can have dots.
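      // E.g. for a (hypothetical) file "Conv2D.Default_network_Conv2D-op1.0.0.1623456789.output.0.DefaultFormat.npy",
      // op_name would be "Default_network_Conv2D-op1".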
1119 op_name = file_name.substr(first_dot + 1, seventh_last_dot - first_dot - 1);
1120 }
1121
1122 if (is_npy) {
1123 // push back the file_name with specific dump dir
1124 (dump_dir_mapped_files[specific_dump_dir].npy_files[op_name]).push_back(file_path);
1125 } else {
1126 // push back the file_name without specific dump dir. dump dir is the map key.
1127 dump_dir_mapped_files[specific_dump_dir].bin_files.push_back(file_name);
1128 }
1129 }
1130 (void)closedir(d);
1131 std::get<0>(async_result) = true;
1132 std::get<1>(async_result) = dump_dir_mapped_files;
1133 return async_result;
1134 }
1135
1136 /*
1137 * Feature group: Offline debugger.
1138 * Target device group: Ascend, GPU.
1139 * Runtime category: Old runtime, MindRT.
1140  * Description: This function searches the dump dir for npy files.
1141 */
PreProcessDumpDirSync(const std::string & specific_dump_dir) const1142 DebugServices::NPYFilePool DebugServices::PreProcessDumpDirSync(const std::string &specific_dump_dir) const {
1143 // npy format:
1144 // {dump_path}/{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{output_or_input_string}.{slot}.{format}.npy
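  // e.g. a hypothetical path:
  // {dump_path}/Conv2D.Default--network-conv1.12.7.161243.output.0.NCHW.npy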
1145 NPYFilePool npy_files;
1146 DIR *d = opendir(specific_dump_dir.c_str());
1147 if (d == nullptr) {
1148     MS_LOG(ERROR) << "Specific dump dir does not exist for preprocessing: " << specific_dump_dir;
1149 return npy_files;
1150 }
1151 struct dirent *dir = nullptr;
1152 while ((dir = readdir(d)) != nullptr) {
1153 std::string file_name = dir->d_name;
1154 std::string file_path = specific_dump_dir + std::string("/") + file_name;
1155 if (!IsRegFile(file_path)) {
1156 continue;
1157 }
1158 bool is_npy = file_name.rfind(kNpyExt) != std::string::npos;
1159 if (is_npy) {
1160 (void)npy_files.insert(file_path);
1161 }
1162 }
1163 (void)closedir(d);
1164 return npy_files;
1165 }
1166
ProcessConvertList(const DumpFileMap & dump_dir_mapped_files,const std::string & prefix_dump_file_name,const std::string & specific_dump_dir,DirMap * dir_to_files_map,NPYFilePool * const result_list) const1167 void DebugServices::ProcessConvertList(const DumpFileMap &dump_dir_mapped_files,
1168 const std::string &prefix_dump_file_name, const std::string &specific_dump_dir,
1169 DirMap *dir_to_files_map, NPYFilePool *const result_list) const {
1170 MS_EXCEPTION_IF_NULL(dir_to_files_map);
1171 auto it = dump_dir_mapped_files.find(specific_dump_dir);
1172 if (it == dump_dir_mapped_files.end()) {
1173 // no matched file
1174     MS_LOG(ERROR) << "Pre-processing was not done correctly for: " << specific_dump_dir;
1175 return;
1176 }
1177 auto bin_files = (it->second).bin_files;
1178 auto npy_files = (it->second).npy_files;
1179
1180 for (size_t i = 0; i < bin_files.size(); i++) {
1181 std::string file_name = bin_files[i];
1182     std::string file_name_w_o_prefix = file_name;
1183     auto type_pos = file_name.find('.');
1184     // a dot is appended to the prefix so that only an exact dump name match (not a longer name sharing the prefix) is accepted.
1185     if (type_pos == std::string::npos ||
1186         file_name.find(prefix_dump_file_name + ".", type_pos + 1) == std::string::npos) {
1187       continue;
1188     }
1189     std::size_t second_dot = file_name.find(".", file_name.find(prefix_dump_file_name + ".", type_pos + 1));
1190     (void)file_name_w_o_prefix.replace(type_pos + 1, second_dot - type_pos - 1, prefix_dump_file_name);
1191     // if the file matches the prefix and is in device format, add it to the candidate files to convert.
1192     (*dir_to_files_map)[specific_dump_dir].push_back(std::make_tuple(file_name, file_name_w_o_prefix));
1193 }
1194 // Add the already converted npy files to result_list
1195 if (npy_files.find(prefix_dump_file_name) != npy_files.end()) {
1196 (void)std::copy(npy_files[prefix_dump_file_name].begin(), npy_files[prefix_dump_file_name].end(),
1197 std::inserter(*result_list, result_list->end()));
1198 }
1199 }
1200
GetTensorDataInfoAsync(const std::vector<ProtoDump> & proto_dump,const std::string & specific_dump_dir,uint32_t iteration,uint32_t device_id,uint32_t root_graph_id,const ProcessedNPYFiles & processed_async_files,std::vector<std::shared_ptr<TensorData>> * const tensor_list)1201 void DebugServices::GetTensorDataInfoAsync(const std::vector<ProtoDump> &proto_dump,
1202 const std::string &specific_dump_dir, uint32_t iteration, uint32_t device_id,
1203 uint32_t root_graph_id, const ProcessedNPYFiles &processed_async_files,
1204 std::vector<std::shared_ptr<TensorData>> *const tensor_list) {
1205 auto it = processed_async_files.find(specific_dump_dir);
1206 if (it == processed_async_files.end()) {
1207 MS_LOG(DEBUG) << "no npy file was found for dump directory: " << specific_dump_dir;
1208 return;
1209 }
1210 auto processed_files_for_dir = it->second;
1211 for (auto &node : proto_dump) {
1212 std::vector<size_t> slot_list;
1213 std::string dump_name = node.dump_name;
1214 bool output_flag = node.is_output;
1215
1216 for (const auto &dump_file_attr : processed_files_for_dir) {
1217 if (dump_file_attr.name_to_match == dump_name && dump_file_attr.is_output == output_flag) {
1218 slot_list.push_back(dump_file_attr.slot);
1219 }
1220 }
1221 for (auto slot : slot_list) {
1222 // add a TensorData entry (data will be read when needed)
1223 std::vector<int64_t> shape;
1224 std::string orig_name = node.origin_node_name;
1225 auto tensor_data = std::make_shared<TensorData>();
1226 tensor_data->SetName(orig_name);
1227 tensor_data->SetExecutionOrder(0);
1228 tensor_data->SetSlot(slot);
1229 tensor_data->SetIteration(iteration);
1230 tensor_data->SetDeviceId(device_id);
1231 tensor_data->SetRootGraphId(root_graph_id);
1232 tensor_data->SetDataPtr(nullptr);
1233 tensor_data->SetByteSize(0);
1234 tensor_data->SetType("");
1235 tensor_data->SetShape(shape);
1236 tensor_data->SetIsOutput(output_flag);
1237 tensor_data->SetPrevIteration(GetPrevIteration(tensor_data));
1238
1239 tensor_list->push_back(tensor_data);
1240 }
1241 }
1242 }
1243
1244 /*
1245 * Feature group: Offline debugger.
1246 * Target device group: Ascend, GPU.
1247 * Runtime category: Old runtime, MindRT.
1248  * Description: This function extracts attributes such as op_name and time stamp from the npy file name and is used
1249  * for both sync and async dump.
1250 */
ProcessNPYFilePool(const NPYFilePool & npy_file_pool) const1251 DebugServices::ProcessedNPYFiles DebugServices::ProcessNPYFilePool(const NPYFilePool &npy_file_pool) const {
1252 // npy file format: node_type.node_name.task_id.stream_id.timestamp.output_input.slot.format.npy
1253 ProcessedNPYFiles processed_files;
1254 if (npy_file_pool.empty()) {
1255 MS_LOG(WARNING) << "ProcessNPYFilePool was called for an empty NPYFilePool.";
1256 return processed_files;
1257 }
1258 for (const std::string &file_name : npy_file_pool) {
1259 std::string file_name_to_check = file_name;
1260 std::string specific_dump_dir;
1261 DumpFileAttr dump_file_attr;
1262 std::string output_str;
1263 std::string slot_str;
1264 auto delim = file_name.rfind("/");
1265 if (delim != std::string::npos) {
1266 specific_dump_dir = file_name.substr(0, delim);
1267 file_name_to_check = file_name.substr(delim + 1);
1268 }
1269 std::vector<std::tuple<size_t, size_t, std::string *>> attr_to_match;
1270 size_t first_dot = file_name_to_check.find(".");
1271 size_t last_dot = file_name_to_check.rfind(kNpyExt);
1272 size_t second_last_dot = file_name_to_check.rfind(".", last_dot - 1);
1273 size_t third_last_dot = file_name_to_check.rfind(".", second_last_dot - 1);
1274 size_t fourth_last_dot = file_name_to_check.rfind(".", third_last_dot - 1);
1275 size_t fifth_last_dot = file_name_to_check.rfind(".", fourth_last_dot - 1);
1276 size_t sixth_last_dot = file_name_to_check.rfind(".", fifth_last_dot - 1);
1277 size_t seventh_last_dot = file_name_to_check.rfind(".", sixth_last_dot - 1);
1278 // name_to_match is between first dot and seventh last dot.
1279 // if op_type is parameter, the op_name can have dots.
1280 auto tuple = std::make_tuple(first_dot, seventh_last_dot, &dump_file_attr.name_to_match);
1281 attr_to_match.push_back(tuple);
1282 // slot is between second and third dot from end of the file name.
1283 tuple = std::make_tuple(third_last_dot, second_last_dot, &slot_str);
1284 attr_to_match.push_back(tuple);
1285 // time stamp is between fourth and fifth dot from end of the file name.
1286 tuple = std::make_tuple(fifth_last_dot, fourth_last_dot, &dump_file_attr.time_stamp);
1287 attr_to_match.push_back(tuple);
1288 // output is between third and fourth dot from end of the file name.
1289 tuple = std::make_tuple(fourth_last_dot, third_last_dot, &output_str);
1290 attr_to_match.push_back(tuple);
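    // Illustrative mapping for a hypothetical file name "Conv2D.Default--network-conv1.12.7.161243.output.0.NCHW.npy":
    // name_to_match = "Default--network-conv1", time_stamp = "161243", output_str = "output", slot_str = "0".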
1291 for (auto &match_item : attr_to_match) {
1292 CheckStringMatch(std::get<DebugServices::START_POS>(match_item), std::get<DebugServices::END_POS>(match_item),
1293 std::get<DebugServices::STR_POS>(match_item), file_name_to_check);
1294 }
1295
1296 if (!slot_str.empty() && !CheckStoull(&dump_file_attr.slot, slot_str)) {
1297 MS_LOG(INFO) << "Failed to get the slot from file_name: " << file_name_to_check
1298                    << ", error converting the string " << slot_str << " into an integer.";
1299 }
1300 dump_file_attr.is_output = (output_str == "output");
1301 dump_file_attr.file_path = file_name_to_check;
1302 processed_files[specific_dump_dir].push_back(dump_file_attr);
1303 }
1304 return processed_files;
1305 }
1306
1307 /*
1308 * Feature group: Offline debugger.
1309 * Target device group: Ascend, GPU.
1310 * Runtime category: Old runtime, MindRT.
1311 * Description: For the two possible modes (rank and graph), this function returns the rank_id or graph_id extracted
1312  * from the given directory name; otherwise, it returns UINT32_MAX to identify an invalid rank or graph id.
1313 */
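// For example (hypothetical directory names): GetRankOrGraphId("rank", "rank_3") returns 3,
// GetRankOrGraphId("graph", "7") returns 7, and any non-matching name returns UINT32_MAX.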
GetRankOrGraphId(const std::string & mode,const std::string & name)1314 uint32_t GetRankOrGraphId(const std::string &mode, const std::string &name) {
1315 std::regex re;
1316 if (mode == "rank") {
1317 re = "^rank_([0-9]+)$";
1318 } else if (mode == "graph") {
1319 re = "^([0-9]+)$";
1320 }
1321 std::smatch tokens;
1322 if (regex_match(name, tokens, re)) {
1323 return std::stoi(tokens[1]);
1324 } else {
1325 return UINT32_MAX;
1326 }
1327 }
1328
GetDumpRankIdList()1329 std::vector<uint32_t> DebugServices::GetDumpRankIdList() {
1330 std::vector<uint32_t> rank_id_list;
1331 std::string dump_dir = GetDumpDir();
1332 DIR *d_handle = opendir(dump_dir.c_str());
1333 if (d_handle == nullptr) {
1334 MS_LOG(ERROR) << "Dump directory does not exist.";
1335 return rank_id_list;
1336 }
1337 struct dirent *dir = nullptr;
1338 while ((dir = readdir(d_handle)) != nullptr) {
1339 struct stat st;
1340 std::string name = dump_dir + std::string("/") + std::string(dir->d_name);
1341 int ret = stat(name.c_str(), &st);
1342 if (ret != 0) {
1343 MS_LOG(ERROR) << "stat error, ret is: " << ret;
1344 (void)closedir(d_handle);
1345 return rank_id_list;
1346 }
1347 if (S_ISDIR(st.st_mode)) {
1348 std::string rank_dir_name = dir->d_name;
1349 uint32_t rank_id = GetRankOrGraphId("rank", rank_dir_name);
1350 if (rank_id != UINT32_MAX) {
1351 rank_id_list.push_back(rank_id);
1352 }
1353 }
1354 }
1355 (void)closedir(d_handle);
1356 return rank_id_list;
1357 }
1358
1359 /*
1360 * Feature group: Offline debugger.
1361 * Target device group: Ascend, GPU.
1362 * Runtime category: Old runtime, MindRT.
1363 * Description: Searches the current dump directory and for each rank_id in rank_id_list extracts the existing
1364 * graph_ids. Then the history file is read for all the extracted graph_ids.
1365 */
CheckDumpGraphIdList(std::vector<uint32_t> rank_id_list)1366 void DebugServices::CheckDumpGraphIdList(std::vector<uint32_t> rank_id_list) {
1367 std::string net_name = GetNetName();
1368 std::string dump_dir = GetDumpDir();
1369 for (uint32_t rank_id : rank_id_list) {
1370 std::string path = dump_dir + "/rank_" + std::to_string(rank_id) + "/" + net_name;
1371 std::string abspath = RealPath(path);
1372 DIR *d_handle_rank = opendir(abspath.c_str());
1373 if (d_handle_rank == nullptr) {
1374 MS_LOG(ERROR) << "Directory for rank_id: " << rank_id << " does not exist.";
1375 continue;
1376 }
1377 struct dirent *direc = nullptr;
1378 while ((direc = readdir(d_handle_rank)) != nullptr) {
1379 struct stat st;
1380 std::string name = abspath + std::string("/") + std::string(direc->d_name);
1381 int ret = stat(name.c_str(), &st);
1382 if (ret != 0) {
1383 MS_LOG(ERROR) << "stat error, ret is: " << ret;
1384 (void)closedir(d_handle_rank);
1385 return;
1386 }
1387 if (S_ISDIR(st.st_mode)) {
1388 std::string graph_dir = direc->d_name;
1389 if (graph_dir == "." || graph_dir == "..") {
1390 continue;
1391 }
1392 uint32_t graph_id = GetRankOrGraphId("graph", graph_dir);
1393 if (graph_id != UINT32_MAX) {
1394 ReadGraphsHistory(rank_id, graph_id);
1395 }
1396 }
1397 }
1398 (void)closedir(d_handle_rank);
1399 }
1400 }
1401
SetGraphsHistory()1402 void DebugServices::SetGraphsHistory() {
1403 // extract rank_id_list
1404 std::vector<uint32_t> rank_id_list = GetDumpRankIdList();
1405 // for each rank_id extract the graph_id list and set the dump version
1406 // and for each graph read the graph history file
1407 CheckDumpGraphIdList(rank_id_list);
1408 }
1409
1410 /*
1411 * Feature group: Offline debugger.
1412 * Target device group: Ascend, GPU.
1413 * Runtime category: Old runtime, MindRT.
1414 * Description: Reads the graph history file (containing iteration numbers in which the graph was executed) and stores
1415 * the data in graphs_run_history_ for the given rank and graph id.
1416 */
ReadGraphsHistory(uint32_t rank_id,uint32_t root_graph_id)1417 void DebugServices::ReadGraphsHistory(uint32_t rank_id, uint32_t root_graph_id) {
1418 std::tuple<uint32_t, uint32_t> rank_and_graph(rank_id, root_graph_id);
1419 if (graphs_run_history_.find(rank_and_graph) != graphs_run_history_.end()) {
1420 // graph history was already stored for this rank_id and graph_id
1421 return;
1422 }
1423 std::string exec_order_path = GetDumpDir() + "/rank_" + std::to_string(rank_id) + "/execution_order/";
1424 std::string file_to_check = "ms_global_execution_order_graph_" + std::to_string(root_graph_id) + ".csv";
1425 DIR *d_handle = opendir(exec_order_path.c_str());
1426 if (d_handle == nullptr) {
1427 MS_LOG(ERROR) << "Execution order directory does not exist.";
1428 return;
1429 }
1430 // read file and store the info
1431 std::string full_path = exec_order_path + "/" + file_to_check;
1432 std::string checked_path = RealPath(full_path);
1433 if (!checked_path.empty()) {
1434 ReadGraphRunIter(checked_path, rank_and_graph);
1435 }
1436 (void)closedir(d_handle);
1437 }
1438
1439 /*
1440 * Feature group: Offline debugger.
1441 * Target device group: Ascend, GPU.
1442 * Runtime category: Old runtime, MindRT.
1443  * Description: Returns a map keyed by a (rank, graph) tuple whose value is a vector of tuples; each tuple holds the
1444  * node name and a flag indicating whether the node is an output.
1446 */
GetAllWpNodes()1447 std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> DebugServices::GetAllWpNodes() {
1448 std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> rank_and_graph_to_nodes;
1449 for (auto w_table_item : watchpoint_table_) {
1450 auto wp = std::get<1>(w_table_item);
1451 unsigned int index = 0;
1452 for (auto check_node : wp.check_node_list) {
1453 std::vector<uint32_t> ranks = std::get<1>(wp.check_node_device_list[index]);
1454 std::vector<uint32_t> graphs = std::get<1>(wp.check_node_graph_list[index]);
1455 // graph represents root_graph for Ascend and kernel_graph for GPU
1456 for (auto rank : ranks) {
1457 for (auto graph : graphs) {
1458 std::tuple<uint32_t, uint32_t> key(rank, graph);
1459 (rank_and_graph_to_nodes)[key].push_back(check_node);
1460 }
1461 }
1462 index++;
1463 }
1464 }
1465 return rank_and_graph_to_nodes;
1466 }
1467
1468 /*
1469 * Feature group: Offline debugger.
1470 * Target device group: Ascend, GPU.
1471 * Runtime category: Old runtime, MindRT.
1472 * Description: For the given graph and rank id, reads the graph history file, stores all the run iterations for the
1473  * graph in a vector and inserts it into the graphs_run_history_ map.
1474 */
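// Each line of the history file holds a single iteration number; e.g. a hypothetical file with the lines
// "1", "3" and "5" yields run_iters_vec = {1, 3, 5} for that (rank_id, root_graph_id) pair.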
ReadGraphRunIter(std::string file_path,std::tuple<uint32_t,uint32_t> rank_and_graph)1475 void DebugServices::ReadGraphRunIter(std::string file_path, std::tuple<uint32_t, uint32_t> rank_and_graph) {
1476 std::ifstream infile;
1477 std::string line;
1478 infile.open(file_path.c_str());
1479 if (!infile.is_open()) {
1480 MS_LOG(ERROR) << "Failed to open file (In ReadGraphRunIter) " << file_path << " Errno:" << errno;
1481 const int kMaxFilenameLength = NAME_MAX;
1482 char err_info[kMaxFilenameLength];
1483 if (strerror_r(errno, err_info, sizeof(err_info)) != kStrErrorNone) {
1484 MS_LOG(ERROR) << " ErrInfo:" << strerror_r(errno, err_info, sizeof(err_info));
1485 }
1486
1487 return;
1488 }
1489 std::vector<uint32_t> run_iters_vec;
1490 while (std::getline(infile, line)) {
1491 uint32_t iter;
1492 std::stringstream ss(line);
1493 ss >> iter;
1494 run_iters_vec.push_back(iter);
1495 }
1496 (void)graphs_run_history_.emplace(
1497 std::pair<std::tuple<uint32_t, uint32_t>, std::vector<uint32_t>>(rank_and_graph, run_iters_vec));
1498 }
1499
1500 /*
1501 * Feature group: Offline debugger.
1502 * Target device group: Ascend, GPU.
1503 * Runtime category: Old runtime, MindRT.
1504  * Description: Creates a tensor_data object and sets its variables based on the function arguments and adds the tensor
1505 * to the tensor_list_map_.
1506 */
AddToTensorData(const std::string & backend_name,const std::string & time_stamp,const std::size_t slot,const unsigned int iteration,const unsigned int device_id,const unsigned int root_graph_id,const bool is_output,const std::size_t data_size,const std::string & type_name,const std::vector<int64_t> & shape,char * buffer,std::vector<std::shared_ptr<TensorData>> * const result_list)1507 void DebugServices::AddToTensorData(const std::string &backend_name, const std::string &time_stamp,
1508 const std::size_t slot, const unsigned int iteration, const unsigned int device_id,
1509 const unsigned int root_graph_id, const bool is_output, const std::size_t data_size,
1510 const std::string &type_name, const std::vector<int64_t> &shape, char *buffer,
1511 std::vector<std::shared_ptr<TensorData>> *const result_list) {
1512 // call LoadNewTensor to store tensor in internal cache
1513 auto tensor_data = std::make_shared<TensorData>();
1514 tensor_data->SetName(backend_name);
1515 tensor_data->SetExecutionOrder(0);
1516 tensor_data->SetSlot(slot);
1517 tensor_data->SetIteration(iteration);
1518 tensor_data->SetDeviceId(device_id);
1519 tensor_data->SetRootGraphId(root_graph_id);
1520 tensor_data->SetIsOutput(is_output);
1521 if (buffer != nullptr) {
1522 tensor_data->SetDataPtr(buffer);
1523 } else {
1524 tensor_data->SetDataPtr(nullptr);
1525 }
1526 tensor_data->SetByteSize(data_size);
1527 tensor_data->SetType(type_name);
1528 tensor_data->SetShape(shape);
1529 tensor_data->SetTimeStamp(time_stamp);
1530 tensor_data->SetPrevIteration(GetPrevIteration(tensor_data));
1531 if (data_size > 0) {
1532 (void)tensor_loader_->LoadNewTensor(tensor_data, false);
1533 }
1534
1535 // add to result_list
1536 result_list->push_back(tensor_data);
1537 }
1538
GetNewestFileIndex(std::vector<std::string> matched_time_stamps)1539 int GetNewestFileIndex(std::vector<std::string> matched_time_stamps) {
1540 // given the vector of matched_time_stamps, get the index of the newest time stamp.
1541 // this index is used to find the corresponding matched_path.
1542 if (matched_time_stamps.empty()) {
1543 return -1;
1544 }
1545 auto it = std::max_element(matched_time_stamps.begin(), matched_time_stamps.end());
1546 int index = it - matched_time_stamps.begin();
1547 return index;
1548 }
1549
1550 /*
1551 * Feature group: Offline debugger.
1552 * Target device group: Ascend, GPU.
1553 * Runtime category: Old runtime, MindRT.
1554  * Description: Search files in NPYFilePool (sync and async mode) for the one that meets the filename
1555 * prefix and read the file into memory.
1556 */
ReadDumpedTensor(std::vector<std::string> backend_name,std::vector<size_t> slot,std::vector<unsigned int> device_id,std::vector<unsigned int> iteration,std::vector<unsigned int> root_graph_id,const std::vector<bool> & is_output,ProcessedNPYFiles * const processed_npy_files,std::vector<std::shared_ptr<TensorData>> * const result_list,bool is_base_request,bool * no_mem_to_read)1557 void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std::vector<size_t> slot,
1558 std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
1559 std::vector<unsigned int> root_graph_id, const std::vector<bool> &is_output,
1560 ProcessedNPYFiles *const processed_npy_files,
1561 std::vector<std::shared_ptr<TensorData>> *const result_list, bool is_base_request,
1562 bool *no_mem_to_read) {
1563 for (unsigned int i = 0; i < backend_name.size(); i++) {
1564 // form prefix of the tensor file to read from graph pb node name
1565 std::string dump_style_kernel_name = backend_name[i];
1566
1567 // remove slot from name
1568 std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
1569 dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
1570
1571 std::string specific_dump_dir;
1572 bool is_cst = false;
1573 // prefix_dump_to_check is node name used to find corresponding dump file.
1574 std::string prefix_dump_to_check = GetNodeNameWithoutScope(dump_style_kernel_name);
1575 // if node name has prefix of "Default--data-", consider as constant, search in cst folder
1576 if (prefix_dump_to_check.length() > strlen(constant_prefix) &&
1577 prefix_dump_to_check.substr(0, strlen(constant_prefix)).compare(constant_prefix) == 0) {
1578 specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
1579 std::to_string(root_graph_id[i]) + "/constants";
1580 is_cst = true;
1581 const std::string prefix = "Default--";
1582 prefix_dump_to_check = prefix_dump_to_check.substr(prefix.length());
1583 } else {
1584 specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
1585 std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
1586 }
1587 MS_LOG(INFO) << "specific_dump_dir " << specific_dump_dir;
1588 if ((is_sync_mode_ || is_cst) && processed_npy_files->find(specific_dump_dir) == processed_npy_files->end()) {
1589 // This case happens when ReadDumpedTensor is called from GetPrevTensor function.
1590 NPYFilePool npy_files = PreProcessDumpDirSync(specific_dump_dir);
1591 *processed_npy_files = ProcessNPYFilePool(npy_files);
1592 }
1593 ReadDumpedTensorUtils(specific_dump_dir, prefix_dump_to_check, backend_name[i], slot[i], device_id[i], iteration[i],
1594 root_graph_id[i], is_output[i], *processed_npy_files, result_list, no_mem_to_read,
1595 is_base_request);
1596 }
1597 }
1598 /*
1599 * Feature group: Offline debugger.
1600 * Target device group: Ascend, GPU.
1601 * Runtime category: Old runtime, MindRT.
1602  * Description: For both sync and async dump, gets the newest matched file path, reads the npy file and adds the
1603  * tensor_data object to tensor_list_map_. If there is no matched file, an empty tensor_data object is created with
1604 * data_size = 0, empty shape and nullptr buffer.
1605 */
ReadFileAndAddToTensor(const bool found,const std::vector<std::string> & matched_paths,const std::vector<std::string> & matched_time_stamps,const std::string & backend_name,const unsigned int device_id,const unsigned int root_graph_id,bool is_output,size_t slot,bool * no_mem_to_read,unsigned int iteration,std::vector<std::shared_ptr<TensorData>> * result_list,bool is_base_request)1606 void DebugServices::ReadFileAndAddToTensor(const bool found, const std::vector<std::string> &matched_paths,
1607 const std::vector<std::string> &matched_time_stamps,
1608 const std::string &backend_name, const unsigned int device_id,
1609 const unsigned int root_graph_id, bool is_output, size_t slot,
1610 bool *no_mem_to_read, unsigned int iteration,
1611 std::vector<std::shared_ptr<TensorData>> *result_list,
1612 bool is_base_request) {
1613 std::string time_stamp = "";
1614 std::string result_path = "";
1615 std::string type_name = "";
1616 size_t data_size = 0;
1617 std::vector<int64_t> shape;
1618 char *buffer = nullptr;
1619 if (found) {
1620 int index = GetNewestFileIndex(matched_time_stamps);
1621 if (index >= 0) {
1622 result_path = matched_paths[index];
1623 time_stamp = matched_time_stamps[index];
1624 }
1625
1626 std::string key_name_in_cache = backend_name + ":" + std::to_string(device_id) + ":" +
1627 std::to_string(root_graph_id) + ":" + std::to_string(is_output) + ":" +
1628 std::to_string(slot);
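    // Cache key layout: "<backend_name>:<device_id>:<root_graph_id>:<is_output>:<slot>",
    // e.g. a hypothetical key "Default/network/Conv2D-op1:0:1:1:0".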
1629 ReadTensorFromNpy(key_name_in_cache, result_path, &type_name, &data_size, &shape, &buffer, no_mem_to_read,
1630 is_base_request);
1631 AddToTensorData(backend_name, time_stamp, slot, iteration, device_id, root_graph_id, is_output, data_size,
1632 type_name, shape, buffer, result_list);
1633 } else {
1634 AddToTensorData(backend_name, time_stamp, slot, iteration, device_id, root_graph_id, is_output, 0, type_name, shape,
1635 buffer, result_list);
1636 MS_LOG(INFO) << "Target tensor has not been found.";
1637 }
1638 }
1639
1640 /*
1641 * Feature group: Offline debugger.
1642 * Target device group: Ascend.
1643 * Runtime category: Old runtime, MindRT.
1644 * Description: Iterates through all the processed npy files for the current specific_dump_dir and looks for the files
1645 * that match the node_name for dump, read the newest file and add the related tensor_data object.
1646 */
ReadDumpedTensorUtils(const std::string & specific_dump_dir,const std::string & prefix_dump_to_check,const std::string & backend_name,size_t slot,unsigned int device_id,unsigned int iteration,unsigned int root_graph_id,bool is_output,const ProcessedNPYFiles & processed_npy_files,std::vector<std::shared_ptr<TensorData>> * result_list,bool * no_mem_to_read,bool is_base_request)1647 void DebugServices::ReadDumpedTensorUtils(const std::string &specific_dump_dir, const std::string &prefix_dump_to_check,
1648 const std::string &backend_name, size_t slot, unsigned int device_id,
1649 unsigned int iteration, unsigned int root_graph_id, bool is_output,
1650 const ProcessedNPYFiles &processed_npy_files,
1651 std::vector<std::shared_ptr<TensorData>> *result_list, bool *no_mem_to_read,
1652 bool is_base_request) {
1653 bool found = false;
1654 std::vector<std::string> matched_paths;
1655 std::vector<std::string> matched_time_stamps;
1656 auto it = processed_npy_files.find(specific_dump_dir);
1657 // If there is no npy file found we still need to add tensor data with size 0.
1658 if (it == processed_npy_files.end()) {
1659     MS_LOG(WARNING) << "No npy files were found for dump directory: " << specific_dump_dir;
1660 } else {
1661 auto processed_files_for_dir = it->second;
1662 for (const auto &dump_file_attr : processed_files_for_dir) {
1663 std::string file_name_to_check = dump_file_attr.file_path;
1664 std::string full_path = specific_dump_dir + "/" + file_name_to_check;
1665
1666 if (dump_file_attr.name_to_match == prefix_dump_to_check && (dump_file_attr.slot == slot) &&
1667 (is_output == dump_file_attr.is_output)) {
1668 matched_paths.push_back(full_path);
1669 matched_time_stamps.push_back(dump_file_attr.time_stamp);
1670 found = true;
1671 }
1672 }
1673 }
1674 ReadFileAndAddToTensor(found, matched_paths, matched_time_stamps, backend_name, device_id, root_graph_id, is_output,
1675 slot, no_mem_to_read, iteration, result_list, is_base_request);
1676 }
1677
1678 /*
1679 * Feature group: Offline debugger.
1680 * Target device group: Ascend, GPU.
1681 * Runtime category: Old runtime, MindRT.
1682 * Description: Gets a list of the nodes that should be monitored, creates a vector called proto_to_dump with nodes'
1683 * original names and dump style names. Then, for each node, it creates an empty tensor_data object with data_byte_size
1684  * = 0 and data_ptr = nullptr and adds it to the tensor_list (for both sync and async dump). This tensor_list is used
1685  * by the watchpoint-checking functions.
1686 */
ReadNeededDumpedTensors(unsigned int iteration,ProcessedNPYFiles * const processed_npy_files,bool error_on_no_value)1687 std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
1688 unsigned int iteration, ProcessedNPYFiles *const processed_npy_files, bool error_on_no_value) {
1689 // get a list of nodes and the devices they are on to monitor
1690 std::vector<std::shared_ptr<TensorData>> tensor_list;
1691 std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> rank_and_graph_to_nodes =
1692 GetAllWpNodes();
1693 // scan each device/iteration dir for the watched nodes for each device, and add to tensor_list
1694 // as they are found
1695 for (auto const &rank_and_graph_item : rank_and_graph_to_nodes) {
1696 std::tuple<uint32_t, uint32_t> rank_and_graph = rank_and_graph_item.first;
1697 uint32_t rank_id = std::get<0>(rank_and_graph);
1698 uint32_t root_graph_id = std::get<1>(rank_and_graph);
1699 MS_LOG(INFO) << "Get tensor files for rank_id: " << rank_id << ", root_graph_id: " << root_graph_id;
1700 std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(rank_id) + "/" + net_name_ + "/" +
1701 std::to_string(root_graph_id) + "/" + IterationString(iteration);
1702 std::string real_dump_dir = RealPath(specific_dump_dir);
1703 if (real_dump_dir.empty()) {
1704       MS_LOG(INFO) << "Dump dir " << specific_dump_dir << " doesn't exist. Skip it.";
1705 continue;
1706 }
1707 std::vector<std::tuple<std::string, bool>> wp_nodes = rank_and_graph_item.second;
1708 std::vector<ProtoDump> proto_to_dump;
1709
1710 // convert node names to dump style
1711 for (auto node : wp_nodes) {
1712 std::string orig_name = std::get<0>(node);
1713 // Remove the scope from the fully qualified name to compare for both sync and async case.
1714 std::string dump_style_name = GetNodeNameWithoutScope(orig_name);
1715
1716 bool node_is_out = std::get<1>(node);
1717 ProtoDump dump_proto;
1718 dump_proto.origin_node_name = orig_name;
1719 dump_proto.dump_name = dump_style_name;
1720 dump_proto.is_output = node_is_out;
1721
1722 if (std::find(proto_to_dump.begin(), proto_to_dump.end(), dump_proto) == proto_to_dump.end()) {
1723 proto_to_dump.push_back(dump_proto);
1724 }
1725 }
1726 if (is_sync_mode_) {
1727 // search files in dir for the one that meets the filename prefix and read the file into memory
1728 NPYFilePool npy_files = PreProcessDumpDirSync(real_dump_dir);
1729 auto processed_npy_files_in_rank = ProcessNPYFilePool(npy_files);
1730 processed_npy_files->insert(processed_npy_files_in_rank.begin(), processed_npy_files_in_rank.end());
1731 ProcessTensorDataSync(proto_to_dump, real_dump_dir, *processed_npy_files, iteration, rank_id, root_graph_id,
1732 &tensor_list, error_on_no_value);
1733 } else {
1734 auto preprocess_async_result = PreProcessDumpDirAsync(real_dump_dir);
1735 // convert all files in proto_to_dump to npy and add to pool of async file names
1736 NPYFilePool async_file_pool;
1737 ConvertWatchPointNodes(std::get<1>(preprocess_async_result), proto_to_dump, real_dump_dir, &async_file_pool);
1738 auto processed_npy_files_in_rank = ProcessNPYFilePool(async_file_pool);
1739 processed_npy_files->insert(processed_npy_files_in_rank.begin(), processed_npy_files_in_rank.end());
1740 GetTensorDataInfoAsync(proto_to_dump, real_dump_dir, iteration, rank_id, root_graph_id, *processed_npy_files,
1741 &tensor_list);
1742 }
1743 }
1744
1745 return tensor_list;
1746 }
1747
1748 /*
1749 * Feature group: Offline debugger.
1750 * Target device group: Ascend, GPU.
1751 * Runtime category: Old runtime, MindRT.
1752 * Description: Iterates through the dump directory and for each file it looks for a match in the file name with node
1753 * names in proto_to_dump vector.
1754 */
ProcessTensorDataSync(const std::vector<ProtoDump> & proto_to_dump,const std::string & specific_dump_dir,ProcessedNPYFiles processed_npy_files,unsigned int iteration,unsigned int device_id,unsigned int root_graph_id,std::vector<std::shared_ptr<TensorData>> * const tensor_list,bool error_on_no_value)1755 void DebugServices::ProcessTensorDataSync(const std::vector<ProtoDump> &proto_to_dump,
1756 const std::string &specific_dump_dir, ProcessedNPYFiles processed_npy_files,
1757 unsigned int iteration, unsigned int device_id, unsigned int root_graph_id,
1758 std::vector<std::shared_ptr<TensorData>> *const tensor_list,
1759 bool error_on_no_value) {
1760 ProcessedNPYFiles::const_iterator it = processed_npy_files.find(specific_dump_dir);
1761 if (it == processed_npy_files.end()) {
1762 if (error_on_no_value) {
1763       MS_LOG(ERROR) << "No npy files were found for dump directory: " << specific_dump_dir;
1764 }
1765 return;
1766 }
1767 auto processed_files_for_dir = it->second;
1768 for (const auto &dump_file_attr : processed_files_for_dir) {
1769 for (auto &node : proto_to_dump) {
1770 std::string dump_name = node.dump_name;
1771 if (dump_name == dump_file_attr.name_to_match && node.is_output == dump_file_attr.is_output) {
1772 size_t slot = dump_file_attr.slot;
1773 std::vector<int64_t> shape;
1774 std::string orig_name = node.origin_node_name;
1775 bool output_flag = node.is_output;
1776
1777 AddToTensorData(orig_name, "", slot, iteration, device_id, root_graph_id, output_flag, 0, "", shape, nullptr,
1778 tensor_list);
1779 break;
1780 }
1781 }
1782 }
1783 }
1784
IterationString(unsigned int iteration) const1785 std::string DebugServices::IterationString(unsigned int iteration) const {
1786 std::string iteration_string;
1787 bool init_dbg_suspend = (iteration == std::numeric_limits<unsigned int>::max());
1788 if (init_dbg_suspend) {
1789 iteration_string = "init";
1790 } else {
1791 iteration_string = std::to_string(iteration);
1792 }
1793 return iteration_string;
1794 }
1795 #endif
1796
1797 /*
1798 * Feature group: Online debugger.
1799 * Target device group: Ascend, GPU.
1800 * Runtime category: Old runtime, MindRT.
1801  * Description: Searches for the tensor in the loaded tensors; if the tensor is found and its root_graph_id equals the
1802  * current root_graph_id, it updates the given vectors.
1803 */
ReadNodesTensors(const std::vector<std::string> & name,std::vector<std::string> * const ret_name,std::vector<const char * > * const data_ptr,std::vector<ssize_t> * const data_size,std::vector<unsigned int> * const dtype,std::vector<std::vector<int64_t>> * const shape)1804 void DebugServices::ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *const ret_name,
1805 std::vector<const char *> *const data_ptr, std::vector<ssize_t> *const data_size,
1806 std::vector<unsigned int> *const dtype,
1807 std::vector<std::vector<int64_t>> *const shape) {
1808 std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
1809 tensor_loader_->SearchTensors(name, &result_list);
1810
1811 for (auto result : result_list) {
1812 if (std::get<1>(result) == nullptr) {
1813 continue;
1814 }
1815 #ifndef OFFLINE_DBG_MODE
1816 auto debugger = Debugger::GetInstance();
1817 MS_EXCEPTION_IF_NULL(debugger);
1818 if (!CompareCurrentRootGraph(std::get<1>(result)->GetRootGraphId())) {
1819 MS_LOG(INFO) << "tensor root_graph_id: " << std::get<1>(result)->GetRootGraphId()
1820 << " is different from cur_root_graph_id: " << debugger->GetCurrentRootGraphId() << ".";
1821 MS_LOG(INFO) << "Not reading tensor: " << std::get<0>(result) << ".";
      continue;
1822     }
1823 #endif
1824 (void)ret_name->emplace_back(std::get<0>(result));
1825 (void)data_ptr->emplace_back(std::get<1>(result)->GetDataPtr());
1826 (void)data_size->emplace_back(std::get<1>(result)->GetByteSize());
1827 (void)dtype->emplace_back(std::get<1>(result)->GetType());
1828 (void)shape->emplace_back(std::get<1>(result)->GetShape());
1829 }
1830 }
1831
SearchNodesTensors(const std::vector<std::string> & name,std::vector<std::tuple<std::string,std::shared_ptr<TensorData>>> * result_list)1832 void DebugServices::SearchNodesTensors(const std::vector<std::string> &name,
1833 std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> *result_list) {
1834 if (result_list == nullptr) {
1835 MS_LOG(DEBUG) << "result_list is nullptr.";
1836 return;
1837 }
1838 tensor_loader_->SearchTensors(name, result_list);
1839 }
1840
1841 #ifndef OFFLINE_DBG_MODE
IsWatchPoint(const std::string & kernel_name,const CNodePtr & kernel) const1842 bool DebugServices::IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel) const {
1843 bool ret = false;
1844 for (auto w_table_item : watchpoint_table_) {
1845 auto check_node_list = std::get<1>(w_table_item).check_node_list;
1846 for (auto check_node : check_node_list) {
1847 std::string w_name = std::get<0>(check_node);
1848 bool w_type = std::get<1>(check_node);
1849 if ((w_type == true &&
1850 ((kernel_name.find(w_name) != string::npos && kernel_name.rfind(w_name, 0) == 0) || w_name == "*")) ||
1851 (w_type == false && (kernel_name == w_name || IsWatchPointNodeInput(w_name, kernel)))) {
1852 ret = true;
1853 return ret;
1854 }
1855 }
1856 }
1857 return ret;
1858 }
1859
IsWatchPointNodeInput(const std::string & w_name,const CNodePtr & kernel) const1860 bool DebugServices::IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const {
1861 if (kernel != nullptr && w_name.length() > 0) {
1862 auto input_size = common::AnfAlgo::GetInputTensorNum(kernel);
1863 for (size_t j = 0; j < input_size; ++j) {
1864 auto input_kernel = kernel->input(j + 1);
1865 std::string input_kernel_name = GetKernelNodeName(input_kernel);
1866 auto found = w_name.find_last_of('/');
1867 if (found != std::string::npos && w_name.size() > found && w_name.substr(found + 1) == input_kernel_name) {
1868 return true;
1869 }
1870 }
1871 return false;
1872 } else {
1873 return false;
1874 }
1875 }
1876 #endif
1877
GetTensor() const1878 std::vector<std::shared_ptr<TensorData>> DebugServices::GetTensor() const { return tensor_loader_->GetTensor(); }
1879
GetTensor(const std::string & tensor_name) const1880 std::shared_ptr<TensorData> DebugServices::GetTensor(const std::string &tensor_name) const {
1881 return tensor_loader_->GetTensor(tensor_name);
1882 }
1883
EmptyCurrentTensor()1884 void DebugServices::EmptyCurrentTensor() { tensor_loader_->EmptyCurrentTensor(); }
1885
1886 #ifndef OFFLINE_DBG_MODE
DumpTensorToFile(const std::string & filepath,const std::string & tensor_name,size_t slot) const1887 bool DebugServices::DumpTensorToFile(const std::string &filepath, const std::string &tensor_name, size_t slot) const {
1888 return tensor_loader_->DumpTensorToFile(filepath, tensor_name, slot);
1889 }
1890 #endif
1891
LoadNewTensor(const std::shared_ptr<TensorData> & tensor,bool keep_prev)1892 bool DebugServices::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev) {
1893 return tensor_loader_->LoadNewTensor(tensor, keep_prev);
1894 }
1895
1896 /*
1897 * Feature group: Offline debugger.
1898 * Target device group: Ascend, GPU.
1899 * Runtime category: Old runtime, MindRT.
1900  * Description: Returns the previous iteration in which the tensor's graph was executed. If the current step is the
1901  * first run iteration for the graph, or the graph history file is not available, it returns UINT32_MAX to identify
1902  * an invalid prev_iteration.
1903 */
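// For example, with a hypothetical run history {2, 5, 8} for the tensor's (rank, graph): iteration 8 returns 5 as the
// previous iteration, while iteration 2 (the first run) returns UINT32_MAX.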
GetPrevIteration(const std::shared_ptr<TensorData> & tensor)1904 uint32_t DebugServices::GetPrevIteration(const std::shared_ptr<TensorData> &tensor) {
1905 uint32_t prev_iter;
1906 uint32_t rank_id = tensor->GetDeviceId();
1907 uint32_t root_graph_id = tensor->GetRootGraphId();
1908 std::tuple<uint32_t, uint32_t> rank_and_graph = std::make_tuple(rank_id, root_graph_id);
1909 if (graphs_run_history_.find(rank_and_graph) == graphs_run_history_.end()) {
1910 return UINT32_MAX;
1911 }
1912 auto it = std::find(graphs_run_history_[rank_and_graph].begin(), graphs_run_history_[rank_and_graph].end(),
1913 tensor->GetIteration());
1914 if (it == graphs_run_history_[rank_and_graph].end()) {
1915 // The graph is not executed in that iteration
1916 return UINT32_MAX;
1917 } else if (it == graphs_run_history_[rank_and_graph].begin()) {
1918 // current iteration is the first iteration that the graph was run
1919 // no prev iter is available
1920 MS_LOG(DEBUG) << "Iteration: " << tensor->GetIteration()
1921 << " is the first run iteration for tensor: " << tensor->GetName();
1922 return UINT32_MAX;
1923 }
1924 (void)it--;
1925 prev_iter = *it;
1926 tensor->SetPrevIteration(prev_iter);
1927 return prev_iter;
1928 }
1929
ResetLoadedTensors()1930 void DebugServices::ResetLoadedTensors() {
1931 wp_id_cache_.clear();
1932 MS_LOG(INFO) << "Resetting loaded tensors";
1933 tensor_loader_->MoveParametersCurrentToPrev();
1934 tensor_loader_->EmptyCurrentTensor();
1935 // will move parameters from previous to current map
1936 tensor_loader_->SwapCurrentPrev();
1937 overflow_ops_.clear();
1938 }
1939
1940 #ifndef OFFLINE_DBG_MODE
GetNodeTensor(const CNodePtr & kernel)1941 std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensor(const CNodePtr &kernel) {
1942 MS_EXCEPTION_IF_NULL(kernel);
1943 std::vector<std::shared_ptr<TensorData>> result;
1944 auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
1945 auto kernel_name = GetKernelNodeName(kernel);
1946 for (size_t j = 0; j < output_size; ++j) {
1947 auto tensor_name_with_slot = kernel_name + ":" + std::to_string(j);
1948 auto tensor = tensor_loader_->GetTensor(tensor_name_with_slot);
1949 if (tensor != nullptr) {
1950 result.push_back(tensor);
1951 }
1952 }
1953 return result;
1954 }
1955 #endif
1956
GetOnlineOpOverflowDir()1957 std::string GetOnlineOpOverflowDir() {
1958 // only called for online debugger mode
1959 // get operator overflow directory for current iteration
1960 std::string overflow_bin_path = "";
1961 #ifndef OFFLINE_DBG_MODE
1962 if (DumpJsonParser::GetInstance().path().empty()) {
1963 MS_LOG(INFO) << "Dump config is not set.";
1964 return "";
1965 }
1966 auto debugger = Debugger::GetInstance();
1967 MS_EXCEPTION_IF_NULL(debugger);
1968 auto cur_graph = debugger->GetGraphPtr();
1969 if (cur_graph == nullptr) {
1970 return "";
1971 }
1972 overflow_bin_path = DumpJsonParser::GetInstance().GetOpOverflowBinPath(cur_graph->root_graph_id());
1973 auto realpath = FileUtils::GetRealPath(overflow_bin_path.c_str());
1974 if (!realpath.has_value()) {
1975 MS_LOG(INFO) << "Get real path failed for overflow_bin_path.";
1976 return "";
1977 }
1978 overflow_bin_path = realpath.value() + '/';
1979 #endif
1980 return overflow_bin_path;
1981 }
1982
GetOverflowTaskStreamId(const std::string & overflow_bin_path,std::vector<std::pair<uint64_t,uint64_t>> * task_stream_hits) const1983 void DebugServices::GetOverflowTaskStreamId(const std::string &overflow_bin_path,
1984 std::vector<std::pair<uint64_t, uint64_t>> *task_stream_hits) const {
1985 MS_EXCEPTION_IF_NULL(task_stream_hits);
1986 const std::string overflow_file_prefix = "Opdebug.Node_OpDebug.";
1987 MS_LOG(INFO) << "Processing debug_files path: " << overflow_bin_path;
1988 DIR *d = opendir(overflow_bin_path.c_str());
1989 if (d == nullptr) {
1990 MS_LOG(INFO) << "Overflow bin directory does not exist!";
1991 } else {
1992 struct dirent *dir = nullptr;
1993 while ((dir = readdir(d)) != nullptr) {
1994 std::string file_name = dir->d_name;
1995 if (file_name.rfind(overflow_file_prefix, 0) != 0) {
1996 continue;
1997 }
1998 std::string file_path = overflow_bin_path + std::string("/") + file_name;
1999 if (IsRegFile(file_path)) {
2000 // detect overflow bin file
2001 uint64_t task_id = 0;
2002 uint64_t stream_id = 0;
2003 if (!GetTaskIdStreamId(file_name, overflow_file_prefix, &task_id, &stream_id)) {
2004 continue;
2005 }
2006         MS_LOG(INFO) << "Overflow bin file " << file_name << ", task_id " << task_id << ", stream_id " << stream_id
2007 << ".";
2008 task_stream_hits->push_back(std::make_pair(task_id, stream_id));
2009 }
2010 }
2011 (void)closedir(d);
2012 }
2013 }
2014
GetTaskStreamIdNodeMap(const std::string & tensors_path,std::map<std::pair<uint64_t,uint64_t>,std::string> * task_stream_to_opnames) const2015 void DebugServices::GetTaskStreamIdNodeMap(
2016 const std::string &tensors_path, std::map<std::pair<uint64_t, uint64_t>, std::string> *task_stream_to_opnames) const {
2017 MS_EXCEPTION_IF_NULL(task_stream_to_opnames);
2018 MS_LOG(INFO) << "Processing debug_files path: " << tensors_path;
2019 const std::string overflow_file_prefix = "Opdebug.Node_OpDebug.";
2020 DIR *d = opendir(tensors_path.c_str());
2021 if (d == nullptr) {
2022 MS_LOG(INFO) << "Tensors directory does not exist!";
2023 } else {
2024 struct dirent *dir = nullptr;
2025 while ((dir = readdir(d)) != nullptr) {
2026 std::string file_name = dir->d_name;
2027 if (file_name.rfind(overflow_file_prefix, 0) == 0) {
2028         MS_LOG(INFO) << "File: " << file_name << " is not a tensor file, skip it.";
2029 continue;
2030 }
2031 std::string file_path = tensors_path + std::string("/") + file_name;
2032 if (IsRegFile(file_path)) {
2033 // attempt to read the file
2034 std::ifstream infile;
2035 infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
2036 if (!infile.is_open()) {
2037 MS_LOG(ERROR) << "Failed to open overflow bin file " << file_name << " Errno:" << errno;
2038 continue;
2039 }
2040 std::string node_name;
2041 uint64_t task_id = 0;
2042 uint64_t stream_id = 0;
2043 // detect overflow bin file, regular bin file or npy file
2044 bool success_parse = GetAttrsFromFilename(file_name, &node_name, &task_id, &stream_id);
2045 if (success_parse) {
2046 task_stream_to_opnames->insert({std::make_pair(task_id, stream_id), node_name});
2047 }
2048 infile.close();
2049 }
2050 }
2051 (void)closedir(d);
2052 }
2053 }
2054
AddOpOverflowOpNames(const std::string & overflow_bin_path,const std::string & tensors_path,std::vector<std::string> * op_names) const2055 void DebugServices::AddOpOverflowOpNames(const std::string &overflow_bin_path, const std::string &tensors_path,
2056 std::vector<std::string> *op_names) const {
2057 MS_EXCEPTION_IF_NULL(op_names);
2058 std::map<std::pair<uint64_t, uint64_t>, std::string> task_stream_to_opname;
2059 std::vector<std::pair<uint64_t, uint64_t>> task_stream_hit;
2060 GetOverflowTaskStreamId(overflow_bin_path, &task_stream_hit);
2061 GetTaskStreamIdNodeMap(tensors_path, &task_stream_to_opname);
2062
2063 // find the op_names with an overflow hit
2064 for (auto &task_stream : task_stream_hit) {
2065 auto op_name = task_stream_to_opname[task_stream];
2066 if (!op_name.empty()) {
2067 MS_LOG(INFO) << "Operation overflow detected in " << op_name;
2068 op_names->push_back(op_name);
2069 }
2070 }
2071 }
2072
2073 /*
2074 * Feature group: Online debugger, Offline debugger.
2075 * Target device group: Ascend.
2076 * Runtime category: Old runtime, MindRT.
2077  * Description: Checks whether an operator overflow happened for the given node by inspecting the overflow
2078  * directory. This function is for async mode only.
2079 */
CheckOpOverflow(std::string node_name_to_find,unsigned int device_id,unsigned int root_graph_id,unsigned int iteration)2080 bool DebugServices::CheckOpOverflow(std::string node_name_to_find, unsigned int device_id, unsigned int root_graph_id,
2081 unsigned int iteration) {
2082 if (is_sync_mode_) {
2083 return false;
2084 }
2085 std::string overflow_bin_path = "";
2086 std::string tensors_path = "";
2087 #ifndef OFFLINE_DBG_MODE
2088 overflow_bin_path = GetOnlineOpOverflowDir();
2089 tensors_path = overflow_bin_path;
2090 #else
2091 overflow_bin_path =
2092 dump_dir_ + "/rank_" + std::to_string(device_id) + "/debug_files/" + IterationString(iteration) + "/";
2093 overflow_bin_path = RealPath(overflow_bin_path);
2094 MS_LOG(INFO) << "overflow_bin_path: " << overflow_bin_path;
2095 tensors_path = dump_dir_ + "/rank_" + std::to_string(device_id) + "/" + net_name_ + "/" +
2096 std::to_string(root_graph_id) + "/" + IterationString(iteration) + "/";
2097 tensors_path = RealPath(tensors_path);
2098 if (overflow_bin_path.empty()) {
2099 overflow_bin_path = tensors_path;
2100 }
2101 #endif
2102 if (overflow_bin_path.empty() || tensors_path.empty()) {
2103 MS_LOG(INFO) << "Get real path failed for overflow_bin_path or tensors path.";
2104 return false;
2105 }
2106 // remove kernel_graph_#
2107 std::string op_name_find_with_path = RemoveKernelGraphPrefix(node_name_to_find);
2108 std::replace(op_name_find_with_path.begin(), op_name_find_with_path.end(), '/', '_');
2109
2110 // remove path
2111 size_t last_slash = node_name_to_find.rfind("/");
2112 std::string op_name_find = "";
2113 if (last_slash != std::string::npos) {
2114 op_name_find = node_name_to_find.substr(last_slash + 1);
2115 }
2116
2117 std::replace(node_name_to_find.begin(), node_name_to_find.end(), '/', '_');
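  // e.g. a hypothetical node name "kernel_graph_0/Default/network/Add-op1" yields
  // op_name_find_with_path = "Default_network_Add-op1", op_name_find = "Add-op1",
  // and node_name_to_find = "kernel_graph_0_Default_network_Add-op1".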
2118 std::vector<std::string> op_names;
2119
2120 std::lock_guard<std::mutex> lg(overflow_wp_lock_);
2121 MS_LOG(INFO) << "Searching for overflow in node " << node_name_to_find;
2122 auto found_overflows = overflow_ops_.find(overflow_bin_path);
2123 if (found_overflows != overflow_ops_.end()) {
2124 MS_LOG(INFO) << "Found already computed overflows for " << overflow_bin_path;
2125 op_names = overflow_ops_[overflow_bin_path];
2126 } else {
2127 AddOpOverflowOpNames(overflow_bin_path, tensors_path, &op_names);
2128 overflow_ops_[overflow_bin_path] = op_names;
2129 }
2130
2131 // determine if overflow wp has been triggered for the op name with path (from bin file)
2132 if (find(op_names.begin(), op_names.end(), op_name_find_with_path) != op_names.end()) {
2133 MS_LOG(INFO) << "Operation overflow watchpoint triggered for " << node_name_to_find;
2134 return true;
2135 }
2136
2137 // determine if overflow wp has been triggered for the op name (from npy file)
2138 if (find(op_names.begin(), op_names.end(), op_name_find) != op_names.end()) {
2139 MS_LOG(INFO) << "Operation overflow watchpoint triggered for " << node_name_to_find;
2140 return true;
2141 }
2142
2143 return false;
2144 }
2145
RemoveKernelGraphPrefix(std::string node_name_to_find) const2146 std::string DebugServices::RemoveKernelGraphPrefix(std::string node_name_to_find) const {
2147 std::string op_name_to_find = node_name_to_find;
2148 const std::string kernel_prefix = "kernel_graph_";
2149 if (node_name_to_find.rfind(kernel_prefix, 0) == 0) {
2150 auto start_of_op_name = node_name_to_find.find("/", kernel_prefix.length());
2151 if (start_of_op_name != std::string::npos) {
2152 op_name_to_find = node_name_to_find.substr(start_of_op_name + 1);
2153 }
2154 }
2155 return op_name_to_find;
2156 }
2157
GetTaskIdStreamId(std::string file_name,std::string overflow_file_prefix,uint64_t * const task_id,uint64_t * const stream_id) const2158 bool DebugServices::GetTaskIdStreamId(std::string file_name, std::string overflow_file_prefix, uint64_t *const task_id,
2159 uint64_t *const stream_id) const {
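  // Overflow file names follow "<overflow_file_prefix><task_id>.<stream_id>.<timestamp>"; e.g. a hypothetical
  // "Opdebug.Node_OpDebug.12.7.161243" yields task_id = 12 and stream_id = 7.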
2160 size_t task_pos_start = overflow_file_prefix.length();
2161 size_t task_pos_end = file_name.find(".", task_pos_start);
2162 if (task_pos_end == std::string::npos) {
2163 MS_LOG(ERROR) << "Cannot extract task_id from filename: " << file_name;
2164 return false;
2165 }
2166
2167 size_t stream_pos_start = task_pos_end + 1;
2168 size_t stream_pos_end = file_name.find(".", stream_pos_start);
2169 if (stream_pos_end == std::string::npos) {
2170 MS_LOG(ERROR) << "Cannot extract stream_id from filename: " << file_name;
2171 return false;
2172 }
2173
2174 std::string task_id_str = file_name.substr(task_pos_start, task_pos_end - task_pos_start);
2175 std::string stream_id_str = file_name.substr(stream_pos_start, stream_pos_end - stream_pos_start);
2176 if (!CheckStoull(task_id, task_id_str)) {
2177     MS_LOG(INFO) << "Failed to get the task_id from file_name: " << file_name << ", error converting the string "
2178 << task_id_str << " into an integer.";
2179 return false;
2180 }
2181 if (!CheckStoull(stream_id, stream_id_str)) {
2182     MS_LOG(INFO) << "Failed to get the stream_id from file_name: " << file_name << ", error converting the string "
2183 << stream_id_str << " into an integer.";
2184 return false;
2185 }
2186
2187 return true;
2188 }
2189
GetAttrsFromFilename(const std::string & file_name,std::string * const node_name,uint64_t * const task_id,uint64_t * const stream_id) const2190 bool DebugServices::GetAttrsFromFilename(const std::string &file_name, std::string *const node_name,
2191 uint64_t *const task_id, uint64_t *const stream_id) const {
2192 // get the node_name, task_id, and stream_id from dump filename in the following two formats:
2193 // 1. bin file: node_type.node_name.task_id.stream_id.timestamp
2194 // 2. npy file: node_type.node_name.task_id.stream_id.timestamp.output_input.slot.format.npy
2195   // Please note that node_name might contain dots (e.g. for Parameter nodes), so to locate the second dot we need to
2196   // search the file name from right to left.
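  // e.g. hypothetical names "Add.Default--network-add.12.7.161243" (bin) and
  // "Add.Default--network-add.12.7.161243.output.0.NCHW.npy" (npy) both yield
  // node_name = "Default--network-add", task_id = 12, stream_id = 7.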
2197 size_t first_dot = file_name.find(".");
2198 size_t fourth_dot;
2199 if (file_name.rfind(kNpyExt) != std::string::npos) {
2200 // npy format file (converted file or A+M dump file)
2201 size_t pos = file_name.rfind(".");
2202 const int kFourthFromRight = 4;
2203 for (int cnt = 0; cnt < kFourthFromRight; cnt++) {
2204 pos = file_name.rfind(".", pos - 1);
2205 }
2206 fourth_dot = pos;
2207 } else {
2208 // bin format file
2209 fourth_dot = file_name.rfind(".");
2210 }
2211 size_t third_dot = file_name.rfind(".", fourth_dot - 1);
2212 size_t second_dot = file_name.rfind(".", third_dot - 1);
2213 // check if dots were found
2214 if (first_dot == std::string::npos || second_dot == std::string::npos || third_dot == std::string::npos ||
2215 fourth_dot == std::string::npos) {
2216 return false;
2217 }
2218 // get node_name
2219 if (first_dot < second_dot) {
2220 *node_name = file_name.substr(first_dot + 1, second_dot - first_dot - 1);
2221 } else {
2222 MS_LOG(ERROR) << "filename parse error to get node_name.";
2223 return false;
2224 }
2225 // get task id
2226 if (second_dot < third_dot) {
2227 std::string extracted_task_id = file_name.substr(second_dot + 1, third_dot - second_dot - 1);
2228 if (!CheckStoull(task_id, extracted_task_id)) {
2229       MS_LOG(INFO) << "Failed to get the task_id from file_name: " << file_name << ", error converting the string "
2230 << extracted_task_id << " into an integer.";
2231 return false;
2232 }
2233 } else {
2234 MS_LOG(ERROR) << "Filename <" << file_name << "> parse error to get task_id.";
2235 return false;
2236 }
2237 // get stream id
2238 if (third_dot < fourth_dot) {
2239 std::string extracted_stream_id = file_name.substr(third_dot + 1, fourth_dot - third_dot - 1);
2240 if (!CheckStoull(stream_id, extracted_stream_id)) {
2241       MS_LOG(INFO) << "Failed to get the stream_id from file_name: " << file_name << ", error converting the string "
2242 << extracted_stream_id << " into an integer.";
2243 return false;
2244 }
2245 } else {
2246 MS_LOG(ERROR) << "Filename <" << file_name << "> parse error to get stream_id.";
2247 return false;
2248 }
2249
2250 return true;
2251 }
2252
RealPath(const std::string & input_path) const2253 std::string DebugServices::RealPath(const std::string &input_path) const {
2254 if (input_path.length() >= PATH_MAX) {
2255 MS_LOG(EXCEPTION) << "The length of path: " << input_path << " exceeds limit: " << PATH_MAX;
2256 }
2257
2258 size_t path_split_pos = input_path.find_last_of('/');
2259
2260 // get real path
2261 char real_path[PATH_MAX] = {0};
2262
2263 // input_path is dir + file_name
2264 if (path_split_pos != std::string::npos) {
2265 std::string prefix_path = input_path.substr(0, path_split_pos);
2266 std::string file_name = input_path.substr(path_split_pos);
2267
2268 if (file_name.length() > NAME_MAX) {
2269 MS_LOG(EXCEPTION) << "The length of file name : " << file_name.length() << " exceeds limit: " << NAME_MAX;
2270 }
2271 if (realpath(prefix_path.c_str(), real_path) == nullptr) {
2272 MS_LOG(INFO) << "The dir " << prefix_path << " does not exist.";
2273 return "";
2274 }
2275
2276 return std::string(real_path) + file_name;
2277 }
2278
2279 // input_path is only file_name
2280 if (input_path.length() > NAME_MAX) {
2281 MS_LOG(EXCEPTION) << "The length of file name : " << input_path.length() << " exceeds limit: " << NAME_MAX;
2282 }
2283 if (realpath(input_path.c_str(), real_path) == nullptr) {
2284 MS_LOG(INFO) << "The file " << input_path << " does not exist, it will be created.";
2285 }
2286
2287 return std::string(real_path);
2288 }
2289
TensorExistsInCurrent(const std::string & tensor_name)2290 bool DebugServices::TensorExistsInCurrent(const std::string &tensor_name) {
2291 return tensor_loader_->TensorExistsInCurrent(tensor_name);
2292 }
MoveTensorCurrentToPrev(const std::string & tensor_name)2293 void DebugServices::MoveTensorCurrentToPrev(const std::string &tensor_name) {
2294 tensor_loader_->MoveTensorCurrentToPrev(tensor_name);
2295 }
2296
AppendToCacheEvictQueue(const std::string & tensor_name)2297 void DebugServices::AppendToCacheEvictQueue(const std::string &tensor_name) {
2298 if (tensor_loader_->EnableMemoryControl()) {
2299 tensor_loader_->AppendToCacheEvictQueue(tensor_name);
2300 }
2301 }
2302
SetNetName(std::string net_name)2303 void DebugServices::SetNetName(std::string net_name) { this->net_name_ = net_name; }
2304
GetNetName()2305 std::string DebugServices::GetNetName() { return net_name_; }
2306
SetDumpDir(std::string dump_dir)2307 void DebugServices::SetDumpDir(std::string dump_dir) { this->dump_dir_ = dump_dir; }
2308
GetDumpDir()2309 std::string DebugServices::GetDumpDir() { return dump_dir_; }
2310
SetSyncMode(bool is_sync_mode)2311 void DebugServices::SetSyncMode(bool is_sync_mode) { this->is_sync_mode_ = is_sync_mode; }
2312
GetSyncMode() const2313 bool DebugServices::GetSyncMode() const { return is_sync_mode_; }
2314
SetMemLimit(uint64_t max_mem_size)2315 void DebugServices::SetMemLimit(uint64_t max_mem_size) { tensor_loader_->SetMemTotal(max_mem_size); }
2316
2317 } // namespace mindspore
2318