1 /**
2  * Copyright 2019-2022 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "debug/debug_services.h"
17 #include <dirent.h>
18 #include <algorithm>
19 #include <functional>
20 #include <fstream>
21 #include <future>
22 #include <thread>
23 #include <iterator>
24 #include <map>
25 #include <numeric>
26 #include <limits>
27 #include <unordered_set>
28 #include <utility>
29 #include <regex>
30 #include <iomanip>
31 #include "openssl/md5.h"
32 #include "pybind11/stl.h"
33 #ifndef OFFLINE_DBG_MODE
34 #include "include/common/debug/common.h"
35 #include "include/backend/debug/debugger/debugger.h"
36 #include "include/common/debug/anf_dump_utils.h"
37 #include "include/common/utils/anfalgo.h"
38 #endif
39 #include "debug/utils.h"
40 #include "nlohmann/json.hpp"
41 #include "debug/debugger/tensor_summary.h"
42 #include "utils/file_utils.h"
43 #include "include/backend/anf_runtime_algorithm.h"
44 #include "mindspore/core/utils/ms_utils.h"
45 #include "include/backend/debug/data_dump/dump_json_parser.h"
46 
47 namespace mindspore {
48 namespace {
49 static constexpr const char constant_prefix[] = "Default--data-";
50 static constexpr const char kNpyExt[] = ".npy";
51 constexpr float ms_to_s = 1000.0;
52 constexpr int precision = 2;
53 #ifndef OFFLINE_DBG_MODE
54 constexpr int md5_bit_wide = 2;
55 constexpr int md5_len = 32;
56 #endif
57 static constexpr int32_t wp_progress_period = 300;
58 #ifdef __APPLE__
59 constexpr int kStrErrorNone = 0;
60 #else
61 constexpr char *kStrErrorNone = nullptr;
62 #endif
63 }  // namespace
64 
65 bool IsRegFile(const std::string &file_path) {
66   struct stat st;
67   int ret = stat(file_path.c_str(), &st);
68   if (ret != 0) {
69     MS_LOG(ERROR) << "stat error for " << file_path << ", ret is: " << ret;
70     return false;
71   }
72   return S_ISREG(st.st_mode);
73 }
74 
75 #ifndef OFFLINE_DBG_MODE
76 void openssl_md5(char *input, char *output, int64_t len) {
77   unsigned char digest[MD5_DIGEST_LENGTH];
78   MD5(reinterpret_cast<unsigned char *>(input), len, reinterpret_cast<unsigned char *>(digest));
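  // Format each of the 16 digest bytes as two lowercase hex characters, producing the 32-character MD5 string.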
79   for (int i = 0; i < MD5_DIGEST_LENGTH; i++) {
80     int rest_len = md5_len + 1 - i * md5_bit_wide;
81     auto ret =
82       snprintf_s(&output[i * md5_bit_wide], rest_len, md5_bit_wide, "%02x", static_cast<unsigned int>(digest[i]));
83     if (ret < 0) {
84       MS_LOG(ERROR) << "snprintf_s encountered an error when recording md5, which may lead to an incorrect MD5 value in the "
85                        "statistic.csv file.";
86     } else if (ret >= rest_len) {
87       MS_LOG(ERROR) << "snprintf_s output is truncated when recording md5, which may lead to an incorrect MD5 value in the "
88                        "statistic.csv file.";
89     }
90   }
91 }
92 #endif
93 
94 DebugServices::DebugServices() { tensor_loader_ = std::make_shared<TensorLoader>(); }
95 
96 DebugServices::DebugServices(const DebugServices &other) {
97   wp_id_cache_ = other.wp_id_cache_;
98   net_name_ = other.net_name_;
99   dump_dir_ = other.dump_dir_;
100   is_sync_mode_ = other.is_sync_mode_;
101   tensor_loader_ = other.tensor_loader_;
102   watchpoint_table_ = other.watchpoint_table_;
103 }
104 
105 DebugServices &DebugServices::operator=(const DebugServices &other) {
106   if (this != &other) {
107     tensor_loader_ = other.tensor_loader_;
108     watchpoint_table_ = other.watchpoint_table_;
109   }
110   return *this;
111 }
112 
113 /*
114  * Feature group: Online debugger, Offline debugger.
115  * Target device group: Ascend, GPU.
116  * Runtime category: Old runtime, MindRT.
117  * Description: Creates a watchpoint_t object, sets the watchpoint's variables, and adds the watchpoint to the
118  * watchpoint_table.
119  */
120 void DebugServices::AddWatchpoint(
121   int id, int watch_condition, float parameter, const std::vector<std::tuple<std::string, bool>> &check_node_list,
122   const std::vector<parameter_t> &parameter_list,
123   const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_device_list,
124   const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_graph_list) {
125   std::lock_guard<std::mutex> lg(lock_);
126 
127   watchpoint_t watchpoint_item;
128   if (id < 0) {
129     MS_LOG(EXCEPTION) << "The watchpoint id should be an integer greater than or equal to 0, but got " << id;
130   }
131   watchpoint_item.id = static_cast<unsigned int>(id);
132   watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition);
133   watchpoint_item.condition.parameter = parameter;
134   watchpoint_item.check_node_list = check_node_list;
135   // For offline debugger check_node_device_list is not nullptr.
136   if (check_node_device_list != nullptr) {
137     watchpoint_item.check_node_device_list = *check_node_device_list;
138   }
139   // For offline debugger check_node_graph_list is not nullptr.
140   if (check_node_graph_list != nullptr) {
141     watchpoint_item.check_node_graph_list = *check_node_graph_list;
142   }
143   watchpoint_item.parameter_list = parameter_list;
144   watchpoint_table_[id] = watchpoint_item;
145 }
146 
147 void DebugServices::RemoveWatchpoint(unsigned int id) {
148   std::lock_guard<std::mutex> lg(lock_);
149   (void)watchpoint_table_.erase(id);
150 }
151 
152 /*
153  * Feature group: Online debugger, Offline debugger.
154  * Target device group: Ascend, GPU.
155  * Runtime category: Old runtime, MindRT.
156  * Description: Returns a tensor summary unique pointer based on the given tensor_dtype, or nullptr if the type is
157  * not supported.
158  */
159 std::unique_ptr<ITensorSummary> GetSummaryPtr(const std::shared_ptr<TensorData> &tensor,
160                                               const void *const previous_tensor_ptr, uint64_t num_elements,
161                                               uint64_t prev_num_elements, int tensor_dtype) {
162   MS_EXCEPTION_IF_NULL(tensor);
163   switch (tensor_dtype) {
164     case DbgDataType::DT_UINT8: {
165       return std::make_unique<TensorSummary<uint8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
166                                                       prev_num_elements);
167     }
168     case DbgDataType::DT_INT8: {
169       return std::make_unique<TensorSummary<int8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
170                                                      prev_num_elements);
171     }
172     case DbgDataType::DT_UINT16: {
173       return std::make_unique<TensorSummary<uint16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
174                                                        prev_num_elements);
175     }
176     case DbgDataType::DT_INT16: {
177       return std::make_unique<TensorSummary<int16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
178                                                       prev_num_elements);
179     }
180     case DbgDataType::DT_UINT32: {
181       return std::make_unique<TensorSummary<uint32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
182                                                        prev_num_elements);
183     }
184     case DbgDataType::DT_INT32:
185     case DbgDataType::DT_BASE_INT: {
186       return std::make_unique<TensorSummary<int32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
187                                                       prev_num_elements);
188     }
189     case DbgDataType::DT_UINT64: {
190       return std::make_unique<TensorSummary<uint64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
191                                                        prev_num_elements);
192     }
193     case DbgDataType::DT_INT64: {
194       return std::make_unique<TensorSummary<int64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
195                                                       prev_num_elements);
196     }
197     case DbgDataType::DT_FLOAT16: {
198       return std::make_unique<TensorSummary<float16>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
199                                                       prev_num_elements);
200     }
201     case DbgDataType::DT_BFLOAT16: {
202       return std::make_unique<TensorSummary<bfloat16>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
203                                                        prev_num_elements);
204     }
205     case DbgDataType::DT_FLOAT32:
206     case DbgDataType::DT_BASE_FLOAT: {
207       return std::make_unique<TensorSummary<float>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
208                                                     prev_num_elements);
209     }
210     case DbgDataType::DT_FLOAT64: {
211       return std::make_unique<TensorSummary<double>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
212                                                      prev_num_elements);
213     }
214     case DbgDataType::DT_BOOL: {
215       return std::make_unique<TensorSummary<bool>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
216                                                    prev_num_elements);
217     }
218     default:
219       MS_LOG(INFO) << "Unsupported tensor type";
220       // return a null pointer
221       return std::unique_ptr<TensorSummary<int32_t>>{};
222   }
223 }
224 
225 /*
226  * Feature group: Online debugger, Offline debugger.
227  * Target device group: Ascend, GPU.
228  * Runtime category: Old runtime, MindRT.
229  * Description: Returns TensorStat for the given tensor based on the base_summary_ptr.
230  */
231 DebugServices::TensorStat DebugServices::GetTensorStatistics(const std::shared_ptr<TensorData> &tensor) {
232   if (tensor == nullptr) {
233     MS_LOG(WARNING) << "Tensor is nullptr, returning empty tensor statistics.";
234     TensorStat empty_tensor_stat_data;
235     return empty_tensor_stat_data;
236   }
237   std::unique_ptr<ITensorSummary> base_summary_ptr;
238   void *previous_tensor_ptr = nullptr;
239   base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, tensor->GetNumElements(), 0, tensor->GetType());
240   if (base_summary_ptr == nullptr) {
241     MS_LOG(WARNING) << "base_summary_ptr is nullptr, returning empty tensor statistics.";
242     TensorStat empty_tensor_stat_data;
243     return empty_tensor_stat_data;
244   }
245   std::string md5 = "";
246   MSLogTime msTime;
247 #ifndef OFFLINE_DBG_MODE
248   auto statistic_category = DumpJsonParser::GetInstance().statistic_category();
249   if (std::find(statistic_category.begin(), statistic_category.end(), "md5") != statistic_category.end()) {
250     msTime.Start();
251     char md5str[33];
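    // 32 hex characters plus a terminating null character.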
252     auto ret = memset_s(md5str, sizeof(md5str), '\0', sizeof(md5str));
253     if (ret != EOK) {
254       MS_LOG(ERROR) << "Failed to call memset_s, skip record MD5.";
255     } else {
256       openssl_md5(const_cast<char *>(tensor->GetDataPtr()), md5str, tensor->GetByteSize());
257       md5 = std::string(md5str);
258     }
259     msTime.End();
260     MS_LOG(DEBUG) << "Calc md5 costs time : " << msTime.GetRunTimeUS() << " microseconds.";
261   }
262 #endif
263   msTime.Start();
264   base_summary_ptr->TensorStatistics(tensor->GetType());
265   msTime.End();
266   MS_LOG(DEBUG) << "Calc statistic costs time : " << msTime.GetRunTimeUS() << " microseconds.";
267   TensorStat tensor_stat_data(
268     tensor->GetByteSize(), tensor->GetType(), tensor->GetShape(), base_summary_ptr->is_bool(),
269     base_summary_ptr->max_value(), base_summary_ptr->min_value(), base_summary_ptr->avg_value(),
270     base_summary_ptr->count(), base_summary_ptr->neg_zero_count(), base_summary_ptr->pos_zero_count(),
271     base_summary_ptr->nan_count(), base_summary_ptr->neg_inf_count(), base_summary_ptr->pos_inf_count(),
272     base_summary_ptr->zero_count(), base_summary_ptr->l2_value(), md5);
273 
274   return tensor_stat_data;
275 }
276 
277 #ifdef OFFLINE_DBG_MODE
278 /*
279  * Feature group: Offline debugger.
280  * Target device group: Ascend, GPU.
281  * Runtime category: Old runtime, MindRT.
282  * Description: Returns previous_tensor_ptr if the graph history file is found and the current iteration is not the first
283  * run iteration for the tensor's graph.
284  */
285 const void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed,
286                                          uint64_t *prev_num_elements, bool *history_not_found) {
287   MS_EXCEPTION_IF_NULL(tensor);
288   const void *previous_tensor_ptr = nullptr;
289   std::shared_ptr<TensorData> tensor_prev;
290   std::tuple<uint32_t, uint32_t> rank_and_graph = std::make_tuple(tensor->GetDeviceId(), tensor->GetRootGraphId());
291   if (graphs_run_history_.find(rank_and_graph) == graphs_run_history_.end()) {
292     *history_not_found = true;
293     MS_LOG(DEBUG) << "Graph run history is not available for graph: " << tensor->GetRootGraphId();
294   } else if (previous_iter_tensor_needed && GetPrevIteration(tensor) != UINT32_MAX) {
295     // when prev_tensor is not available, the prev iteration is set to UINT32_MAX
296     // read data in offline mode
297     NPYFilePool file_paths;
298     ProcessedNPYFiles processed_npy_files;
299     if (!is_sync_mode_) {
300       ConvertReadTensors(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
301                          std::vector<unsigned int>{tensor->GetDeviceId()},
302                          std::vector<unsigned int>{tensor->GetPrevIteration()},
303                          std::vector<unsigned int>{tensor->GetRootGraphId()}, &file_paths);
304       processed_npy_files = ProcessNPYFilePool(file_paths);
305     }
306     std::vector<std::shared_ptr<TensorData>> result_list_prev;
307     ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
308                      std::vector<unsigned int>{tensor->GetDeviceId()},
309                      std::vector<unsigned int>{tensor->GetPrevIteration()},
310                      std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
311                      &processed_npy_files, &result_list_prev, false);
312     tensor_prev = result_list_prev[0];
313     if (tensor_prev->GetByteSize() == 0) {
314       tensor_prev.reset();
315     } else {
316       previous_tensor_ptr = tensor_prev->GetDataPtr();
317       *prev_num_elements = tensor_prev->GetNumElements();
318     }
319   }
320   return previous_tensor_ptr;
321 }
322 #endif
323 
324 /*
325  * Feature group: Offline debugger, Online debugger.
326  * Target device group: Ascend, GPU.
327  * Runtime category: Old runtime, MindRT.
328  * Description: Goes through all the watchpoints in the watchpoint table. If the current tensor is in the list of
329  * check_nodes, that watchpoint is added to the vector of watchpoints_to_check (the watchpoints that should be
330  * checked for the current tensor).
331  */
332 void DebugServices::AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck,
333                                           const std::shared_ptr<TensorData> &tensor, bool *previous_iter_tensor_needed,
334                                           std::string *const qualified_tensor_name,
335                                           std::vector<watchpoint_t> *const watchpoints_to_check) {
336   if (tensor == nullptr) {
337     MS_LOG(DEBUG) << "tensor is nullptr.";
338     return;
339   }
340   const auto tensor_name = tensor->GetName();
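  // Tensor names carry a ":<slot>" suffix; strip it before matching against watchpoint check nodes.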
341   const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
342   const auto tensor_device_id = tensor->GetDeviceId();
343   const auto tensor_root_graph_id = tensor->GetRootGraphId();
344   for (auto w_table_item : watchpoint_table_) {
345     auto wp = std::get<1>(w_table_item);
346     // check ONLY init conditions on initial suspended state.
347     // skip other conditions on initial suspended state
348     if (init_dbg_suspend && (wp.condition.type != INIT)) {
349       continue;
350     }
351     // skip init condition if not init suspend
352     if ((wp.condition.type == INIT) && !init_dbg_suspend) {
353       continue;
354     }
355     // check change conditions only on step end.
356     if (wp.change_condition() && !step_end) {
357       continue;
358     }
359     // if recheck, ignore the cache results and reanalyze everything.
360     // if not a recheck, check only unanalyzed tensors
361     if (!recheck) {
362       std::lock_guard<std::mutex> lg(wp_lock_);
363       bool wp_cache_hit = wp_id_cache_[tensor_name].count(wp.id);
364       if (wp_cache_hit) {
365         continue;
366       }
367     }
368     std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot, tensor_device_id, tensor_root_graph_id);
369     if (!found.empty()) {
370       *qualified_tensor_name = found;
371       watchpoints_to_check->push_back(w_table_item.second);
372 #ifdef OFFLINE_DBG_MODE
373       if (wp.change_condition()) {
374         *previous_iter_tensor_needed = true;
375       }
376 #endif
377     }
378   }
379 }
380 
381 void DebugServices::AddAnalyzedTensorToCache(const bool recheck, const unsigned int id,
382                                              const std::string &tensor_name) {
383   // add analyzed tensor to cache
384   if (!recheck) {
385     std::lock_guard<std::mutex> lg(wp_lock_);
386     (void)wp_id_cache_[tensor_name].insert(id);
387   }
388 }
389 
390 void DebugServices::SetCheckWatchpointsResult(const int chunk_id, ChunkData *chunk_data,
391                                               std::vector<unsigned int> *const device_id,
392                                               std::vector<unsigned int> *const root_graph_id, const int exec_order,
393                                               const std::string time_stamp, const std::string &qualified_tensor_name,
394                                               const std::string &tensor_slot, const watchpoint_t &wp,
395                                               const unsigned int device_id_val, const unsigned int root_graph_id_val,
396                                               const std::vector<parameter_t> &parameter_list,
397                                               const int32_t error_code) const {
398   (void)(chunk_data->chunk_exec_orders)[chunk_id].emplace_back(exec_order);
399   (void)(chunk_data->chunk_names)[chunk_id].emplace_back(qualified_tensor_name);
400   (void)(chunk_data->chunk_slots)[chunk_id].emplace_back(tensor_slot);
401   (void)(chunk_data->chunk_conditions)[chunk_id].emplace_back(wp.condition.type);
402   (void)(chunk_data->chunk_watchpoint_id)[chunk_id].emplace_back(wp.id);
403   if (device_id != nullptr) {
404     (void)(chunk_data->chunk_device_id)[chunk_id].emplace_back(device_id_val);
405   }
406   if (root_graph_id != nullptr) {
407     (void)(chunk_data->chunk_root_graph_id)[chunk_id].emplace_back(root_graph_id_val);
408   }
409   (void)(chunk_data->chunk_parameters)[chunk_id].emplace_back(parameter_list);
410   (void)(chunk_data->chunk_error_codes)[chunk_id].emplace_back(error_code);
411   (void)(chunk_data->chunk_time_stamp)[chunk_id].emplace_back(time_stamp);
412 }
413 
414 #ifdef OFFLINE_DBG_MODE
415 /*
416  * Feature group: Offline debugger.
417  * Target device group: Ascend, GPU.
418  * Runtime category: Old runtime, MindRT.
419  * Description: Sets and checks the OUT_OF_MEMORY error_code (for the memory limit feature) and NO_VALUE error_code (for
420  * the new python API feature), then sets the check-watchpoint results.
421  */
422 void DebugServices::CheckOutofMemoryandNoValue(const bool no_mem_to_read, const bool error_on_no_value,
423                                                const std::vector<watchpoint_t> watchpoints_to_check, int chunk_id,
424                                                ChunkData *chunk_data, std::vector<unsigned int> *const device_id,
425                                                std::vector<unsigned int> *const root_graph_id, const int exec_order,
426                                                const std::string time_stamp, const std::string &qualified_tensor_name,
427                                                const std::string &tensor_slot, const unsigned int device_id_val,
428                                                const unsigned int root_graph_id_val,
429                                                const std::vector<parameter_t> &parameter_list) const {
430   bool set_is_needed = no_mem_to_read || error_on_no_value;
431   int32_t error_code_to_set = 0;
432   if (no_mem_to_read) {
433     // bit 3 denotes that loading the tensor failed because it is oversized and there is not enough memory to fit it in
434     error_code_to_set = ITensorSummary::OUT_OF_MEMORY;
435   } else if (error_on_no_value) {
436     error_code_to_set = ITensorSummary::NO_VALUE;
437   }
438   if (set_is_needed) {
439     for (auto &wp : watchpoints_to_check) {
440       SetCheckWatchpointsResult(chunk_id, chunk_data, device_id, root_graph_id, exec_order, time_stamp,
441                                 qualified_tensor_name, tensor_slot, wp, device_id_val, root_graph_id_val,
442                                 parameter_list, error_code_to_set);
443     }
444   }
445 }
446 
447 /*
448  * Feature group: Offline debugger.
449  * Target device group: Ascend, GPU.
450  * Runtime category: Old runtime, MindRT.
451  * Description: After the watchpoint check finishes, sets the tensor to not-in-use status (for the memory control
452  * feature) by pushing it to the eviction candidate queue, so it can be evicted from memory whenever the memory is
453  * required for checking other nodes. If previous_tensor exists, its status is changed together with the tensor's.
454  */
455 void DebugServices::SetTensorToNotInUse(const std::shared_ptr<TensorData> &tensor, const void *previous_tensor_ptr) {
456   // set the tensor into not-in-use status in tensor_loader.
457   auto tensor_name = tensor->GetName();
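  // Cache key layout: <tensor name>:<device id>:<root graph id>:<is output>:<slot>.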
458   std::string key_name_in_cache = tensor_name + ":" + std::to_string(tensor->GetDeviceId()) + ":" +
459                                   std::to_string(tensor->GetRootGraphId()) + ":" +
460                                   std::to_string(tensor->GetIsOutput()) + ":" + std::to_string(tensor->GetSlot());
461   AppendToCacheEvictQueue(key_name_in_cache);
462   if (previous_tensor_ptr != nullptr) {
463     AppendToCacheEvictQueue(key_name_in_cache + ":prev");
464   }
465 }
466 #endif
467 
468 #ifndef OFFLINE_DBG_MODE
469 /*
470  * Feature group: Online debugger.
471  * Target device group: Ascend, GPU.
472  * Runtime category: Old runtime, MindRT.
473  * Description: Compares the current root graph id with the given graph id and returns false if they are not equal
474  * for GPU mindRT and Ascend. Otherwise, it returns true. The objectives of this function are: 1) Check if tensor's
475  * root_graph_id is different from current_root_graph_id and skip checkwatchpoint for the tensor if these values are
476  * different. 2) Set prev_tensor_ptr to nullptr if current_root_graph_id is different from prev_root_graph_id. 3) Skip
477  * reading tensor if tensor's root_graph_id is different from current_root_graph_id.
478  */
479 bool DebugServices::CompareCurrentRootGraph(uint32_t id) const {
480   auto debugger = Debugger::GetInstance();
481   MS_EXCEPTION_IF_NULL(debugger);
482   auto ms_context = MsContext::GetInstance();
483   MS_EXCEPTION_IF_NULL(ms_context);
484   std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
485   auto cur_root_graph_id = debugger->GetCurrentRootGraphId();
486   if ((device_target == kGPUDevice && MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) ||
487       device_target == kAscendDevice) {
488     if (cur_root_graph_id != id) {
489       return false;
490     }
491   }
492   return true;
493 }
494 
495 /*
496  * Feature group: Online debugger.
497  * Target device group: Ascend, GPU.
498  * Runtime category: Old runtime, MindRT.
499  * Description: Returns the previous tensor pointer if the current root graph id is equal to previous root graph id and
500  * prev_tensor_data is not nullptr.
501  */
502 const void *DebugServices::PreparePrevTensor(uint64_t *prev_num_elements, const std::string &tensor_name) {
503   std::shared_ptr<TensorData> prev_tensor_data;
504   auto debugger = Debugger::GetInstance();
505   MS_EXCEPTION_IF_NULL(debugger);
506   if (!CompareCurrentRootGraph(debugger->GetPrevRootGraphId())) {
507     // not supporting watchpoints that need prev tensor for multi root graph networks.
508     MS_LOG(DEBUG) << "Previous root graph is different from current root graph, setting prev_tensor to nullptr.";
509     prev_tensor_data = nullptr;
510   } else {
511     prev_tensor_data = tensor_loader_->GetPrevTensor(tensor_name);
512   }
513   if (prev_tensor_data) {
514     *prev_num_elements = prev_tensor_data->GetNumElements();
515     return prev_tensor_data->GetDataPtr();
516   }
517   return nullptr;
518 }
519 #endif
520 
521 void DebugServices::CheckHistoryErrorCode(int *error_code, bool history_not_found) const {
522   // check history error_code only for offline debugger
523   if (history_not_found) {
524     *error_code = ITensorSummary::HISTORY_NOT_FOUND;  // error code for history not found
525   }
526 }
527 
528 /*
529  * Feature group: Offline debugger, Online debugger.
530  * Target device group: Ascend, GPU.
531  * Runtime category: Old runtime, MindRT.
532  * Description: For all the tensors in the given chunk, reads the tensors, checks all the watchpoints and sets the
533  * watchpoint hit result. The check-watchpoint process might be affected by the memory limit, by whether the tensor was
534  * read successfully, and by whether we have a multi-root-graph scenario. All of the aforementioned checks are done in this function.
535  */
536 void DebugServices::CheckWatchpointsForTensor(ChunkData *chunk_data, ProcessedNPYFiles *const processed_npy_files,
537                                               std::vector<std::shared_ptr<TensorData>> *const tensor_list, int begin,
538                                               int end, int chunk_id, const bool init_dbg_suspend, const bool step_end,
539                                               const bool recheck, std::vector<unsigned int> *const device_id,
540                                               std::vector<unsigned int> *const root_graph_id, bool error_on_no_value) {
541   int list_size = tensor_list->size();
542   if (end > list_size) {
543     end = list_size;
544   }
545   for (int i = begin; i < end; i++) {
546     auto &tensor = (*tensor_list)[i];
547     const auto tensor_name = tensor->GetName();
548     const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
549     const auto tensor_slot = std::to_string(tensor->GetSlot());
550     std::vector<watchpoint_t> watchpoints_to_check;
551     std::string qualified_tensor_name;
552     bool previous_iter_tensor_needed = false;
553     AddWatchPointsToCheck(init_dbg_suspend, step_end, recheck, tensor, &previous_iter_tensor_needed,
554                           &qualified_tensor_name, &watchpoints_to_check);
555     // no wp set on current tensor
556     if (watchpoints_to_check.empty()) {
557       continue;
558     }
559 #ifdef OFFLINE_DBG_MODE
560     // read data in offline mode
561     bool no_mem_to_read = false;
562     std::vector<std::shared_ptr<TensorData>> result_list;
563     ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
564                      std::vector<unsigned int>{tensor->GetDeviceId()},
565                      std::vector<unsigned int>{tensor->GetIteration()},
566                      std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
567                      processed_npy_files, &result_list, false, &no_mem_to_read);
568     tensor = result_list[0];
569     if (tensor->GetByteSize() == 0) {
570       CheckOutofMemoryandNoValue(no_mem_to_read, error_on_no_value, watchpoints_to_check, chunk_id, chunk_data,
571                                  device_id, root_graph_id, tensor->GetExecutionOrder(), tensor->GetTimeStamp(),
572                                  qualified_tensor_name, tensor_slot, tensor->GetDeviceId(), tensor->GetRootGraphId(),
573                                  std::vector<parameter_t>());
574       tensor.reset();
575       continue;
576     }
577 #endif
578     // no elements to analyze
579     if (tensor->GetByteSize() == 0) {
580       continue;
581     }
582     (chunk_data->chunk_tensor_byte_size)[chunk_id] += tensor->GetByteSize();
583     int tensor_dtype = tensor->GetType();
584     uint64_t num_elements = tensor->GetNumElements();
585     uint64_t prev_num_elements = 0;
586     const void *previous_tensor_ptr = nullptr;
587 #ifdef OFFLINE_DBG_MODE
588     bool history_not_found = false;
589     previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed, &prev_num_elements, &history_not_found);
590 #else
591     if (!CompareCurrentRootGraph(tensor->GetRootGraphId())) {
592       MS_LOG(DEBUG)
593         << "Current root_graph_id is different from tensor's root_graph_id, skipping checkwatchpoints for tensor: "
594         << tensor->GetName();
595       continue;
596     }
597     previous_tensor_ptr = PreparePrevTensor(&prev_num_elements, tensor_name);
598 #endif
599     std::unique_ptr<ITensorSummary> base_summary_ptr;
600     if (!(watchpoints_to_check.size() == 1 && watchpoints_to_check[0].condition.type == IS_OVERFLOW)) {
601       base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, num_elements, prev_num_elements, tensor_dtype);
602       if (base_summary_ptr != nullptr) {
603         base_summary_ptr->SummarizeTensor(watchpoints_to_check);
604       }
605     }
606     for (auto &wp : watchpoints_to_check) {
607       bool is_hit = false;
608       int error_code = 0;
609       std::vector<parameter_t> parameter_list = {};
610       if (wp.condition.type == IS_OVERFLOW) {
611         is_hit =
612           CheckOpOverflow(tensor_name_no_slot, tensor->GetDeviceId(), tensor->GetRootGraphId(), tensor->GetIteration());
613       } else if (base_summary_ptr != nullptr) {
614         auto item = base_summary_ptr->IsWatchpointHit(wp);
615         is_hit = std::get<ITensorSummary::eHitPos>(item);
616         error_code = std::get<ITensorSummary::eErrorCodePos>(item);
617 #ifdef OFFLINE_DBG_MODE
618         CheckHistoryErrorCode(&error_code, history_not_found);
619 #endif
620         parameter_list = std::get<ITensorSummary::eParamListPos>(item);
621       }
622       AddAnalyzedTensorToCache(recheck, wp.id, tensor_name);
623       if (is_hit || error_code != 0) {
624         SetCheckWatchpointsResult(chunk_id, chunk_data, device_id, root_graph_id, tensor->GetExecutionOrder(),
625                                   tensor->GetTimeStamp(), qualified_tensor_name, tensor_slot, wp, tensor->GetDeviceId(),
626                                   tensor->GetRootGraphId(), parameter_list, error_code);
627       }
628     }
629 #ifdef OFFLINE_DBG_MODE
630     SetTensorToNotInUse(tensor, previous_tensor_ptr);
631     // in offline mode remove the need for the data
632     tensor.reset();
633 #endif
634     (void)tensor_processed_count_.fetch_add(1, std::memory_order_relaxed);
635   }
636 }
637 
638 /*
639  * Feature group: Offline debugger, Online debugger.
640  * Target device group: Ascend, GPU.
641  * Runtime category: Old runtime, MindRT.
642  * Description: This function checks the watchpoints for the given tensor list by dividing the tensor list into chunks.
643  * Each chunk is handled by a separate thread, and then the check-watchpoint results from each thread are gathered and
644  * sorted. In the end, the time taken for checking the watchpoints in the current step is reported.
645  */
646 void DebugServices::CheckWatchpoints(std::vector<std::string> *const name, std::vector<std::string> *const slot,
647                                      std::vector<int> *const condition, std::vector<unsigned int> *const watchpoint_id,
648                                      std::vector<std::vector<parameter_t>> *const parameters,
649                                      std::vector<int32_t> *const error_codes,
650                                      ProcessedNPYFiles *const processed_npy_files,
651                                      std::vector<std::shared_ptr<TensorData>> *const tensor_list,
652                                      const bool init_dbg_suspend, const bool step_end, const bool recheck,
653                                      std::vector<unsigned int> *const device_id,
654                                      std::vector<unsigned int> *const root_graph_id, bool error_on_no_value) {
655   std::lock_guard<std::mutex> lg(lock_);
656   auto t1 = std::chrono::high_resolution_clock::now();
657   if (watchpoint_table_.empty()) {
658     return;
659   }
660   // vector to store execution order of tensors hit
661   std::vector<int> exec_order;
662   std::vector<std::string> time_stamps;
663   size_t tensor_list_size = tensor_list->size();
664   uint64_t tensor_list_byte_size = 0;
665   MS_LOG(INFO) << "tensor list size: " << tensor_list_size;
666   if (tensor_list_size == 0) {
667     return;
668   }
669   if (IS_OUTPUT_ON(mindspore::kInfo)) {
670     wp_progress_enabled_ = true;
671     wp_progress_thread_ =
672       std::make_unique<std::thread>([this, tensor_list_size]() { CheckWatchpointProgress(tensor_list_size); });
673   }
674   const size_t thread_num_with_mem = 16;
675   const size_t thread_num_without_mem = 32;
676   // default value for number of threads
677   const size_t default_thread_num =
678     tensor_loader_->EnableMemoryControl() ? thread_num_with_mem : thread_num_without_mem;
679   size_t max_thread_num = default_thread_num;
680   if (max_thread_num > tensor_list_size) {
681     max_thread_num = tensor_list_size;
682   }
683   MS_LOG(INFO) << "Number of threads used for checkwatchpoint: " << max_thread_num;
684   size_t chunk_size = tensor_list_size / max_thread_num;
685   size_t remainder = tensor_list_size % max_thread_num;
686   ChunkData chunk_data;
687   chunk_data.chunk_exec_orders.resize(max_thread_num);
688   chunk_data.chunk_names.resize(max_thread_num);
689   chunk_data.chunk_slots.resize(max_thread_num);
690   chunk_data.chunk_conditions.resize(max_thread_num);
691   chunk_data.chunk_watchpoint_id.resize(max_thread_num);
692   chunk_data.chunk_parameters.resize(max_thread_num);
693   chunk_data.chunk_error_codes.resize(max_thread_num);
694   chunk_data.chunk_device_id.resize(max_thread_num);
695   chunk_data.chunk_root_graph_id.resize(max_thread_num);
696   chunk_data.chunk_tensor_byte_size.resize(max_thread_num);
697   std::fill(chunk_data.chunk_tensor_byte_size.begin(), chunk_data.chunk_tensor_byte_size.end(), 0);
698   chunk_data.chunk_time_stamp.resize(max_thread_num);
699 
700   std::vector<std::future<void>> tensor_future_vec;
701   size_t begin = 0;
702   size_t end = begin;
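  // Split the tensor list into max_thread_num contiguous chunks; the first 'remainder' chunks take one extra tensor so every tensor is covered.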
703   for (size_t i = 0; i < max_thread_num; i++) {
704     end += chunk_size;
705     if (remainder > 0) {
706       end++;
707       remainder--;
708     }
709     (void)tensor_future_vec.emplace_back(std::async(
710       std::launch::async, &DebugServices::CheckWatchpointsForTensor, this, &chunk_data, processed_npy_files,
711       tensor_list, begin, end, i, init_dbg_suspend, step_end, recheck, device_id, root_graph_id, error_on_no_value));
712     begin = end;
713   }
714 
715   SortWatchpointsInfo(&tensor_future_vec, &exec_order, &time_stamps, &tensor_list_byte_size, name, slot,
716 
717                       condition, watchpoint_id, parameters, error_codes, &chunk_data, device_id, root_graph_id);
718 
719   auto t2 = std::chrono::high_resolution_clock::now();
720   std::chrono::duration<double, std::milli> ms_double = t2 - t1;
721   MS_LOG(INFO) << "tensor_list byte size is " << tensor_list_byte_size / pow(10.0, 6.0) << " MB";
722   MS_LOG(INFO) << "CheckWatchpoints Took: " << std::fixed << std::setprecision(precision)
723                << (ms_double.count()) / ms_to_s << "s";
724   if (IS_OUTPUT_ON(mindspore::kInfo) && wp_progress_thread_ && wp_progress_thread_->joinable()) {
725     wp_progress_enabled_ = false;
726     wp_progress_thread_->join();
727     MS_LOG(INFO) << "Join wp_progress_thread_.";
728   }
729 }
730 
731 void DebugServices::CheckWatchpointProgress(size_t tensor_list_size) {
732   while (wp_progress_enabled_ && (tensor_processed_count_ != tensor_list_size)) {
733     MS_LOG(INFO) << "CheckWatchpoint progress: " << tensor_processed_count_ << " tensor processed out of "
734                  << tensor_list_size;
735     std::this_thread::sleep_for(std::chrono::milliseconds(wp_progress_period));
736   }
737 }
738 
739 /*
740  * Feature group: Offline debugger, Online debugger.
741  * Target device group: Ascend, GPU.
742  * Runtime category: Old runtime, MindRT.
743  * Description: Sorts the watchpoint hit results for the online and offline debugger. For the online debugger the
744  * sorting is based on the execution order; for the offline debugger it is based on the time stamp.
745  */
746 void DebugServices::SortWatchpointsInfo(std::vector<std::future<void>> *const tensor_future_vec,
747                                         std::vector<int> *const exec_order, std::vector<std::string> *const time_stamps,
748                                         uint64_t *const tensor_list_byte_size, std::vector<std::string> *const name,
749                                         std::vector<std::string> *const slot, std::vector<int> *const condition,
750                                         std::vector<unsigned int> *const watchpoint_id,
751                                         std::vector<std::vector<parameter_t>> *const parameters,
752                                         std::vector<int32_t> *const error_codes, ChunkData *chunk_data,
753                                         std::vector<unsigned int> *const device_id,
754                                         std::vector<unsigned int> *const root_graph_id) const {
755   for (unsigned int i = 0; i < (*tensor_future_vec).size(); i++) {
756     (*tensor_future_vec)[i].wait();
757     (*tensor_future_vec)[i].get();
758     for (unsigned int j = 0; j < (chunk_data->chunk_exec_orders)[i].size(); j++) {
759 #ifndef OFFLINE_DBG_MODE
760       // if the execution order is repeated, insert the new one before the others with the same execution order.
761       std::vector<int>::iterator iter =
762         std::lower_bound(exec_order->begin(), exec_order->end(), (chunk_data->chunk_exec_orders)[i][j]);
763       int position = iter - exec_order->begin();
764       (void)exec_order->emplace(iter, (chunk_data->chunk_exec_orders)[i][j]);
765 #endif
766 #ifdef OFFLINE_DBG_MODE
767       std::vector<std::string>::iterator iter =
768         std::lower_bound(time_stamps->begin(), time_stamps->end(), (chunk_data->chunk_time_stamp)[i][j]);
769       int position = iter - time_stamps->begin();
770       (void)time_stamps->emplace(iter, (chunk_data->chunk_time_stamp)[i][j]);
771 #endif
772       (void)name->emplace(name->begin() + position, (chunk_data->chunk_names)[i][j]);
773       (void)slot->emplace(slot->begin() + position, (chunk_data->chunk_slots)[i][j]);
774       (void)condition->emplace(condition->begin() + position, (chunk_data->chunk_conditions)[i][j]);
775       (void)watchpoint_id->emplace(watchpoint_id->begin() + position, (chunk_data->chunk_watchpoint_id)[i][j]);
776       if (device_id != nullptr) {
777         (void)device_id->emplace(device_id->begin() + position, (chunk_data->chunk_device_id)[i][j]);
778       }
779       if (root_graph_id != nullptr) {
780         (void)root_graph_id->emplace(root_graph_id->begin() + position, (chunk_data->chunk_root_graph_id)[i][j]);
781       }
782       (void)parameters->emplace(parameters->begin() + position, (chunk_data->chunk_parameters)[i][j]);
783       (void)error_codes->emplace(error_codes->begin() + position, (chunk_data->chunk_error_codes)[i][j]);
784     }
785     // free the memory for used vectors
786     std::vector<int>().swap((chunk_data->chunk_exec_orders)[i]);
787     std::vector<std::string>().swap((chunk_data->chunk_time_stamp)[i]);
788     std::vector<std::string>().swap((chunk_data->chunk_names)[i]);
789     std::vector<std::string>().swap((chunk_data->chunk_slots)[i]);
790     std::vector<int>().swap((chunk_data->chunk_conditions)[i]);
791     std::vector<unsigned int>().swap((chunk_data->chunk_watchpoint_id)[i]);
792     std::vector<std::vector<parameter_t>>().swap((chunk_data->chunk_parameters)[i]);
793     std::vector<int32_t>().swap((chunk_data->chunk_error_codes)[i]);
794     std::vector<unsigned int>().swap((chunk_data->chunk_device_id)[i]);
795     std::vector<unsigned int>().swap((chunk_data->chunk_root_graph_id)[i]);
796     if ((*tensor_list_byte_size) > UINT64_MAX - (chunk_data->chunk_tensor_byte_size)[i]) {
797       MS_LOG(WARNING) << (*tensor_list_byte_size) << " + " << (chunk_data->chunk_tensor_byte_size)[i]
798                       << " would lead to integer overflow!";
799       (*tensor_list_byte_size) = UINT64_MAX;
800     } else {
801       (*tensor_list_byte_size) += (chunk_data->chunk_tensor_byte_size)[i];
802     }
803   }
804 }
805 
806 #ifdef OFFLINE_DBG_MODE
807 /*
808  * Feature group: Offline debugger.
809  * Target device group: Ascend, GPU.
810  * Runtime category: Old runtime, MindRT.
811  * Description: Reads tensor info from the given file. If the memory control feature is enabled, it checks whether the
812  * tensor can fit in memory before reading. There are two situations in which the data is not read: 1) the tensor size
813  * is greater than the total preset memory limit; 2) evicting all NOT-IN-USE tensors from tensor_list_map_ cannot make
814  * enough room for the tensor.
815  */
816 void DebugServices::ReadTensorFromNpy(const std::string &tensor_name, const std::string &file_name,
817                                       std::string *const tensor_type, std::size_t *const size,
818                                       std::vector<int64_t> *const shape, char **const data_buffer, bool *no_mem_to_read,
819                                       bool is_base_request) {
820   std::ifstream infile;
821   std::string file_path = file_name;
822   MS_LOG(INFO) << "Reading in file: " << file_path;
823   infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
824   if (!infile.is_open()) {
825     MS_LOG(ERROR) << "Failed to open file (In ReadTensorFromNpy) " << file_path << " Errno:" << errno;
826     const int kMaxFilenameLength = 128;
827     char err_info[kMaxFilenameLength];
828     auto ret = strerror_r(errno, err_info, sizeof(err_info));
829     if (ret != kStrErrorNone) {
830       MS_LOG(ERROR) << " ErrInfo:" << ret;
831     }
832     return;
833   }
834   const int substr_len = 2;
835   const int header_len_offset = 8;
836   const int header_offset = 9;
837   const int header_len_buffer_size = 2;
838   const int type_offset = 10;
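  // NPY v1.0 layout: a magic string and version bytes, a 2-byte little-endian header length, then a Python-dict-style
  // header describing dtype and shape, followed by the raw tensor data.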
839   // get header length
840   (void)infile.seekg(0, std::ios::beg);
841   auto header_len_buffer = std::make_unique<std::vector<char>>(header_len_offset + header_len_buffer_size);
842   if (!infile.read(header_len_buffer->data(), header_len_offset + header_len_buffer_size)) {
843     MS_LOG(ERROR) << "Failed to parse header length from " << file_path;
844     return;
845   }
846   uint16_t header_len = *reinterpret_cast<uint16_t *>(header_len_buffer->data() + header_len_offset);
847   header_len_buffer.reset();
848   // read in header
849   (void)infile.seekg(0, std::ios::beg);
850   auto header_buffer = std::make_unique<std::vector<char>>(header_offset + header_len);
851   if (!infile.read(header_buffer->data(), header_offset + header_len)) {
852     MS_LOG(ERROR) << "Failed to read header from " << file_path;
853     return;
854   }
855   std::string header(header_buffer->data() + header_offset, header_len);
856   header_buffer.reset();
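  // The header looks like {'descr': '<f4', 'fortran_order': False, 'shape': (2, 3), }: the two characters after the
  // byte-order character give the dtype code and word size, and the shape is parsed from the parenthesized tuple.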
857   std::size_t type_i = header.find("descr") + type_offset;
858   if (header.length() < type_i + substr_len) {
859     MS_LOG(ERROR) << "Cannot get tensor_type, header length is " << header.length();
860     return;
861   }
862   *tensor_type = header.substr(type_i, substr_len);
863   std::size_t shape_i_open = header.find("(");
864   std::size_t shape_i_close = header.find(")");
865   std::string shape_str = header.substr(shape_i_open + 1, shape_i_close - shape_i_open - 1);
866   std::string intermediate;
867   std::stringstream check_shape(shape_str);
868   MS_LOG(INFO) << "Shape of " << file_name << " is: [" << shape_str << "]";
869   while (getline(check_shape, intermediate, ',')) {
870     int64_t shape_d = 0;
871     if (!CheckStoi(&shape_d, intermediate)) {
872       MS_LOG(INFO) << "Failed to get the shape from file: " << file_name << ", error in converting the string "
873                    << intermediate << " into an integer.";
874       return;
875     }
876     shape->push_back(shape_d);
877   }
878   std::size_t word_size = 0;
879   if (!CheckStoul(&word_size, std::string(1, (*tensor_type)[1]))) {
880     MS_LOG(INFO) << "Failed to get the word_size from file: " << file_name << ", error in converting the string "
881                  << (*tensor_type)[1] << " into an integer.";
882     return;
883   }
884   std::size_t data_len = std::accumulate(shape->begin(), shape->end(), 1, std::multiplies<uint64_t>());
885   std::size_t data_size = data_len * word_size;
886   *size = data_size;
887   if (data_size == 0 || is_base_request) {
888     // for base request, reading the header is enough.
889     return;
890   }
891   // Check memory available before loading tensor into host.
892   bool has_enough_memory = true;
893   if (tensor_loader_->EnableMemoryControl()) {
894     has_enough_memory = tensor_loader_->CheckMemoryAvailable(tensor_name, data_size);
895   }
896   if (!has_enough_memory) {
897     MS_LOG(ERROR) << "Not enough memory available for loading " << tensor_name << " into host memory.";
898     *no_mem_to_read = true;
899   } else {
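    // Seek past the preamble and header, then read the raw tensor data into a freshly allocated buffer.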
900     (void)infile.seekg(header_len + type_offset);
901     *data_buffer = new char[data_size];
902     if ((*data_buffer) == nullptr || !infile.read(*data_buffer, data_size)) {
903       MS_LOG(ERROR) << "Unable to get tensor data from npy";
904     }
905   }
906 }
907 
908 /*
909  * Feature group: Offline debugger.
910  * Target device group: Ascend.
911  * Runtime category: Old runtime, MindRT.
912  * Description: This function converts files in each directory from device format to host format and appends the
913  * converted npy file names into NPYFilePool. It is for Ascend async dump only.
914  */
915 void DebugServices::ConvertToHostFormat(const DirMap &dir_to_files_map, NPYFilePool *const result_list) const {
916   for (auto const &d : dir_to_files_map) {
917     std::vector<std::string> files_to_convert_in_dir;
918     std::vector<std::string> files_after_convert_in_dir;
919     std::string dump_key = d.first;
920     for (auto const &item : d.second) {
921       std::string file_name = std::get<0>(item);
922       std::string file_name_without_scope = std::get<1>(item);
923 
924       // skip the file that was converted to npy already.
925       if (std::all_of(result_list->begin(), result_list->end(), [&file_name_without_scope](std::string file_found) {
926             return file_found.find(file_name_without_scope) == std::string::npos;
927           })) {
928         // Full path for conversion.
929         (void)files_to_convert_in_dir.emplace_back(dump_key + "/" + file_name);
930         (void)files_after_convert_in_dir.emplace_back(file_name_without_scope);
931       }
932     }
933     MS_LOG(INFO) << "Number of files to convert: " << files_to_convert_in_dir.size();
934     if (!files_to_convert_in_dir.empty()) {
935       // Look for the installation path to the convert_async package. If not found, throw exception and terminate the
936       // later task.
937       auto t1 = std::chrono::high_resolution_clock::now();
938       {
939         pybind11::gil_scoped_acquire acquire;
940         try {
941           auto pkg = pybind11::module::import("mindspore.offline_debug.convert_async");
942           auto convert_obj = pkg.attr("AsyncDumpConverter")(pybind11::cast(files_to_convert_in_dir), dump_key);
943           (void)convert_obj.attr("convert_files")();
944         } catch (pybind11::error_already_set &e) {
945           MS_LOG(EXCEPTION) << "Failed to convert async dump data: " << e.what();
946         }
947       }
948       auto t2 = std::chrono::high_resolution_clock::now();
949       std::chrono::duration<double, std::milli> ms_double = t2 - t1;
950       MS_LOG(INFO) << "convert files Took: " << std::fixed << std::setprecision(precision)
951                    << (ms_double.count()) / ms_to_s << "s";
952       ProcessConvertToHostFormat(files_after_convert_in_dir, dump_key, result_list);
953     }
954   }
955 }
956 
957 /*
958  * Feature group: Offline debugger.
959  * Target device group: Ascend.
960  * Runtime category: Old runtime, MindRT.
961  * Description: This function iterates through the dump directory (dump_key), searches for all the converted npy files,
962  * and appends them into NPYFilePool. It is for Ascend async dump only.
963  */
964 void DebugServices::ProcessConvertToHostFormat(const std::vector<std::string> &files_after_convert_in_dir,
965                                                const std::string &dump_key, NPYFilePool *const result_list) const {
966   std::string real_dump_iter_dir = RealPath(dump_key);
967   DIR *d_handle = opendir(real_dump_iter_dir.c_str());
968   if (d_handle == nullptr) {
969     MS_LOG(INFO) << "Directory " << real_dump_iter_dir << " does not exist in ConvertToHostFormat.";
970     return;
971   }
972   struct dirent *dir = nullptr;
973   while ((dir = readdir(d_handle)) != nullptr) {
974     std::string name = real_dump_iter_dir + std::string("/") + std::string(dir->d_name);
975     if (!IsRegFile(name)) {
976       continue;
977     }
978     std::string candidate = dir->d_name;
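    // A file counts as the converted output for an op if its name contains the op's converted-name prefix followed by
    // '.' and carries the .npy extension.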
979     for (const std::string &file_to_find : files_after_convert_in_dir) {
980       if (candidate.find(file_to_find + ".") != std::string::npos && candidate.rfind(kNpyExt) != std::string::npos) {
981         // we found a converted file for this op
982         std::string found_file = dump_key + "/" + candidate;
983         (void)result_list->insert(found_file);
984       }
985     }
986   }
987   (void)closedir(d_handle);
988 }
989 
990 /*
991  * Feature group: Offline debugger.
992  * Target device group: Ascend, GPU.
993  * Runtime category: Old runtime, MindRT.
994  * Description: The node name string is prefixed with its scope, separated by slashes "/", while the npy files in the
995  * tensor dump path do not include the scope in their names. The objective of this function is to remove the scope from
996  * the node name to match the file.
997  */
998 std::string GetNodeNameWithoutScope(const std::string &dump_style_name) {
999   if (dump_style_name.empty()) {
1000     return "";
1001   }
1002   std::size_t last_scope_marker;
1003   std::string delim = "/";
1004   last_scope_marker = dump_style_name.rfind(delim);
1005   if (last_scope_marker == std::string::npos) {
1006     return dump_style_name;
1007   }
1008   return dump_style_name.substr(last_scope_marker + delim.size());
1009 }
1010 
1011 /*
1012  * Feature group: Offline debugger.
1013  * Target device group: Ascend.
1014  * Runtime category: Old runtime, MindRT.
1015  * Description: Searches for and prepares the target npy file to be read for each node. If the found file is already
1016  * in npy format, it is pushed to NPYFilePool; otherwise, the conversion tool in convert_async.py is used to convert
1017  * it to npy format first.
1018  */
1019 void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, std::vector<size_t> slot,
1020                                        std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
1021                                        std::vector<unsigned int> root_graph_id, NPYFilePool *const result_list) {
1022   DirMap dir_to_files_map;
1023   for (unsigned int i = 0; i < backend_name.size(); i++) {
1024     // form prefix of the tensor file to read from graph pb node name
1025     std::string dump_style_kernel_name = backend_name[i];
1026 
1027     // remove slot from name
1028     std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
1029     dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
1030 
1031     MS_LOG(INFO) << "Dump style kernel_name: " << dump_style_kernel_name << ", slot is: " << slot[i];
1032     std::string prefix_dump_file_name = GetNodeNameWithoutScope(dump_style_kernel_name);
1033 
1034     std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
1035                                     std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
1036 
1037     // if node name is constant, skip
1038     if (prefix_dump_file_name.length() > strlen(constant_prefix) &&
1039         prefix_dump_file_name.substr(0, strlen(constant_prefix)).compare(constant_prefix) == 0) {
1040       continue;
1041     }
1042     // search files in dir for the one that meets the filename prefix and read the file into memory
1043     std::string abspath = RealPath(specific_dump_dir);
1044     auto preprocess_async_result = PreProcessDumpDirAsync(abspath);
1045     bool is_success = std::get<0>(preprocess_async_result);
1046     if (!is_success) {
1047       // directory does not exist
1048       return;
1049     }
1050     ProcessConvertList(std::get<1>(preprocess_async_result), prefix_dump_file_name, specific_dump_dir,
1051                        &dir_to_files_map, result_list);
1052   }
1053   ConvertToHostFormat(dir_to_files_map, result_list);
1054 }
1055 
1056 void DebugServices::ConvertWatchPointNodes(const DumpFileMap &dump_dir_mapped_files,
1057                                            const std::vector<ProtoDump> &proto_dump,
1058                                            const std::string &specific_dump_dir, NPYFilePool *const result_list) const {
1059   DirMap dir_to_files_map;
1060   for (const auto &node : proto_dump) {
1061     std::string dump_name = node.dump_name;
1062     // search files in dir for the one that meets the filename prefix and read the file into memory
1063     std::string abspath = RealPath(specific_dump_dir);
1064     DIR *d = opendir(abspath.c_str());
1065     if (d == nullptr) {
1066       MS_LOG(INFO) << "Directory " << specific_dump_dir.c_str() << " does not exist in ConvertWatchPointNodes.";
1067       return;
1068     }
1069     ProcessConvertList(dump_dir_mapped_files, dump_name, specific_dump_dir, &dir_to_files_map, result_list);
1070     (void)closedir(d);
1071   }
1072   ConvertToHostFormat(dir_to_files_map, result_list);
1073 }
1074 
1075 /*
1076  * Feature group: Offline debugger.
1077  * Target device group: Ascend.
1078  * Runtime category: Old runtime, MindRT.
1079  * Description: Searches the async dump directory and separates npy files from bin files.
1080  */
1081 DebugServices::AsyncPreProcessResult DebugServices::PreProcessDumpDirAsync(const std::string &specific_dump_dir) const {
1082   // DumpFileMap for each specific dump dir (including rank, graph_id and iteration)
1083   DumpFileMap dump_dir_mapped_files;
1084   AsyncPreProcessResult async_result;
1085   DIR *d = opendir(specific_dump_dir.c_str());
1086   if (d == nullptr) {
1087     MS_LOG(ERROR) << "Specific dump dir does not exist for preprocessing: " << specific_dump_dir;
1088     std::get<0>(async_result) = false;
1089     std::get<1>(async_result) = dump_dir_mapped_files;
1090     return async_result;
1091   }
1092   struct dirent *dir = nullptr;
1093   while ((dir = readdir(d)) != nullptr) {
1094     std::string file_name = dir->d_name;
1095     std::string file_path = specific_dump_dir + std::string("/") + file_name;
1096     if (!IsRegFile(file_path)) {
1097       continue;
1098     }
1099     bool is_txt = file_name.rfind(".txt") != std::string::npos;
1100     if (is_txt) {
1101       // txt files in dump dir contain the list of failed converted npy files.
1102       MS_LOG(DEBUG) << "Skipping txt file: " << file_name;
1103       continue;
1104     }
1105     std::string op_name;
1106     bool is_npy = file_name.rfind(kNpyExt) != std::string::npos;
1107     auto first_dot = file_name.find('.');
1108 
1109     const int kSeventhFromRight = 7;
1110     size_t pos = file_name.rfind(".");
1111     for (int cnt = 1; cnt < kSeventhFromRight; cnt++) {
1112       pos = file_name.rfind(".", pos - 1);
1113     }
1114     size_t seventh_last_dot = pos;
1115 
1116     if (seventh_last_dot != std::string::npos && first_dot != std::string::npos && seventh_last_dot > first_dot) {
1117       // name_to_match is between first dot and seventh last dot.
1118       // if op_type is parameter, the op_name can have dots.
1119       op_name = file_name.substr(first_dot + 1, seventh_last_dot - first_dot - 1);
1120     }
1121 
1122     if (is_npy) {
1123       // push back the file_name with specific dump dir
1124       (dump_dir_mapped_files[specific_dump_dir].npy_files[op_name]).push_back(file_path);
1125     } else {
1126       // push back the file_name without specific dump dir. dump dir is the map key.
1127       dump_dir_mapped_files[specific_dump_dir].bin_files.push_back(file_name);
1128     }
1129   }
1130   (void)closedir(d);
1131   std::get<0>(async_result) = true;
1132   std::get<1>(async_result) = dump_dir_mapped_files;
1133   return async_result;
1134 }
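
// A worked example of the op_name extraction above (hypothetical file names): in
// "Conv2D.Conv2D-op1.1.2.1623456789.output.0.NCHW.npy" the substring between the first dot and the seventh-last dot
// is "Conv2D-op1"; in a Parameter dump such as "Parameter.conv1.weight.1.2.1623456789.output.0.DefaultFormat.npy" it
// is "conv1.weight", which is why op_name is bounded from both ends rather than cut at the next dot.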
1135 
1136 /*
1137  * Feature group: Offline debugger.
1138  * Target device group: Ascend, GPU.
1139  * Runtime category: Old runtime, MindRT.
1140  * Description: Searches the dump directory for npy files.
1141  */
1142 DebugServices::NPYFilePool DebugServices::PreProcessDumpDirSync(const std::string &specific_dump_dir) const {
1143   // npy format:
1144   // {dump_path}/{op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{output_or_input_string}.{slot}.{format}.npy
1145   NPYFilePool npy_files;
1146   DIR *d = opendir(specific_dump_dir.c_str());
1147   if (d == nullptr) {
1148     MS_LOG(ERROR) << "Specific dump dir does not exist for preprocessing: " << specific_dump_dir;
1149     return npy_files;
1150   }
1151   struct dirent *dir = nullptr;
1152   while ((dir = readdir(d)) != nullptr) {
1153     std::string file_name = dir->d_name;
1154     std::string file_path = specific_dump_dir + std::string("/") + file_name;
1155     if (!IsRegFile(file_path)) {
1156       continue;
1157     }
1158     bool is_npy = file_name.rfind(kNpyExt) != std::string::npos;
1159     if (is_npy) {
1160       (void)npy_files.insert(file_path);
1161     }
1162   }
1163   (void)closedir(d);
1164   return npy_files;
1165 }
1166 
1167 void DebugServices::ProcessConvertList(const DumpFileMap &dump_dir_mapped_files,
1168                                        const std::string &prefix_dump_file_name, const std::string &specific_dump_dir,
1169                                        DirMap *dir_to_files_map, NPYFilePool *const result_list) const {
1170   MS_EXCEPTION_IF_NULL(dir_to_files_map);
1171   auto it = dump_dir_mapped_files.find(specific_dump_dir);
1172   if (it == dump_dir_mapped_files.end()) {
1173     // no matched file
1174     MS_LOG(ERROR) << "Pre-Process is not done correctly for: " << specific_dump_dir;
1175     return;
1176   }
1177   auto bin_files = (it->second).bin_files;
1178   auto npy_files = (it->second).npy_files;
1179 
1180   for (size_t i = 0; i < bin_files.size(); i++) {
1181     std::string file_name = bin_files[i];
1182     std::string file_name_w_o_prefix = file_name;
1183     auto type_pos = file_name.find('.');
1184     // adding dot to avoid problematic matching in the scope.
1185     if (type_pos == std::string::npos ||
1186         file_name.find(prefix_dump_file_name + ".", type_pos + 1) == std::string::npos) {
1187       continue;
1188     }
1189     std::size_t second_dot = file_name.find(".", file_name.find(prefix_dump_file_name + ".", type_pos + 1));
1190     (void)file_name_w_o_prefix.replace(type_pos + 1, second_dot - type_pos - 1, prefix_dump_file_name);
1191     // if file matches prefix and is in device format add to candidate files to convert.
1192     (*dir_to_files_map)[specific_dump_dir].push_back(std::make_tuple(file_name, file_name_w_o_prefix));
1193   }
1194   // Add the already converted npy files to result_list
1195   if (npy_files.find(prefix_dump_file_name) != npy_files.end()) {
1196     (void)std::copy(npy_files[prefix_dump_file_name].begin(), npy_files[prefix_dump_file_name].end(),
1197                     std::inserter(*result_list, result_list->end()));
1198   }
1199 }
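
// A worked example of the prefix replacement above (hypothetical names; real device file names may differ): with
// prefix_dump_file_name "Conv2D-op1", a device-format file "Conv2D.Default_network_conv1_Conv2D-op1.1.2.1623456789"
// passes the prefix check, the scoped middle segment is replaced, and file_name_w_o_prefix becomes
// "Conv2D.Conv2D-op1.1.2.1623456789"; both names are then queued in dir_to_files_map for conversion.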
1200 
1201 void DebugServices::GetTensorDataInfoAsync(const std::vector<ProtoDump> &proto_dump,
1202                                            const std::string &specific_dump_dir, uint32_t iteration, uint32_t device_id,
1203                                            uint32_t root_graph_id, const ProcessedNPYFiles &processed_async_files,
1204                                            std::vector<std::shared_ptr<TensorData>> *const tensor_list) {
1205   auto it = processed_async_files.find(specific_dump_dir);
1206   if (it == processed_async_files.end()) {
1207     MS_LOG(DEBUG) << "No npy file was found for dump directory: " << specific_dump_dir;
1208     return;
1209   }
1210   auto processed_files_for_dir = it->second;
1211   for (auto &node : proto_dump) {
1212     std::vector<size_t> slot_list;
1213     std::string dump_name = node.dump_name;
1214     bool output_flag = node.is_output;
1215 
1216     for (const auto &dump_file_attr : processed_files_for_dir) {
1217       if (dump_file_attr.name_to_match == dump_name && dump_file_attr.is_output == output_flag) {
1218         slot_list.push_back(dump_file_attr.slot);
1219       }
1220     }
1221     for (auto slot : slot_list) {
1222       // add a TensorData entry (data will be read when needed)
1223       std::vector<int64_t> shape;
1224       std::string orig_name = node.origin_node_name;
1225       auto tensor_data = std::make_shared<TensorData>();
1226       tensor_data->SetName(orig_name);
1227       tensor_data->SetExecutionOrder(0);
1228       tensor_data->SetSlot(slot);
1229       tensor_data->SetIteration(iteration);
1230       tensor_data->SetDeviceId(device_id);
1231       tensor_data->SetRootGraphId(root_graph_id);
1232       tensor_data->SetDataPtr(nullptr);
1233       tensor_data->SetByteSize(0);
1234       tensor_data->SetType("");
1235       tensor_data->SetShape(shape);
1236       tensor_data->SetIsOutput(output_flag);
1237       tensor_data->SetPrevIteration(GetPrevIteration(tensor_data));
1238 
1239       tensor_list->push_back(tensor_data);
1240     }
1241   }
1242 }
1243 
1244 /*
1245  * Feature group: Offline debugger.
1246  * Target device group: Ascend, GPU.
1247  * Runtime category: Old runtime, MindRT.
1248  * Description: Extracts attributes such as op_name and time stamp from the npy file name. It is used for both sync
1249  * and async dump.
1250  */
1251 DebugServices::ProcessedNPYFiles DebugServices::ProcessNPYFilePool(const NPYFilePool &npy_file_pool) const {
1252   // npy file format: node_type.node_name.task_id.stream_id.timestamp.output_input.slot.format.npy
1253   ProcessedNPYFiles processed_files;
1254   if (npy_file_pool.empty()) {
1255     MS_LOG(WARNING) << "ProcessNPYFilePool was called for an empty NPYFilePool.";
1256     return processed_files;
1257   }
1258   for (const std::string &file_name : npy_file_pool) {
1259     std::string file_name_to_check = file_name;
1260     std::string specific_dump_dir;
1261     DumpFileAttr dump_file_attr;
1262     std::string output_str;
1263     std::string slot_str;
1264     auto delim = file_name.rfind("/");
1265     if (delim != std::string::npos) {
1266       specific_dump_dir = file_name.substr(0, delim);
1267       file_name_to_check = file_name.substr(delim + 1);
1268     }
1269     std::vector<std::tuple<size_t, size_t, std::string *>> attr_to_match;
1270     size_t first_dot = file_name_to_check.find(".");
1271     size_t last_dot = file_name_to_check.rfind(kNpyExt);
1272     size_t second_last_dot = file_name_to_check.rfind(".", last_dot - 1);
1273     size_t third_last_dot = file_name_to_check.rfind(".", second_last_dot - 1);
1274     size_t fourth_last_dot = file_name_to_check.rfind(".", third_last_dot - 1);
1275     size_t fifth_last_dot = file_name_to_check.rfind(".", fourth_last_dot - 1);
1276     size_t sixth_last_dot = file_name_to_check.rfind(".", fifth_last_dot - 1);
1277     size_t seventh_last_dot = file_name_to_check.rfind(".", sixth_last_dot - 1);
1278     // name_to_match is between first dot and seventh last dot.
1279     // if op_type is parameter, the op_name can have dots.
1280     auto tuple = std::make_tuple(first_dot, seventh_last_dot, &dump_file_attr.name_to_match);
1281     attr_to_match.push_back(tuple);
1282     // slot is between second and third dot from end of the file name.
1283     tuple = std::make_tuple(third_last_dot, second_last_dot, &slot_str);
1284     attr_to_match.push_back(tuple);
1285     // time stamp is between fourth and fifth dot from end of the file name.
1286     tuple = std::make_tuple(fifth_last_dot, fourth_last_dot, &dump_file_attr.time_stamp);
1287     attr_to_match.push_back(tuple);
1288     // output is between third and fourth dot from end of the file name.
1289     tuple = std::make_tuple(fourth_last_dot, third_last_dot, &output_str);
1290     attr_to_match.push_back(tuple);
1291     for (auto &match_item : attr_to_match) {
1292       CheckStringMatch(std::get<DebugServices::START_POS>(match_item), std::get<DebugServices::END_POS>(match_item),
1293                        std::get<DebugServices::STR_POS>(match_item), file_name_to_check);
1294     }
1295 
1296     if (!slot_str.empty() && !CheckStoull(&dump_file_attr.slot, slot_str)) {
1297       MS_LOG(INFO) << "Failed to get the slot from file_name: " << file_name_to_check
1298                    << ", failed to convert the string " << slot_str << " into an integer.";
1299     }
1300     dump_file_attr.is_output = (output_str == "output");
1301     dump_file_attr.file_path = file_name_to_check;
1302     processed_files[specific_dump_dir].push_back(dump_file_attr);
1303   }
1304   return processed_files;
1305 }
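
// A worked example of the parsing above (hypothetical file name): "Conv2D.Conv2D-op1.1.2.1623456789.output.0.NCHW.npy"
// is parsed into a DumpFileAttr with name_to_match = "Conv2D-op1", time_stamp = "1623456789", slot = 0 and
// is_output = true, keyed in processed_files by the directory part of the path.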
1306 
1307 /*
1308  * Feature group: Offline debugger.
1309  * Target device group: Ascend, GPU.
1310  * Runtime category: Old runtime, MindRT.
1311  * Description: For the two possible modes (rank and graph), this function returns the rank_id or graph_id extracted
1312  * from the given directory name; otherwise, it returns UINT32_MAX to identify an invalid rank or graph id.
1313  */
1314 uint32_t GetRankOrGraphId(const std::string &mode, const std::string &name) {
1315   std::regex re;
1316   if (mode == "rank") {
1317     re = "^rank_([0-9]+)$";
1318   } else if (mode == "graph") {
1319     re = "^([0-9]+)$";
1320   }
1321   std::smatch tokens;
1322   if (regex_match(name, tokens, re)) {
1323     return std::stoi(tokens[1]);
1324   } else {
1325     return UINT32_MAX;
1326   }
1327 }
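
// Illustrative usage: GetRankOrGraphId("rank", "rank_3") returns 3 and GetRankOrGraphId("graph", "0") returns 0,
// while a non-matching name such as "execution_order" returns UINT32_MAX.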
1328 
1329 std::vector<uint32_t> DebugServices::GetDumpRankIdList() {
1330   std::vector<uint32_t> rank_id_list;
1331   std::string dump_dir = GetDumpDir();
1332   DIR *d_handle = opendir(dump_dir.c_str());
1333   if (d_handle == nullptr) {
1334     MS_LOG(ERROR) << "Dump directory does not exist.";
1335     return rank_id_list;
1336   }
1337   struct dirent *dir = nullptr;
1338   while ((dir = readdir(d_handle)) != nullptr) {
1339     struct stat st;
1340     std::string name = dump_dir + std::string("/") + std::string(dir->d_name);
1341     int ret = stat(name.c_str(), &st);
1342     if (ret != 0) {
1343       MS_LOG(ERROR) << "stat error, ret is: " << ret;
1344       (void)closedir(d_handle);
1345       return rank_id_list;
1346     }
1347     if (S_ISDIR(st.st_mode)) {
1348       std::string rank_dir_name = dir->d_name;
1349       uint32_t rank_id = GetRankOrGraphId("rank", rank_dir_name);
1350       if (rank_id != UINT32_MAX) {
1351         rank_id_list.push_back(rank_id);
1352       }
1353     }
1354   }
1355   (void)closedir(d_handle);
1356   return rank_id_list;
1357 }
1358 
1359 /*
1360  * Feature group: Offline debugger.
1361  * Target device group: Ascend, GPU.
1362  * Runtime category: Old runtime, MindRT.
1363  * Description: Searches the current dump directory and for each rank_id in rank_id_list extracts the existing
1364  * graph_ids. Then the history file is read for all the extracted graph_ids.
1365  */
1366 void DebugServices::CheckDumpGraphIdList(std::vector<uint32_t> rank_id_list) {
1367   std::string net_name = GetNetName();
1368   std::string dump_dir = GetDumpDir();
1369   for (uint32_t rank_id : rank_id_list) {
1370     std::string path = dump_dir + "/rank_" + std::to_string(rank_id) + "/" + net_name;
1371     std::string abspath = RealPath(path);
1372     DIR *d_handle_rank = opendir(abspath.c_str());
1373     if (d_handle_rank == nullptr) {
1374       MS_LOG(ERROR) << "Directory for rank_id: " << rank_id << " does not exist.";
1375       continue;
1376     }
1377     struct dirent *direc = nullptr;
1378     while ((direc = readdir(d_handle_rank)) != nullptr) {
1379       struct stat st;
1380       std::string name = abspath + std::string("/") + std::string(direc->d_name);
1381       int ret = stat(name.c_str(), &st);
1382       if (ret != 0) {
1383         MS_LOG(ERROR) << "stat error, ret is: " << ret;
1384         (void)closedir(d_handle_rank);
1385         return;
1386       }
1387       if (S_ISDIR(st.st_mode)) {
1388         std::string graph_dir = direc->d_name;
1389         if (graph_dir == "." || graph_dir == "..") {
1390           continue;
1391         }
1392         uint32_t graph_id = GetRankOrGraphId("graph", graph_dir);
1393         if (graph_id != UINT32_MAX) {
1394           ReadGraphsHistory(rank_id, graph_id);
1395         }
1396       }
1397     }
1398     (void)closedir(d_handle_rank);
1399   }
1400 }
1401 
1402 void DebugServices::SetGraphsHistory() {
1403   // extract rank_id_list
1404   std::vector<uint32_t> rank_id_list = GetDumpRankIdList();
1405   // for each rank_id extract the graph_id list and set the dump version
1406   // and for each graph read the graph history file
1407   CheckDumpGraphIdList(rank_id_list);
1408 }
1409 
1410 /*
1411  * Feature group: Offline debugger.
1412  * Target device group: Ascend, GPU.
1413  * Runtime category: Old runtime, MindRT.
1414  * Description: Reads the graph history file (containing iteration numbers in which the graph was executed) and stores
1415  * the data in graphs_run_history_ for the given rank and graph id.
1416  */
1417 void DebugServices::ReadGraphsHistory(uint32_t rank_id, uint32_t root_graph_id) {
1418   std::tuple<uint32_t, uint32_t> rank_and_graph(rank_id, root_graph_id);
1419   if (graphs_run_history_.find(rank_and_graph) != graphs_run_history_.end()) {
1420     // graph history was already stored for this rank_id and graph_id
1421     return;
1422   }
1423   std::string exec_order_path = GetDumpDir() + "/rank_" + std::to_string(rank_id) + "/execution_order/";
1424   std::string file_to_check = "ms_global_execution_order_graph_" + std::to_string(root_graph_id) + ".csv";
1425   DIR *d_handle = opendir(exec_order_path.c_str());
1426   if (d_handle == nullptr) {
1427     MS_LOG(ERROR) << "Execution order directory does not exist.";
1428     return;
1429   }
1430   // read file and store the info
1431   std::string full_path = exec_order_path + "/" + file_to_check;
1432   std::string checked_path = RealPath(full_path);
1433   if (!checked_path.empty()) {
1434     ReadGraphRunIter(checked_path, rank_and_graph);
1435   }
1436   (void)closedir(d_handle);
1437 }
1438 
1439 /*
1440  * Feature group: Offline debugger.
1441  * Target device group: Ascend, GPU.
1442  * Runtime category: Old runtime, MindRT.
1443  * Description: Returns a map with a (rank, graph) tuple as the key and a vector as the value. Each element of the
1444  * vector is a tuple of two elements: the first is the node name and the second indicates whether the node is an
1445  * output or not.
1446  */
1447 std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> DebugServices::GetAllWpNodes() {
1448   std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> rank_and_graph_to_nodes;
1449   for (auto w_table_item : watchpoint_table_) {
1450     auto wp = std::get<1>(w_table_item);
1451     unsigned int index = 0;
1452     for (auto check_node : wp.check_node_list) {
1453       std::vector<uint32_t> ranks = std::get<1>(wp.check_node_device_list[index]);
1454       std::vector<uint32_t> graphs = std::get<1>(wp.check_node_graph_list[index]);
1455       // graph represents root_graph for Ascend and kernel_graph for GPU
1456       for (auto rank : ranks) {
1457         for (auto graph : graphs) {
1458           std::tuple<uint32_t, uint32_t> key(rank, graph);
1459           (rank_and_graph_to_nodes)[key].push_back(check_node);
1460         }
1461       }
1462       index++;
1463     }
1464   }
1465   return rank_and_graph_to_nodes;
1466 }
1467 
1468 /*
1469  * Feature group: Offline debugger.
1470  * Target device group: Ascend, GPU.
1471  * Runtime category: Old runtime, MindRT.
1472  * Description: For the given graph and rank id, reads the graph history file, stores all the run iterations for the
1473  * graph in a vector and inserts it to graphs_run_history_ map.
1474  */
1475 void DebugServices::ReadGraphRunIter(std::string file_path, std::tuple<uint32_t, uint32_t> rank_and_graph) {
1476   std::ifstream infile;
1477   std::string line;
1478   infile.open(file_path.c_str());
1479   if (!infile.is_open()) {
1480     MS_LOG(ERROR) << "Failed to open file (In ReadGraphRunIter) " << file_path << " Errno:" << errno;
1481     const int kMaxFilenameLength = NAME_MAX;
1482     char err_info[kMaxFilenameLength];
1483     if (strerror_r(errno, err_info, sizeof(err_info)) != kStrErrorNone) {
1484       MS_LOG(ERROR) << " ErrInfo:" << strerror_r(errno, err_info, sizeof(err_info));
1485     }
1486 
1487     return;
1488   }
1489   std::vector<uint32_t> run_iters_vec;
1490   while (std::getline(infile, line)) {
1491     uint32_t iter;
1492     std::stringstream ss(line);
1493     ss >> iter;
1494     run_iters_vec.push_back(iter);
1495   }
1496   (void)graphs_run_history_.emplace(
1497     std::pair<std::tuple<uint32_t, uint32_t>, std::vector<uint32_t>>(rank_and_graph, run_iters_vec));
1498 }
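
// Illustrative example (hypothetical file content): a history csv whose lines are "0", "2" and "5" yields
// run_iters_vec = {0, 2, 5} for that (rank, graph) pair, i.e. the graph was executed in iterations 0, 2 and 5.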
1499 
1500 /*
1501  * Feature group: Offline debugger.
1502  * Target device group: Ascend, GPU.
1503  * Runtime category: Old runtime, MindRT.
1504  * Description: Creates a tensor_data object, sets its variables based on the function arguments, and adds the tensor
1505  * to the tensor_list_map_.
1506  */
1507 void DebugServices::AddToTensorData(const std::string &backend_name, const std::string &time_stamp,
1508                                     const std::size_t slot, const unsigned int iteration, const unsigned int device_id,
1509                                     const unsigned int root_graph_id, const bool is_output, const std::size_t data_size,
1510                                     const std::string &type_name, const std::vector<int64_t> &shape, char *buffer,
1511                                     std::vector<std::shared_ptr<TensorData>> *const result_list) {
1512   // call LoadNewTensor to store tensor in internal cache
1513   auto tensor_data = std::make_shared<TensorData>();
1514   tensor_data->SetName(backend_name);
1515   tensor_data->SetExecutionOrder(0);
1516   tensor_data->SetSlot(slot);
1517   tensor_data->SetIteration(iteration);
1518   tensor_data->SetDeviceId(device_id);
1519   tensor_data->SetRootGraphId(root_graph_id);
1520   tensor_data->SetIsOutput(is_output);
1521   if (buffer != nullptr) {
1522     tensor_data->SetDataPtr(buffer);
1523   } else {
1524     tensor_data->SetDataPtr(nullptr);
1525   }
1526   tensor_data->SetByteSize(data_size);
1527   tensor_data->SetType(type_name);
1528   tensor_data->SetShape(shape);
1529   tensor_data->SetTimeStamp(time_stamp);
1530   tensor_data->SetPrevIteration(GetPrevIteration(tensor_data));
1531   if (data_size > 0) {
1532     (void)tensor_loader_->LoadNewTensor(tensor_data, false);
1533   }
1534 
1535   // add to result_list
1536   result_list->push_back(tensor_data);
1537 }
1538 
1539 int GetNewestFileIndex(std::vector<std::string> matched_time_stamps) {
1540   // given the vector of matched_time_stamps, get the index of the newest time stamp.
1541   // this index is used to find the corresponding matched_path.
1542   if (matched_time_stamps.empty()) {
1543     return -1;
1544   }
1545   auto it = std::max_element(matched_time_stamps.begin(), matched_time_stamps.end());
1546   int index = it - matched_time_stamps.begin();
1547   return index;
1548 }
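
// Illustrative example (hypothetical timestamps): for {"1623456789", "1623456912", "1623456800"} the lexicographic
// maximum is "1623456912", so index 1 is returned; since the timestamps have equal width, lexicographic and numeric
// ordering agree.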
1549 
1550 /*
1551  * Feature group: Offline debugger.
1552  * Target device group: Ascend, GPU.
1553  * Runtime category: Old runtime, MindRT.
1554  * Description: Searches files in NPYFilePool (for both sync and async mode) for the one that matches the filename
1555  * prefix and reads the file into memory.
1556  */
1557 void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std::vector<size_t> slot,
1558                                      std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
1559                                      std::vector<unsigned int> root_graph_id, const std::vector<bool> &is_output,
1560                                      ProcessedNPYFiles *const processed_npy_files,
1561                                      std::vector<std::shared_ptr<TensorData>> *const result_list, bool is_base_request,
1562                                      bool *no_mem_to_read) {
1563   for (unsigned int i = 0; i < backend_name.size(); i++) {
1564     // form prefix of the tensor file to read from graph pb node name
1565     std::string dump_style_kernel_name = backend_name[i];
1566 
1567     // remove slot from name
1568     std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
1569     dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
1570 
1571     std::string specific_dump_dir;
1572     bool is_cst = false;
1573     // prefix_dump_to_check is node name used to find corresponding dump file.
1574     std::string prefix_dump_to_check = GetNodeNameWithoutScope(dump_style_kernel_name);
1575     // if node name has prefix of "Default--data-", consider as constant, search in cst folder
1576     if (prefix_dump_to_check.length() > strlen(constant_prefix) &&
1577         prefix_dump_to_check.substr(0, strlen(constant_prefix)).compare(constant_prefix) == 0) {
1578       specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
1579                           std::to_string(root_graph_id[i]) + "/constants";
1580       is_cst = true;
1581       const std::string prefix = "Default--";
1582       prefix_dump_to_check = prefix_dump_to_check.substr(prefix.length());
1583     } else {
1584       specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
1585                           std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
1586     }
1587     MS_LOG(INFO) << "specific_dump_dir " << specific_dump_dir;
1588     if ((is_sync_mode_ || is_cst) && processed_npy_files->find(specific_dump_dir) == processed_npy_files->end()) {
1589       // This case happens when ReadDumpedTensor is called from GetPrevTensor function.
1590       NPYFilePool npy_files = PreProcessDumpDirSync(specific_dump_dir);
1591       *processed_npy_files = ProcessNPYFilePool(npy_files);
1592     }
1593     ReadDumpedTensorUtils(specific_dump_dir, prefix_dump_to_check, backend_name[i], slot[i], device_id[i], iteration[i],
1594                           root_graph_id[i], is_output[i], *processed_npy_files, result_list, no_mem_to_read,
1595                           is_base_request);
1596   }
1597 }
1598 /*
1599  * Feature group: Offline debugger.
1600  * Target device group: Ascend, GPU.
1601  * Runtime category: Old runtime, MindRT.
1602  * Description: For both sync and async dump, gets the newest matched file path, reads the npy file, and adds the
1603  * tensor_data object to tensor_list_map_. If there is no matched file, an empty tensor_data object is created with
1604  * data_size = 0, an empty shape, and a nullptr buffer.
1605  */
1606 void DebugServices::ReadFileAndAddToTensor(const bool found, const std::vector<std::string> &matched_paths,
1607                                            const std::vector<std::string> &matched_time_stamps,
1608                                            const std::string &backend_name, const unsigned int device_id,
1609                                            const unsigned int root_graph_id, bool is_output, size_t slot,
1610                                            bool *no_mem_to_read, unsigned int iteration,
1611                                            std::vector<std::shared_ptr<TensorData>> *result_list,
1612                                            bool is_base_request) {
1613   std::string time_stamp = "";
1614   std::string result_path = "";
1615   std::string type_name = "";
1616   size_t data_size = 0;
1617   std::vector<int64_t> shape;
1618   char *buffer = nullptr;
1619   if (found) {
1620     int index = GetNewestFileIndex(matched_time_stamps);
1621     if (index >= 0) {
1622       result_path = matched_paths[index];
1623       time_stamp = matched_time_stamps[index];
1624     }
1625 
1626     std::string key_name_in_cache = backend_name + ":" + std::to_string(device_id) + ":" +
1627                                     std::to_string(root_graph_id) + ":" + std::to_string(is_output) + ":" +
1628                                     std::to_string(slot);
1629     ReadTensorFromNpy(key_name_in_cache, result_path, &type_name, &data_size, &shape, &buffer, no_mem_to_read,
1630                       is_base_request);
1631     AddToTensorData(backend_name, time_stamp, slot, iteration, device_id, root_graph_id, is_output, data_size,
1632                     type_name, shape, buffer, result_list);
1633   } else {
1634     AddToTensorData(backend_name, time_stamp, slot, iteration, device_id, root_graph_id, is_output, 0, type_name, shape,
1635                     buffer, result_list);
1636     MS_LOG(INFO) << "Target tensor has not been found.";
1637   }
1638 }
1639 
1640 /*
1641  * Feature group: Offline debugger.
1642  * Target device group: Ascend.
1643  * Runtime category: Old runtime, MindRT.
1644  * Description: Iterates through all the processed npy files for the current specific_dump_dir, looks for the files
1645  * that match the node_name for dump, reads the newest file, and adds the related tensor_data object.
1646  */
1647 void DebugServices::ReadDumpedTensorUtils(const std::string &specific_dump_dir, const std::string &prefix_dump_to_check,
1648                                           const std::string &backend_name, size_t slot, unsigned int device_id,
1649                                           unsigned int iteration, unsigned int root_graph_id, bool is_output,
1650                                           const ProcessedNPYFiles &processed_npy_files,
1651                                           std::vector<std::shared_ptr<TensorData>> *result_list, bool *no_mem_to_read,
1652                                           bool is_base_request) {
1653   bool found = false;
1654   std::vector<std::string> matched_paths;
1655   std::vector<std::string> matched_time_stamps;
1656   auto it = processed_npy_files.find(specific_dump_dir);
1657   // If there is no npy file found we still need to add tensor data with size 0.
1658   if (it == processed_npy_files.end()) {
1659     MS_LOG(WARNING) << "No npy files were found for dump directory: " << specific_dump_dir;
1660   } else {
1661     auto processed_files_for_dir = it->second;
1662     for (const auto &dump_file_attr : processed_files_for_dir) {
1663       std::string file_name_to_check = dump_file_attr.file_path;
1664       std::string full_path = specific_dump_dir + "/" + file_name_to_check;
1665 
1666       if (dump_file_attr.name_to_match == prefix_dump_to_check && (dump_file_attr.slot == slot) &&
1667           (is_output == dump_file_attr.is_output)) {
1668         matched_paths.push_back(full_path);
1669         matched_time_stamps.push_back(dump_file_attr.time_stamp);
1670         found = true;
1671       }
1672     }
1673   }
1674   ReadFileAndAddToTensor(found, matched_paths, matched_time_stamps, backend_name, device_id, root_graph_id, is_output,
1675                          slot, no_mem_to_read, iteration, result_list, is_base_request);
1676 }
1677 
1678 /*
1679  * Feature group: Offline debugger.
1680  * Target device group: Ascend, GPU.
1681  * Runtime category: Old runtime, MindRT.
1682  * Description: Gets a list of the nodes that should be monitored and creates a vector called proto_to_dump with the
1683  * nodes' original names and dump style names. Then, for each node, it creates an empty tensor_data object with
1684  * data_byte_size = 0 and data_ptr = nullptr and adds it to the tensor_list (for both sync and async dump). This
1685  * tensor_list is used by the check watchpoint functions.
1686  */
1687 std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
1688   unsigned int iteration, ProcessedNPYFiles *const processed_npy_files, bool error_on_no_value) {
1689   // get a list of nodes and the devices they are on to monitor
1690   std::vector<std::shared_ptr<TensorData>> tensor_list;
1691   std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> rank_and_graph_to_nodes =
1692     GetAllWpNodes();
1693   // scan each device/iteration dir for the watched nodes for each device, and add to tensor_list
1694   // as they are found
1695   for (auto const &rank_and_graph_item : rank_and_graph_to_nodes) {
1696     std::tuple<uint32_t, uint32_t> rank_and_graph = rank_and_graph_item.first;
1697     uint32_t rank_id = std::get<0>(rank_and_graph);
1698     uint32_t root_graph_id = std::get<1>(rank_and_graph);
1699     MS_LOG(INFO) << "Get tensor files for rank_id: " << rank_id << ", root_graph_id: " << root_graph_id;
1700     std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(rank_id) + "/" + net_name_ + "/" +
1701                                     std::to_string(root_graph_id) + "/" + IterationString(iteration);
1702     std::string real_dump_dir = RealPath(specific_dump_dir);
1703     if (real_dump_dir.empty()) {
1704       MS_LOG(INFO) << "Dump dir " << specific_dump_dir << " doesn't exist. Skip it.";
1705       continue;
1706     }
1707     std::vector<std::tuple<std::string, bool>> wp_nodes = rank_and_graph_item.second;
1708     std::vector<ProtoDump> proto_to_dump;
1709 
1710     // convert node names to dump style
1711     for (auto node : wp_nodes) {
1712       std::string orig_name = std::get<0>(node);
1713       // Remove the scope from the fully qualified name to compare for both sync and async case.
1714       std::string dump_style_name = GetNodeNameWithoutScope(orig_name);
1715 
1716       bool node_is_out = std::get<1>(node);
1717       ProtoDump dump_proto;
1718       dump_proto.origin_node_name = orig_name;
1719       dump_proto.dump_name = dump_style_name;
1720       dump_proto.is_output = node_is_out;
1721 
1722       if (std::find(proto_to_dump.begin(), proto_to_dump.end(), dump_proto) == proto_to_dump.end()) {
1723         proto_to_dump.push_back(dump_proto);
1724       }
1725     }
1726     if (is_sync_mode_) {
1727       // search files in dir for the one that meets the filename prefix and read the file into memory
1728       NPYFilePool npy_files = PreProcessDumpDirSync(real_dump_dir);
1729       auto processed_npy_files_in_rank = ProcessNPYFilePool(npy_files);
1730       processed_npy_files->insert(processed_npy_files_in_rank.begin(), processed_npy_files_in_rank.end());
1731       ProcessTensorDataSync(proto_to_dump, real_dump_dir, *processed_npy_files, iteration, rank_id, root_graph_id,
1732                             &tensor_list, error_on_no_value);
1733     } else {
1734       auto preprocess_async_result = PreProcessDumpDirAsync(real_dump_dir);
1735       // convert all files in proto_to_dump to npy and add to pool of async file names
1736       NPYFilePool async_file_pool;
1737       ConvertWatchPointNodes(std::get<1>(preprocess_async_result), proto_to_dump, real_dump_dir, &async_file_pool);
1738       auto processed_npy_files_in_rank = ProcessNPYFilePool(async_file_pool);
1739       processed_npy_files->insert(processed_npy_files_in_rank.begin(), processed_npy_files_in_rank.end());
1740       GetTensorDataInfoAsync(proto_to_dump, real_dump_dir, iteration, rank_id, root_graph_id, *processed_npy_files,
1741                              &tensor_list);
1742     }
1743   }
1744 
1745   return tensor_list;
1746 }
1747 
1748 /*
1749  * Feature group: Offline debugger.
1750  * Target device group: Ascend, GPU.
1751  * Runtime category: Old runtime, MindRT.
1752  * Description: Iterates through the dump directory and for each file it looks for a match in the file name with node
1753  * names in proto_to_dump vector.
1754  */
1755 void DebugServices::ProcessTensorDataSync(const std::vector<ProtoDump> &proto_to_dump,
1756                                           const std::string &specific_dump_dir, ProcessedNPYFiles processed_npy_files,
1757                                           unsigned int iteration, unsigned int device_id, unsigned int root_graph_id,
1758                                           std::vector<std::shared_ptr<TensorData>> *const tensor_list,
1759                                           bool error_on_no_value) {
1760   ProcessedNPYFiles::const_iterator it = processed_npy_files.find(specific_dump_dir);
1761   if (it == processed_npy_files.end()) {
1762     if (error_on_no_value) {
1763       MS_LOG(ERROR) << "No npy files were found for dump directory: " << specific_dump_dir;
1764     }
1765     return;
1766   }
1767   auto processed_files_for_dir = it->second;
1768   for (const auto &dump_file_attr : processed_files_for_dir) {
1769     for (auto &node : proto_to_dump) {
1770       std::string dump_name = node.dump_name;
1771       if (dump_name == dump_file_attr.name_to_match && node.is_output == dump_file_attr.is_output) {
1772         size_t slot = dump_file_attr.slot;
1773         std::vector<int64_t> shape;
1774         std::string orig_name = node.origin_node_name;
1775         bool output_flag = node.is_output;
1776 
1777         AddToTensorData(orig_name, "", slot, iteration, device_id, root_graph_id, output_flag, 0, "", shape, nullptr,
1778                         tensor_list);
1779         break;
1780       }
1781     }
1782   }
1783 }
1784 
1785 std::string DebugServices::IterationString(unsigned int iteration) const {
1786   std::string iteration_string;
1787   bool init_dbg_suspend = (iteration == std::numeric_limits<unsigned int>::max());
1788   if (init_dbg_suspend) {
1789     iteration_string = "init";
1790   } else {
1791     iteration_string = std::to_string(iteration);
1792   }
1793   return iteration_string;
1794 }
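
// Illustrative usage: IterationString(7) returns "7"; when iteration equals std::numeric_limits<unsigned int>::max()
// (the init-suspend marker), it returns "init".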
1795 #endif
1796 
1797 /*
1798  * Feature group: Online debugger.
1799  * Target device group: Ascend, GPU.
1800  * Runtime category: Old runtime, MindRT.
1801  * Description: Searches for the tensor in the loaded tensors. If the tensor is found and its root_graph_id is equal
1802  * to the current root_graph_id, it updates the given vectors.
1803  */
1804 void DebugServices::ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *const ret_name,
1805                                      std::vector<const char *> *const data_ptr, std::vector<ssize_t> *const data_size,
1806                                      std::vector<unsigned int> *const dtype,
1807                                      std::vector<std::vector<int64_t>> *const shape) {
1808   std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
1809   tensor_loader_->SearchTensors(name, &result_list);
1810 
1811   for (auto result : result_list) {
1812     if (std::get<1>(result) == nullptr) {
1813       continue;
1814     }
1815 #ifndef OFFLINE_DBG_MODE
1816     auto debugger = Debugger::GetInstance();
1817     MS_EXCEPTION_IF_NULL(debugger);
1818     if (!CompareCurrentRootGraph(std::get<1>(result)->GetRootGraphId())) {
1819       MS_LOG(INFO) << "tensor root_graph_id: " << std::get<1>(result)->GetRootGraphId()
1820                    << " is different from cur_root_graph_id: " << debugger->GetCurrentRootGraphId() << ".";
1821       MS_LOG(INFO) << "Not reading tensor: " << std::get<0>(result) << ".";
1822     }
1823 #endif
1824     (void)ret_name->emplace_back(std::get<0>(result));
1825     (void)data_ptr->emplace_back(std::get<1>(result)->GetDataPtr());
1826     (void)data_size->emplace_back(std::get<1>(result)->GetByteSize());
1827     (void)dtype->emplace_back(std::get<1>(result)->GetType());
1828     (void)shape->emplace_back(std::get<1>(result)->GetShape());
1829   }
1830 }
1831 
1832 void DebugServices::SearchNodesTensors(const std::vector<std::string> &name,
1833                                        std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> *result_list) {
1834   if (result_list == nullptr) {
1835     MS_LOG(DEBUG) << "result_list is nullptr.";
1836     return;
1837   }
1838   tensor_loader_->SearchTensors(name, result_list);
1839 }
1840 
1841 #ifndef OFFLINE_DBG_MODE
1842 bool DebugServices::IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel) const {
1843   bool ret = false;
1844   for (auto w_table_item : watchpoint_table_) {
1845     auto check_node_list = std::get<1>(w_table_item).check_node_list;
1846     for (auto check_node : check_node_list) {
1847       std::string w_name = std::get<0>(check_node);
1848       bool w_type = std::get<1>(check_node);
1849       if ((w_type == true &&
1850            ((kernel_name.find(w_name) != string::npos && kernel_name.rfind(w_name, 0) == 0) || w_name == "*")) ||
1851           (w_type == false && (kernel_name == w_name || IsWatchPointNodeInput(w_name, kernel)))) {
1852         ret = true;
1853         return ret;
1854       }
1855     }
1856   }
1857   return ret;
1858 }
1859 
1860 bool DebugServices::IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const {
1861   if (kernel != nullptr && w_name.length() > 0) {
1862     auto input_size = common::AnfAlgo::GetInputTensorNum(kernel);
1863     for (size_t j = 0; j < input_size; ++j) {
1864       auto input_kernel = kernel->input(j + 1);
1865       std::string input_kernel_name = GetKernelNodeName(input_kernel);
1866       auto found = w_name.find_last_of('/');
1867       if (found != std::string::npos && w_name.size() > found && w_name.substr(found + 1) == input_kernel_name) {
1868         return true;
1869       }
1870     }
1871     return false;
1872   } else {
1873     return false;
1874   }
1875 }
1876 #endif
1877 
1878 std::vector<std::shared_ptr<TensorData>> DebugServices::GetTensor() const { return tensor_loader_->GetTensor(); }
1879 
1880 std::shared_ptr<TensorData> DebugServices::GetTensor(const std::string &tensor_name) const {
1881   return tensor_loader_->GetTensor(tensor_name);
1882 }
1883 
1884 void DebugServices::EmptyCurrentTensor() { tensor_loader_->EmptyCurrentTensor(); }
1885 
1886 #ifndef OFFLINE_DBG_MODE
1887 bool DebugServices::DumpTensorToFile(const std::string &filepath, const std::string &tensor_name, size_t slot) const {
1888   return tensor_loader_->DumpTensorToFile(filepath, tensor_name, slot);
1889 }
1890 #endif
1891 
1892 bool DebugServices::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev) {
1893   return tensor_loader_->LoadNewTensor(tensor, keep_prev);
1894 }
1895 
1896 /*
1897  * Feature group: Offline debugger.
1898  * Target device group: Ascend, GPU.
1899  * Runtime category: Old runtime, MindRT.
1900  * Description: Returns the previous iteration in which the tensor's graph was executed. If the current step is the
1901  * first run iteration for the graph, or the graph history file is not available, it returns UINT32_MAX to identify
1902  * an invalid prev_iteration.
1903  */
1904 uint32_t DebugServices::GetPrevIteration(const std::shared_ptr<TensorData> &tensor) {
1905   uint32_t prev_iter;
1906   uint32_t rank_id = tensor->GetDeviceId();
1907   uint32_t root_graph_id = tensor->GetRootGraphId();
1908   std::tuple<uint32_t, uint32_t> rank_and_graph = std::make_tuple(rank_id, root_graph_id);
1909   if (graphs_run_history_.find(rank_and_graph) == graphs_run_history_.end()) {
1910     return UINT32_MAX;
1911   }
1912   auto it = std::find(graphs_run_history_[rank_and_graph].begin(), graphs_run_history_[rank_and_graph].end(),
1913                       tensor->GetIteration());
1914   if (it == graphs_run_history_[rank_and_graph].end()) {
1915     // The graph is not executed in that iteration
1916     return UINT32_MAX;
1917   } else if (it == graphs_run_history_[rank_and_graph].begin()) {
1918     // current iteration is the first iteration that the graph was run
1919     // no prev iter is available
1920     MS_LOG(DEBUG) << "Iteration: " << tensor->GetIteration()
1921                   << " is the first run iteration for tensor: " << tensor->GetName();
1922     return UINT32_MAX;
1923   }
1924   (void)it--;
1925   prev_iter = *it;
1926   tensor->SetPrevIteration(prev_iter);
1927   return prev_iter;
1928 }
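
// Illustrative example (hypothetical history): with graphs_run_history_ for (rank 0, graph 1) equal to {0, 2, 5} and
// a tensor whose iteration is 5, the previous run iteration 2 is returned; if the iteration is 0 (the first run) or
// is not present in the history, UINT32_MAX is returned instead.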
1929 
1930 void DebugServices::ResetLoadedTensors() {
1931   wp_id_cache_.clear();
1932   MS_LOG(INFO) << "Resetting loaded tensors";
1933   tensor_loader_->MoveParametersCurrentToPrev();
1934   tensor_loader_->EmptyCurrentTensor();
1935   // will move parameters from previous to current map
1936   tensor_loader_->SwapCurrentPrev();
1937   overflow_ops_.clear();
1938 }
1939 
1940 #ifndef OFFLINE_DBG_MODE
1941 std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensor(const CNodePtr &kernel) {
1942   MS_EXCEPTION_IF_NULL(kernel);
1943   std::vector<std::shared_ptr<TensorData>> result;
1944   auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
1945   auto kernel_name = GetKernelNodeName(kernel);
1946   for (size_t j = 0; j < output_size; ++j) {
1947     auto tensor_name_with_slot = kernel_name + ":" + std::to_string(j);
1948     auto tensor = tensor_loader_->GetTensor(tensor_name_with_slot);
1949     if (tensor != nullptr) {
1950       result.push_back(tensor);
1951     }
1952   }
1953   return result;
1954 }
1955 #endif
1956 
1957 std::string GetOnlineOpOverflowDir() {
1958   // only called for online debugger mode
1959   // get operator overflow directory for current iteration
1960   std::string overflow_bin_path = "";
1961 #ifndef OFFLINE_DBG_MODE
1962   if (DumpJsonParser::GetInstance().path().empty()) {
1963     MS_LOG(INFO) << "Dump config is not set.";
1964     return "";
1965   }
1966   auto debugger = Debugger::GetInstance();
1967   MS_EXCEPTION_IF_NULL(debugger);
1968   auto cur_graph = debugger->GetGraphPtr();
1969   if (cur_graph == nullptr) {
1970     return "";
1971   }
1972   overflow_bin_path = DumpJsonParser::GetInstance().GetOpOverflowBinPath(cur_graph->root_graph_id());
1973   auto realpath = FileUtils::GetRealPath(overflow_bin_path.c_str());
1974   if (!realpath.has_value()) {
1975     MS_LOG(INFO) << "Get real path failed for overflow_bin_path.";
1976     return "";
1977   }
1978   overflow_bin_path = realpath.value() + '/';
1979 #endif
1980   return overflow_bin_path;
1981 }
1982 
1983 void DebugServices::GetOverflowTaskStreamId(const std::string &overflow_bin_path,
1984                                             std::vector<std::pair<uint64_t, uint64_t>> *task_stream_hits) const {
1985   MS_EXCEPTION_IF_NULL(task_stream_hits);
1986   const std::string overflow_file_prefix = "Opdebug.Node_OpDebug.";
1987   MS_LOG(INFO) << "Processing debug_files path: " << overflow_bin_path;
1988   DIR *d = opendir(overflow_bin_path.c_str());
1989   if (d == nullptr) {
1990     MS_LOG(INFO) << "Overflow bin directory does not exist!";
1991   } else {
1992     struct dirent *dir = nullptr;
1993     while ((dir = readdir(d)) != nullptr) {
1994       std::string file_name = dir->d_name;
1995       if (file_name.rfind(overflow_file_prefix, 0) != 0) {
1996         continue;
1997       }
1998       std::string file_path = overflow_bin_path + std::string("/") + file_name;
1999       if (IsRegFile(file_path)) {
2000         // detect overflow bin file
2001         uint64_t task_id = 0;
2002         uint64_t stream_id = 0;
2003         if (!GetTaskIdStreamId(file_name, overflow_file_prefix, &task_id, &stream_id)) {
2004           continue;
2005         }
2006         MS_LOG(INFO) << "Overflow bin file " << file_name << ", task_id " << task_id << ", stream_id " << stream_id
2007                      << ".";
2008         task_stream_hits->push_back(std::make_pair(task_id, stream_id));
2009       }
2010     }
2011     (void)closedir(d);
2012   }
2013 }
2014 
2015 void DebugServices::GetTaskStreamIdNodeMap(
2016   const std::string &tensors_path, std::map<std::pair<uint64_t, uint64_t>, std::string> *task_stream_to_opnames) const {
2017   MS_EXCEPTION_IF_NULL(task_stream_to_opnames);
2018   MS_LOG(INFO) << "Processing debug_files path: " << tensors_path;
2019   const std::string overflow_file_prefix = "Opdebug.Node_OpDebug.";
2020   DIR *d = opendir(tensors_path.c_str());
2021   if (d == nullptr) {
2022     MS_LOG(INFO) << "Tensors directory does not exist!";
2023   } else {
2024     struct dirent *dir = nullptr;
2025     while ((dir = readdir(d)) != nullptr) {
2026       std::string file_name = dir->d_name;
2027       if (file_name.rfind(overflow_file_prefix, 0) == 0) {
2028         MS_LOG(INFO) << "File: " << file_name << " is not a tensor file, skip it.";
2029         continue;
2030       }
2031       std::string file_path = tensors_path + std::string("/") + file_name;
2032       if (IsRegFile(file_path)) {
2033         // attempt to read the file
2034         std::ifstream infile;
2035         infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
2036         if (!infile.is_open()) {
2037           MS_LOG(ERROR) << "Failed to open file " << file_name << " Errno:" << errno;
2038           continue;
2039         }
2040         std::string node_name;
2041         uint64_t task_id = 0;
2042         uint64_t stream_id = 0;
2043         // detect overflow bin file, regular bin file or npy file
2044         bool success_parse = GetAttrsFromFilename(file_name, &node_name, &task_id, &stream_id);
2045         if (success_parse) {
2046           task_stream_to_opnames->insert({std::make_pair(task_id, stream_id), node_name});
2047         }
2048         infile.close();
2049       }
2050     }
2051     (void)closedir(d);
2052   }
2053 }
2054 
2055 void DebugServices::AddOpOverflowOpNames(const std::string &overflow_bin_path, const std::string &tensors_path,
2056                                          std::vector<std::string> *op_names) const {
2057   MS_EXCEPTION_IF_NULL(op_names);
2058   std::map<std::pair<uint64_t, uint64_t>, std::string> task_stream_to_opname;
2059   std::vector<std::pair<uint64_t, uint64_t>> task_stream_hit;
2060   GetOverflowTaskStreamId(overflow_bin_path, &task_stream_hit);
2061   GetTaskStreamIdNodeMap(tensors_path, &task_stream_to_opname);
2062 
2063   // find the op_names with an overflow hit
2064   for (auto &task_stream : task_stream_hit) {
2065     auto op_name = task_stream_to_opname[task_stream];
2066     if (!op_name.empty()) {
2067       MS_LOG(INFO) << "Operation overflow detected in " << op_name;
2068       op_names->push_back(op_name);
2069     }
2070   }
2071 }
2072 
2073 /*
2074  * Feature group: Online debugger, Offline debugger.
2075  * Target device group: Ascend.
2076  * Runtime category: Old runtime, MindRT.
2077  * Description: Checks whether an operator overflow occurred for the given node by inspecting the overflow
2078  * directory. This function is for async mode only.
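 * In offline mode, based on the path construction below, the overflow files are looked up under
 * <dump_dir>/rank_<device_id>/debug_files/<iteration>/ and the tensor dumps under
 * <dump_dir>/rank_<device_id>/<net_name>/<root_graph_id>/<iteration>/.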
2079  */
2080 bool DebugServices::CheckOpOverflow(std::string node_name_to_find, unsigned int device_id, unsigned int root_graph_id,
2081                                     unsigned int iteration) {
2082   if (is_sync_mode_) {
2083     return false;
2084   }
2085   std::string overflow_bin_path = "";
2086   std::string tensors_path = "";
2087 #ifndef OFFLINE_DBG_MODE
2088   overflow_bin_path = GetOnlineOpOverflowDir();
2089   tensors_path = overflow_bin_path;
2090 #else
2091   overflow_bin_path =
2092     dump_dir_ + "/rank_" + std::to_string(device_id) + "/debug_files/" + IterationString(iteration) + "/";
2093   overflow_bin_path = RealPath(overflow_bin_path);
2094   MS_LOG(INFO) << "overflow_bin_path: " << overflow_bin_path;
2095   tensors_path = dump_dir_ + "/rank_" + std::to_string(device_id) + "/" + net_name_ + "/" +
2096                  std::to_string(root_graph_id) + "/" + IterationString(iteration) + "/";
2097   tensors_path = RealPath(tensors_path);
2098   if (overflow_bin_path.empty()) {
2099     overflow_bin_path = tensors_path;
2100   }
2101 #endif
2102   if (overflow_bin_path.empty() || tensors_path.empty()) {
2103     MS_LOG(INFO) << "Get real path failed for overflow_bin_path or tensors path.";
2104     return false;
2105   }
2106   // remove kernel_graph_#
2107   std::string op_name_find_with_path = RemoveKernelGraphPrefix(node_name_to_find);
2108   std::replace(op_name_find_with_path.begin(), op_name_find_with_path.end(), '/', '_');
2109 
2110   // remove path
2111   size_t last_slash = node_name_to_find.rfind("/");
2112   std::string op_name_find = "";
2113   if (last_slash != std::string::npos) {
2114     op_name_find = node_name_to_find.substr(last_slash + 1);
2115   }
2116 
2117   std::replace(node_name_to_find.begin(), node_name_to_find.end(), '/', '_');
2118   std::vector<std::string> op_names;
2119 
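  // The op names found for a given overflow_bin_path are cached in overflow_ops_ (guarded by overflow_wp_lock_),
  // so each dump directory is only scanned once.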
2120   std::lock_guard<std::mutex> lg(overflow_wp_lock_);
2121   MS_LOG(INFO) << "Searching for overflow in node " << node_name_to_find;
2122   auto found_overflows = overflow_ops_.find(overflow_bin_path);
2123   if (found_overflows != overflow_ops_.end()) {
2124     MS_LOG(INFO) << "Found already computed overflows for " << overflow_bin_path;
2125     op_names = overflow_ops_[overflow_bin_path];
2126   } else {
2127     AddOpOverflowOpNames(overflow_bin_path, tensors_path, &op_names);
2128     overflow_ops_[overflow_bin_path] = op_names;
2129   }
2130 
2131   // determine if overflow wp has been triggered for the op name with path (from bin file)
2132   if (find(op_names.begin(), op_names.end(), op_name_find_with_path) != op_names.end()) {
2133     MS_LOG(INFO) << "Operation overflow watchpoint triggered for  " << node_name_to_find;
2134     return true;
2135   }
2136 
2137   // determine if overflow wp has been triggered for the op name (from npy file)
2138   if (find(op_names.begin(), op_names.end(), op_name_find) != op_names.end()) {
2139     MS_LOG(INFO) << "Operation overflow watchpoint triggered for  " << node_name_to_find;
2140     return true;
2141   }
2142 
2143   return false;
2144 }
2145 
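// Illustrative example (hypothetical node name): "kernel_graph_1/Default/net-Net/Add-op1" is reduced to
// "Default/net-Net/Add-op1"; names without the "kernel_graph_" prefix are returned unchanged.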
2146 std::string DebugServices::RemoveKernelGraphPrefix(std::string node_name_to_find) const {
2147   std::string op_name_to_find = node_name_to_find;
2148   const std::string kernel_prefix = "kernel_graph_";
2149   if (node_name_to_find.rfind(kernel_prefix, 0) == 0) {
2150     auto start_of_op_name = node_name_to_find.find("/", kernel_prefix.length());
2151     if (start_of_op_name != std::string::npos) {
2152       op_name_to_find = node_name_to_find.substr(start_of_op_name + 1);
2153     }
2154   }
2155   return op_name_to_find;
2156 }
2157 
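// Extracts task_id and stream_id from an overflow file name which, given the parsing below, is expected to look
// like "<overflow_file_prefix><task_id>.<stream_id>.<timestamp>", e.g. the hypothetical
// "Opdebug.Node_OpDebug.57.2.1612345678901234".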
2158 bool DebugServices::GetTaskIdStreamId(std::string file_name, std::string overflow_file_prefix, uint64_t *const task_id,
2159                                       uint64_t *const stream_id) const {
2160   size_t task_pos_start = overflow_file_prefix.length();
2161   size_t task_pos_end = file_name.find(".", task_pos_start);
2162   if (task_pos_end == std::string::npos) {
2163     MS_LOG(ERROR) << "Cannot extract task_id from filename: " << file_name;
2164     return false;
2165   }
2166 
2167   size_t stream_pos_start = task_pos_end + 1;
2168   size_t stream_pos_end = file_name.find(".", stream_pos_start);
2169   if (stream_pos_end == std::string::npos) {
2170     MS_LOG(ERROR) << "Cannot extract stream_id from filename: " << file_name;
2171     return false;
2172   }
2173 
2174   std::string task_id_str = file_name.substr(task_pos_start, task_pos_end - task_pos_start);
2175   std::string stream_id_str = file_name.substr(stream_pos_start, stream_pos_end - stream_pos_start);
2176   if (!CheckStoull(task_id, task_id_str)) {
2177     MS_LOG(INFO) << "Failed to get the task_id from file_name: " << file_name << ", error in convert the string "
2178                  << task_id_str << " into an integer.";
2179     return false;
2180   }
2181   if (!CheckStoull(stream_id, stream_id_str)) {
2182     MS_LOG(INFO) << "Failed to get the stream_id from file_name: " << file_name << ", error in convert the string "
2183                  << stream_id_str << " into an integer.";
2184     return false;
2185   }
2186 
2187   return true;
2188 }
2189 
2190 bool DebugServices::GetAttrsFromFilename(const std::string &file_name, std::string *const node_name,
2191                                          uint64_t *const task_id, uint64_t *const stream_id) const {
2192   // get the node_name, task_id, and stream_id from dump filename in the following two formats:
2193   // 1. bin file: node_type.node_name.task_id.stream_id.timestamp
2194   // 2. npy file: node_type.node_name.task_id.stream_id.timestamp.output_input.slot.format.npy
2195   // Note that node_name itself may contain dots (e.g. for Parameter nodes), so to locate the second dot we need
2196   // to search the file name from right to left.
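  // Hypothetical npy example: "Add.Default_network-Add-op1.12.5.1612345678901234.output.0.DefaultFormat.npy"
  // yields node_name = "Default_network-Add-op1", task_id = 12 and stream_id = 5.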
2197   size_t first_dot = file_name.find(".");
2198   size_t fourth_dot;
2199   if (file_name.rfind(kNpyExt) != std::string::npos) {
2200     // npy format file (converted file or A+M dump file)
2201     size_t pos = file_name.rfind(".");
2202     const int kFourthFromRight = 4;
2203     for (int cnt = 0; cnt < kFourthFromRight; cnt++) {
2204       pos = file_name.rfind(".", pos - 1);
2205     }
2206     fourth_dot = pos;
2207   } else {
2208     // bin format file
2209     fourth_dot = file_name.rfind(".");
2210   }
2211   size_t third_dot = file_name.rfind(".", fourth_dot - 1);
2212   size_t second_dot = file_name.rfind(".", third_dot - 1);
2213   // check if dots were found
2214   if (first_dot == std::string::npos || second_dot == std::string::npos || third_dot == std::string::npos ||
2215       fourth_dot == std::string::npos) {
2216     return false;
2217   }
2218   // get node_name
2219   if (first_dot < second_dot) {
2220     *node_name = file_name.substr(first_dot + 1, second_dot - first_dot - 1);
2221   } else {
2222     MS_LOG(ERROR) << "filename parse error to get node_name.";
2223     return false;
2224   }
2225   // get task id
2226   if (second_dot < third_dot) {
2227     std::string extracted_task_id = file_name.substr(second_dot + 1, third_dot - second_dot - 1);
2228     if (!CheckStoull(task_id, extracted_task_id)) {
2229       MS_LOG(INFO) << "Failed to get the task_id from file_name: " << file_name << ", error in convert the string "
2230                    << extracted_task_id << " into an integer.";
2231       return false;
2232     }
2233   } else {
2234     MS_LOG(ERROR) << "Filename <" << file_name << "> parse error to get task_id.";
2235     return false;
2236   }
2237   // get stream id
2238   if (third_dot < fourth_dot) {
2239     std::string extracted_stream_id = file_name.substr(third_dot + 1, fourth_dot - third_dot - 1);
2240     if (!CheckStoull(stream_id, extracted_stream_id)) {
2241       MS_LOG(INFO) << "Failed to get the stream_id from file_name: " << file_name << ", error in convert the string "
2242                    << extracted_stream_id << " into an integer.";
2243       return false;
2244     }
2245   } else {
2246     MS_LOG(ERROR) << "Filename <" << file_name << "> parse error to get stream_id.";
2247     return false;
2248   }
2249 
2250   return true;
2251 }
2252 
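// Resolves input_path to a canonical path: the directory component is resolved with realpath() and the file name
// is appended back, so the file itself does not have to exist yet (its directory must).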
2253 std::string DebugServices::RealPath(const std::string &input_path) const {
2254   if (input_path.length() >= PATH_MAX) {
2255     MS_LOG(EXCEPTION) << "The length of path: " << input_path << " exceeds limit: " << PATH_MAX;
2256   }
2257 
2258   size_t path_split_pos = input_path.find_last_of('/');
2259 
2260   // get real path
2261   char real_path[PATH_MAX] = {0};
2262 
2263   // input_path is dir + file_name
2264   if (path_split_pos != std::string::npos) {
2265     std::string prefix_path = input_path.substr(0, path_split_pos);
2266     std::string file_name = input_path.substr(path_split_pos);
2267 
2268     if (file_name.length() > NAME_MAX) {
2269       MS_LOG(EXCEPTION) << "The length of file name : " << file_name.length() << " exceeds limit: " << NAME_MAX;
2270     }
2271     if (realpath(prefix_path.c_str(), real_path) == nullptr) {
2272       MS_LOG(INFO) << "The dir " << prefix_path << " does not exist.";
2273       return "";
2274     }
2275 
2276     return std::string(real_path) + file_name;
2277   }
2278 
2279   // input_path is only file_name
2280   if (input_path.length() > NAME_MAX) {
2281     MS_LOG(EXCEPTION) << "The length of file name : " << input_path.length() << " exceeds limit: " << NAME_MAX;
2282   }
2283   if (realpath(input_path.c_str(), real_path) == nullptr) {
2284     MS_LOG(INFO) << "The file " << input_path << " does not exist, it will be created.";
2285   }
2286 
2287   return std::string(real_path);
2288 }
2289 
2290 bool DebugServices::TensorExistsInCurrent(const std::string &tensor_name) {
2291   return tensor_loader_->TensorExistsInCurrent(tensor_name);
2292 }
2293 void DebugServices::MoveTensorCurrentToPrev(const std::string &tensor_name) {
2294   tensor_loader_->MoveTensorCurrentToPrev(tensor_name);
2295 }
2296 
2297 void DebugServices::AppendToCacheEvictQueue(const std::string &tensor_name) {
2298   if (tensor_loader_->EnableMemoryControl()) {
2299     tensor_loader_->AppendToCacheEvictQueue(tensor_name);
2300   }
2301 }
2302 
2303 void DebugServices::SetNetName(std::string net_name) { this->net_name_ = net_name; }
2304 
2305 std::string DebugServices::GetNetName() { return net_name_; }
2306 
2307 void DebugServices::SetDumpDir(std::string dump_dir) { this->dump_dir_ = dump_dir; }
2308 
2309 std::string DebugServices::GetDumpDir() { return dump_dir_; }
2310 
2311 void DebugServices::SetSyncMode(bool is_sync_mode) { this->is_sync_mode_ = is_sync_mode; }
2312 
2313 bool DebugServices::GetSyncMode() const { return is_sync_mode_; }
2314 
2315 void DebugServices::SetMemLimit(uint64_t max_mem_size) { tensor_loader_->SetMemTotal(max_mem_size); }
2316 
2317 }  // namespace mindspore
2318