• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2019-2021 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "debug/debug_services.h"
17 #include <dirent.h>
18 #include <algorithm>
19 #include <functional>
20 #include <fstream>
21 #include <future>
22 #include <thread>
23 #include <iterator>
24 #include <map>
25 #include <numeric>
26 #include <unordered_set>
27 #include <utility>
28 #include "pybind11/embed.h"
29 #include "pybind11/stl.h"
30 #ifdef ONLINE_DBG_MODE
31 #include "debug/common.h"
32 #include "debug/debugger/debugger.h"
33 #include "debug/anf_ir_utils.h"
34 #include "backend/session/anf_runtime_algorithm.h"
35 #endif
36 #include "debug/debugger/tensor_summary.h"
37 #include "utils/file_utils.h"
38 #ifdef ONLINE_DBG_MODE
39 namespace mindspore {
40 #endif
DebugServices()41 DebugServices::DebugServices() { tensor_loader_ = std::make_shared<TensorLoader>(); }
42 
// Copy constructor: share the tensor loader (shallow copy of the shared_ptr) and
// duplicate the bookkeeping state (caches, paths, mode flag, watchpoint table).
DebugServices::DebugServices(const DebugServices &other)
    : wp_id_cache_(other.wp_id_cache_),
      net_name_(other.net_name_),
      dump_dir_(other.dump_dir_),
      is_sync_mode_(other.is_sync_mode_),
      tensor_loader_(other.tensor_loader_),
      watchpoint_table_(other.watchpoint_table_) {}
51 
// Copy assignment.
// NOTE(review): the original assignment copied only tensor_loader_ and watchpoint_table_,
// while the copy constructor also copies wp_id_cache_, net_name_, dump_dir_ and
// is_sync_mode_. That asymmetry means `a = b` and `DebugServices a(b)` produced different
// objects; copy the same member set here so both forms agree.
DebugServices &DebugServices::operator=(const DebugServices &other) {
  if (this != &other) {
    wp_id_cache_ = other.wp_id_cache_;
    net_name_ = other.net_name_;
    dump_dir_ = other.dump_dir_;
    is_sync_mode_ = other.is_sync_mode_;
    tensor_loader_ = other.tensor_loader_;
    watchpoint_table_ = other.watchpoint_table_;
  }
  return *this;
}
59 
AddWatchpoint(unsigned int id,unsigned int watch_condition,float parameter,const std::vector<std::tuple<std::string,bool>> & check_node_list,const std::vector<parameter_t> & parameter_list,const std::vector<std::tuple<std::string,std::vector<uint32_t>>> * check_node_device_list,const std::vector<std::tuple<std::string,std::vector<uint32_t>>> * check_node_graph_list)60 void DebugServices::AddWatchpoint(
61   unsigned int id, unsigned int watch_condition, float parameter,
62   const std::vector<std::tuple<std::string, bool>> &check_node_list, const std::vector<parameter_t> &parameter_list,
63   const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_device_list,
64   const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_graph_list) {
65   std::lock_guard<std::mutex> lg(lock_);
66 
67   watchpoint_t watchpoint_item;
68   watchpoint_item.id = id;
69   watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition);
70   watchpoint_item.condition.parameter = parameter;
71   watchpoint_item.check_node_list = check_node_list;
72   if (check_node_device_list != nullptr) {
73     watchpoint_item.check_node_device_list = *check_node_device_list;
74   }
75   if (check_node_graph_list != nullptr) {
76     watchpoint_item.check_node_graph_list = *check_node_graph_list;
77   }
78   watchpoint_item.parameter_list = parameter_list;
79   watchpoint_table_[id] = watchpoint_item;
80 }
81 
RemoveWatchpoint(unsigned int id)82 void DebugServices::RemoveWatchpoint(unsigned int id) {
83   std::lock_guard<std::mutex> lg(lock_);
84   (void)watchpoint_table_.erase(id);
85 }
86 
GetSummaryPtr(const std::shared_ptr<TensorData> & tensor,const void * const previous_tensor_ptr,uint32_t num_elements,uint32_t prev_num_elements,int tensor_dtype)87 std::unique_ptr<ITensorSummary> GetSummaryPtr(const std::shared_ptr<TensorData> &tensor,
88                                               const void *const previous_tensor_ptr, uint32_t num_elements,
89                                               uint32_t prev_num_elements, int tensor_dtype) {
90   switch (tensor_dtype) {
91     case DbgDataType::DT_UINT8: {
92       return std::make_unique<TensorSummary<uint8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
93                                                       prev_num_elements);
94     }
95     case DbgDataType::DT_INT8: {
96       return std::make_unique<TensorSummary<int8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
97                                                      prev_num_elements);
98     }
99     case DbgDataType::DT_UINT16: {
100       return std::make_unique<TensorSummary<uint16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
101                                                        prev_num_elements);
102     }
103     case DbgDataType::DT_INT16: {
104       return std::make_unique<TensorSummary<int16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
105                                                       prev_num_elements);
106     }
107     case DbgDataType::DT_UINT32: {
108       return std::make_unique<TensorSummary<uint32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
109                                                        prev_num_elements);
110     }
111     case DbgDataType::DT_INT32:
112     case DbgDataType::DT_BASE_INT: {
113       return std::make_unique<TensorSummary<int32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
114                                                       prev_num_elements);
115     }
116     case DbgDataType::DT_UINT64: {
117       return std::make_unique<TensorSummary<uint64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
118                                                        prev_num_elements);
119     }
120     case DbgDataType::DT_INT64: {
121       return std::make_unique<TensorSummary<int64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
122                                                       prev_num_elements);
123     }
124     case DbgDataType::DT_FLOAT16: {
125       return std::make_unique<TensorSummary<float16>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
126                                                       prev_num_elements);
127     }
128     case DbgDataType::DT_FLOAT32:
129     case DbgDataType::DT_BASE_FLOAT: {
130       return std::make_unique<TensorSummary<float>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
131                                                     prev_num_elements);
132     }
133     case DbgDataType::DT_FLOAT64: {
134       return std::make_unique<TensorSummary<double>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
135                                                      prev_num_elements);
136     }
137     case DbgDataType::DT_BOOL: {
138       return std::make_unique<TensorSummary<bool>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
139                                                    prev_num_elements);
140     }
141     default:
142       MS_LOG(INFO) << "Unsupported tensor type";
143       // return a null pointer
144       return std::unique_ptr<TensorSummary<int32_t>>{};
145   }
146 }
147 
GetTensorStatistics(const std::shared_ptr<TensorData> & tensor)148 DebugServices::TensorStat DebugServices::GetTensorStatistics(const std::shared_ptr<TensorData> &tensor) {
149   if (tensor == nullptr) {
150     MS_LOG(WARNING) << "Tensor is nullptr, returning empty tensor statistics.";
151     TensorStat empty_tensor_stat_data;
152     return empty_tensor_stat_data;
153   }
154   std::unique_ptr<ITensorSummary> base_summary_ptr;
155   void *previous_tensor_ptr = nullptr;
156   base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, tensor->GetNumElements(), 0, tensor->GetType());
157   if (base_summary_ptr == nullptr) {
158     MS_LOG(WARNING) << "base_summary_ptr is nullptr, returning empty tensor statistics.";
159     TensorStat empty_tensor_stat_data;
160     return empty_tensor_stat_data;
161   }
162   base_summary_ptr->TensorStatistics(tensor->GetType());
163   TensorStat tensor_stat_data(tensor->GetByteSize(), tensor->GetType(), tensor->GetShape(), base_summary_ptr->is_bool(),
164                               base_summary_ptr->max_value(), base_summary_ptr->min_value(),
165                               base_summary_ptr->avg_value(), base_summary_ptr->count(),
166                               base_summary_ptr->neg_zero_count(), base_summary_ptr->pos_zero_count(),
167                               base_summary_ptr->nan_count(), base_summary_ptr->neg_inf_count(),
168                               base_summary_ptr->pos_inf_count(), base_summary_ptr->zero_count());
169 
170   return tensor_stat_data;
171 }
172 #ifdef OFFLINE_DBG_MODE
GetPrevTensor(const std::shared_ptr<TensorData> & tensor,bool previous_iter_tensor_needed,uint32_t * prev_num_elements)173 const void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed,
174                                          uint32_t *prev_num_elements) {
175   const void *previous_tensor_ptr = nullptr;
176   std::shared_ptr<TensorData> tensor_prev;
177   if (previous_iter_tensor_needed && tensor->GetIteration() >= 1) {
178     // read data in offline mode
179     std::vector<std::string> file_paths;
180     if (!is_sync_mode_) {
181       ConvertReadTensors(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
182                          std::vector<unsigned int>{tensor->GetDeviceId()},
183                          std::vector<unsigned int>{tensor->GetIteration() - 1},
184                          std::vector<unsigned int>{tensor->GetRootGraphId()}, &file_paths);
185     }
186     std::vector<std::shared_ptr<TensorData>> result_list_prev;
187     ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
188                      std::vector<unsigned int>{tensor->GetDeviceId()},
189                      std::vector<unsigned int>{tensor->GetIteration() - 1},
190                      std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
191                      file_paths, &result_list_prev);
192     tensor_prev = result_list_prev[0];
193     if (!tensor_prev->GetByteSize()) {
194       tensor_prev.reset();
195     } else {
196       previous_tensor_ptr = tensor_prev->GetDataPtr();
197       *prev_num_elements = tensor_prev->GetNumElements();
198     }
199   }
200   return previous_tensor_ptr;
201 }
202 #endif
203 
AddWatchPointsToCheck(bool init_dbg_suspend,bool step_end,bool recheck,const std::shared_ptr<TensorData> & tensor,bool * previous_iter_tensor_needed,std::string * const qualified_tensor_name,std::vector<watchpoint_t> * const watchpoints_to_check)204 void DebugServices::AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck,
205                                           const std::shared_ptr<TensorData> &tensor, bool *previous_iter_tensor_needed,
206                                           std::string *const qualified_tensor_name,
207                                           std::vector<watchpoint_t> *const watchpoints_to_check) {
208   if (tensor == nullptr) {
209     MS_LOG(DEBUG) << "tensor is nullptr.";
210     return;
211   }
212   const auto tensor_name = tensor->GetName();
213   const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
214   const auto tensor_device_id = tensor->GetDeviceId();
215   const auto tensor_root_graph_id = tensor->GetRootGraphId();
216   for (auto w_table_item : watchpoint_table_) {
217     auto wp = std::get<1>(w_table_item);
218     // check ONLY init conditions on initial suspended state.
219     // skip other conditions on initial suspended state
220     if (init_dbg_suspend && (wp.condition.type != INIT)) {
221       continue;
222     }
223     // skip init condition if not init suspend
224     if ((wp.condition.type == INIT) && !init_dbg_suspend) {
225       continue;
226     }
227     // check change conditions only on step end.
228     if (wp.change_condition() && !step_end) {
229       continue;
230     }
231     // if recheck, ignore the cache results and reanalyze everything.
232     // if not a recheck, check only unanalyzed tensors
233     if (!recheck) {
234       wp_lock_.lock();
235       bool wp_cache_hit = wp_id_cache_[tensor_name].count(wp.id);
236       wp_lock_.unlock();
237       if (wp_cache_hit) {
238         continue;
239       }
240     }
241     std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot, tensor_device_id, tensor_root_graph_id);
242     if (!found.empty()) {
243       *qualified_tensor_name = found;
244       watchpoints_to_check->push_back(w_table_item.second);
245 #ifdef OFFLINE_DBG_MODE
246       if (wp.change_condition()) {
247         *previous_iter_tensor_needed = true;
248       }
249 #endif
250     }
251   }
252 }
253 
AddAnalyzedTensorToCache(const bool recheck,const unsigned int id,const std::string & tensor_name)254 void DebugServices::AddAnalyzedTensorToCache(const bool recheck, const unsigned int id,
255                                              const std::string &tensor_name) {
256   // add analyzed tensor to cache
257   if (!recheck) {
258     wp_lock_.lock();
259     (void)wp_id_cache_[tensor_name].insert(id);
260     wp_lock_.unlock();
261   }
262 }
263 
// Appends one watchpoint-hit record to the per-chunk result vectors at index chunk_id.
// Each chunk corresponds to one worker thread (see CheckWatchpoints), and all chunk_*
// vectors stay index-aligned: entry k of every vector describes the same hit.
// device_id / root_graph_id act as opt-in flags: when the caller passed nullptr for
// them, the per-hit device/root-graph ids are not recorded.
void DebugServices::SetCheckWatchpointsResult(
  const int chunk_id, partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  partitioned_numbers *const chunk_exec_orders, partitioned_names *const chunk_time_stamp,
  partitioned_id *const chunk_device_id, partitioned_id *const chunk_root_graph_id,
  std::vector<unsigned int> *const device_id, std::vector<unsigned int> *const root_graph_id, const int exec_order,
  const std::string time_stamp, const std::string &qualified_tensor_name, const std::string &tensor_slot,
  const watchpoint_t &wp, const unsigned int device_id_val, const unsigned int root_graph_id_val,
  const std::vector<parameter_t> &parameter_list, const int32_t error_code) {
  (void)(*chunk_exec_orders)[chunk_id].emplace_back(exec_order);
  (void)(*chunk_names)[chunk_id].emplace_back(qualified_tensor_name);
  (void)(*chunk_slots)[chunk_id].emplace_back(tensor_slot);
  (void)(*chunk_conditions)[chunk_id].emplace_back(wp.condition.type);
  (void)(*chunk_watchpoint_id)[chunk_id].emplace_back(wp.id);
  // Device/root-graph ids are only collected when the caller asked for them.
  if (device_id != nullptr) {
    (void)(*chunk_device_id)[chunk_id].emplace_back(device_id_val);
  }
  if (root_graph_id != nullptr) {
    (void)(*chunk_root_graph_id)[chunk_id].emplace_back(root_graph_id_val);
  }
  (void)(*chunk_parameters)[chunk_id].emplace_back(parameter_list);
  (void)(*chunk_error_codes)[chunk_id].emplace_back(error_code);
  (void)(*chunk_time_stamp)[chunk_id].emplace_back(time_stamp);
}
289 
290 #ifdef OFFLINE_DBG_MODE
ProcessCheckpointsOutofMemory(const bool no_mem_to_read,const std::vector<watchpoint_t> watchpoints_to_check,int chunk_id,partitioned_names * const chunk_names,partitioned_names * const chunk_slots,partitioned_numbers * const chunk_conditions,partitioned_id * const chunk_watchpoint_id,partitioned_parameters * const chunk_parameters,partitioned_error_code * const chunk_error_codes,partitioned_numbers * const chunk_exec_orders,partitioned_names * const chunk_time_stamp,partitioned_id * const chunk_device_id,partitioned_id * const chunk_root_graph_id,std::vector<unsigned int> * const device_id,std::vector<unsigned int> * const root_graph_id,const int exec_order,const std::string time_stamp,const std::string & qualified_tensor_name,const std::string & tensor_slot,const unsigned int device_id_val,const unsigned int root_graph_id_val,const std::vector<parameter_t> & parameter_list)291 void DebugServices::ProcessCheckpointsOutofMemory(
292   const bool no_mem_to_read, const std::vector<watchpoint_t> watchpoints_to_check, int chunk_id,
293   partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
294   partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
295   partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
296   partitioned_numbers *const chunk_exec_orders, partitioned_names *const chunk_time_stamp,
297   partitioned_id *const chunk_device_id, partitioned_id *const chunk_root_graph_id,
298   std::vector<unsigned int> *const device_id, std::vector<unsigned int> *const root_graph_id, const int exec_order,
299   const std::string time_stamp, const std::string &qualified_tensor_name, const std::string &tensor_slot,
300   const unsigned int device_id_val, const unsigned int root_graph_id_val,
301   const std::vector<parameter_t> &parameter_list) {
302   if (no_mem_to_read) {
303     // bit 3 denotes failed to load tensor because tensor is oversized and no enough memory to fit in
304     int32_t oversize_error_code = 8;
305     for (auto &wp : watchpoints_to_check) {
306       SetCheckWatchpointsResult(chunk_id, chunk_names, chunk_slots, chunk_conditions, chunk_watchpoint_id,
307                                 chunk_parameters, chunk_error_codes, chunk_exec_orders, chunk_time_stamp,
308                                 chunk_device_id, chunk_root_graph_id, device_id, root_graph_id, exec_order, time_stamp,
309                                 qualified_tensor_name, tensor_slot, wp, device_id_val, root_graph_id_val,
310                                 parameter_list, oversize_error_code);
311     }
312   }
313 }
314 #endif
315 
// Worker body for CheckWatchpoints: checks tensors [begin, end) of *tensor_list against
// all applicable watchpoints and appends hits to the chunk_id-th slot of every chunk_*
// output vector. Runs on one thread per chunk; each thread writes only its own chunk
// index, so no locking is needed on the chunk_* vectors themselves.
void DebugServices::CheckWatchpointsForTensor(
  partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  const std::vector<std::string> &op_overflows, const std::vector<std::string> &async_file_pool,
  partitioned_numbers *const chunk_exec_orders, std::vector<std::shared_ptr<TensorData>> *const tensor_list, int begin,
  int end, int chunk_id, const bool init_dbg_suspend, const bool step_end, const bool recheck,
  partitioned_id *const chunk_device_id, partitioned_id *const chunk_root_graph_id,
  std::vector<uint64_t> *const chunk_tensor_byte_size, partitioned_names *const chunk_time_stamp,
  std::vector<unsigned int> *const device_id, std::vector<unsigned int> *const root_graph_id) {
  // Clamp the range so the last chunk never runs past the list.
  int list_size = tensor_list->size();
  if (end > list_size) {
    end = list_size;
  }
  for (int i = begin; i < end; i++) {
    auto &tensor = (*tensor_list)[i];
    const auto tensor_name = tensor->GetName();
    // Strip the ":slot" suffix for watchpoint-name matching.
    const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
    const auto tensor_slot = std::to_string(tensor->GetSlot());
    std::vector<watchpoint_t> watchpoints_to_check;
    std::string qualified_tensor_name;
    bool previous_iter_tensor_needed = false;
    AddWatchPointsToCheck(init_dbg_suspend, step_end, recheck, tensor, &previous_iter_tensor_needed,
                          &qualified_tensor_name, &watchpoints_to_check);
    // no wp set on current tensor
    if (watchpoints_to_check.empty()) {
      continue;
    }
#ifdef OFFLINE_DBG_MODE
    // read data in offline mode
    bool no_mem_to_read = false;
    std::vector<std::shared_ptr<TensorData>> result_list;
    ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
                     std::vector<unsigned int>{tensor->GetDeviceId()},
                     std::vector<unsigned int>{tensor->GetIteration()},
                     std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
                     async_file_pool, &result_list, &no_mem_to_read)
    tensor = result_list[0];
    if (!tensor->GetByteSize()) {
      // Empty payload: if it was an out-of-memory load failure, report it per watchpoint;
      // either way this tensor cannot be analyzed.
      ProcessCheckpointsOutofMemory(
        no_mem_to_read, watchpoints_to_check, chunk_id, chunk_names, chunk_slots, chunk_conditions, chunk_watchpoint_id,
        chunk_parameters, chunk_error_codes, chunk_exec_orders, chunk_time_stamp, chunk_device_id, chunk_root_graph_id,
        device_id, root_graph_id, tensor->GetExecutionOrder(), tensor->GetTimeStamp(), qualified_tensor_name,
        tensor_slot, tensor->GetDeviceId(), tensor->GetRootGraphId(), std::vector<parameter_t>());
      tensor.reset();
      continue;
    }
#endif
    // no elements to analyze
    if (tensor->GetByteSize() == 0) {
      continue;
    }
    (*chunk_tensor_byte_size)[chunk_id] += tensor->GetByteSize();
    int tensor_dtype = tensor->GetType();
    uint32_t num_elements = tensor->GetNumElements();
    uint32_t prev_num_elements = 0;
    const void *previous_tensor_ptr = nullptr;
#ifdef OFFLINE_DBG_MODE
    // Offline: the previous iteration (needed for change conditions) is read from dumps.
    previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed, &prev_num_elements);
#else
    // Online: the previous tensor (if any) lives in the tensor loader cache.
    std::shared_ptr<TensorData> prev_tensor_data = tensor_loader_->GetPrevTensor(tensor_name);
    if (prev_tensor_data) {
      previous_tensor_ptr = prev_tensor_data->GetDataPtr();
      prev_num_elements = prev_tensor_data->GetNumElements();
    }
#endif

    std::unique_ptr<ITensorSummary> base_summary_ptr;
    // Summarizing is skipped when the only watchpoint is an overflow check, which does
    // not need element statistics.
    if (!(watchpoints_to_check.size() == 1 && watchpoints_to_check[0].condition.type == IS_OVERFLOW)) {
      base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, num_elements, prev_num_elements, tensor_dtype);
      if (base_summary_ptr != nullptr) {
        base_summary_ptr->SummarizeTensor(watchpoints_to_check);
      }
    }
    for (auto &wp : watchpoints_to_check) {
      bool is_hit = false;
      int error_code = 0;
      std::vector<parameter_t> parameter_list = {};
      if (wp.condition.type == IS_OVERFLOW) {
        is_hit =
          CheckOpOverflow(tensor_name_no_slot, tensor->GetDeviceId(), tensor->GetRootGraphId(), tensor->GetIteration());
      } else if (base_summary_ptr != nullptr) {
        auto item = base_summary_ptr->IsWatchpointHit(wp);
        is_hit = std::get<ITensorSummary::eHitPos>(item);
        error_code = std::get<ITensorSummary::eErrorCodePos>(item);
        parameter_list = std::get<ITensorSummary::eParamListPos>(item);
      }
      // Mark this (tensor, watchpoint) pair analyzed so non-recheck passes can skip it.
      AddAnalyzedTensorToCache(recheck, wp.id, tensor_name);
      // Both hits and analysis errors are reported to the caller.
      if (is_hit || error_code) {
        SetCheckWatchpointsResult(
          chunk_id, chunk_names, chunk_slots, chunk_conditions, chunk_watchpoint_id, chunk_parameters,
          chunk_error_codes, chunk_exec_orders, chunk_time_stamp, chunk_device_id, chunk_root_graph_id, device_id,
          root_graph_id, tensor->GetExecutionOrder(), tensor->GetTimeStamp(), qualified_tensor_name, tensor_slot, wp,
          tensor->GetDeviceId(), tensor->GetRootGraphId(), parameter_list, error_code);
      }
    }

#ifdef OFFLINE_DBG_MODE
    // set the tensor into not-in-use status in tensor_loader.
    std::string key_name_in_cache = tensor_name + ":" + std::to_string(tensor->GetDeviceId()) + ":" +
                                    std::to_string(tensor->GetRootGraphId()) + ":" +
                                    std::to_string(tensor->GetIsOutput()) + ":" + std::to_string(tensor->GetSlot());
    AppendToCacheEvictQueue(key_name_in_cache);
    if (previous_tensor_ptr != nullptr) {
      AppendToCacheEvictQueue(key_name_in_cache + ":prev");
    }
    // in offline mode remove the need for the data
    tensor.reset();
#endif
  }
}
// Checks every tensor in *tensor_list against the registered watchpoints, fanning the
// work out over up to 16 threads. Each thread fills its own chunk of the chunk_* result
// vectors (see CheckWatchpointsForTensor); SortWatchpointsInfo then merges the chunks
// into the flat output vectors (name, slot, condition, ...), which are index-aligned.
void DebugServices::CheckWatchpoints(
  std::vector<std::string> *const name, std::vector<std::string> *const slot, std::vector<int> *const condition,
  std::vector<unsigned int> *const watchpoint_id, std::vector<std::vector<parameter_t>> *const parameters,
  std::vector<int32_t> *const error_codes, const std::vector<std::string> &op_overflows,
  const std::vector<std::string> &async_file_pool, std::vector<std::shared_ptr<TensorData>> *const tensor_list,
  const bool init_dbg_suspend, const bool step_end, const bool recheck, std::vector<unsigned int> *const device_id,
  std::vector<unsigned int> *const root_graph_id) {
  std::lock_guard<std::mutex> lg(lock_);
  auto t1 = std::chrono::high_resolution_clock::now();
  // Nothing to do when no watchpoints are registered or the tensor list is empty.
  if (watchpoint_table_.empty()) {
    return;
  }
  // vector to store execution order of tensors hit
  std::vector<int> exec_order;
  std::vector<std::string> time_stamps;
  int tensor_list_size = tensor_list->size();
  uint64_t tensor_list_byte_size = 0;
  MS_LOG(INFO) << "tensor list size: " << tensor_list_size;
  if (tensor_list_size <= 0) {
    return;
  }
  // default value for number of threads
  const int default_thread_num = 16;
  int max_thread_num = default_thread_num;
  // Never spawn more threads than there are tensors to check.
  if (max_thread_num > tensor_list_size) {
    max_thread_num = tensor_list_size;
  }
  MS_LOG(INFO) << "Number of threads used for checkwatchpoint: " << max_thread_num;
  // Split the tensor list into max_thread_num chunks; the first `remainder` chunks get
  // one extra tensor so every tensor is covered.
  int chunk_size = tensor_list_size / max_thread_num;
  int remainder = tensor_list_size % max_thread_num;
  // One slot per worker thread in every result vector.
  partitioned_numbers chunk_exec_orders(max_thread_num);
  partitioned_names chunk_names(max_thread_num);
  partitioned_names chunk_slots(max_thread_num);
  partitioned_numbers chunk_conditions(max_thread_num);
  partitioned_id chunk_watchpoint_id(max_thread_num);
  partitioned_parameters chunk_parameters(max_thread_num);
  partitioned_error_code chunk_error_codes(max_thread_num);
  partitioned_id chunk_device_id(max_thread_num);
  partitioned_id chunk_root_graph_id(max_thread_num);
  std::vector<uint64_t> chunk_tensor_byte_size(max_thread_num, 0);
  partitioned_names chunk_time_stamp(max_thread_num);

  std::vector<std::future<void>> tensor_future_vec;
  int begin = 0;
  int end = begin;
  for (int i = 0; i < max_thread_num; i++) {
    end += chunk_size;
    if (remainder > 0) {
      end++;
      remainder--;
    }
    // Launch one async worker per chunk; each writes only chunk index i.
    (void)tensor_future_vec.emplace_back(std::async(
      std::launch::async, &DebugServices::CheckWatchpointsForTensor, this, &chunk_names, &chunk_slots,
      &chunk_conditions, &chunk_watchpoint_id, &chunk_parameters, &chunk_error_codes, op_overflows, async_file_pool,
      &chunk_exec_orders, tensor_list, begin, end, i, init_dbg_suspend, step_end, recheck, &chunk_device_id,
      &chunk_root_graph_id, &chunk_tensor_byte_size, &chunk_time_stamp, device_id, root_graph_id));
    begin = end;
  }

  // Waits on all futures, then merges the per-chunk results into the flat outputs.
  SortWatchpointsInfo(&tensor_future_vec, &exec_order, &time_stamps, &tensor_list_byte_size, name, slot, condition,
                      watchpoint_id, parameters, error_codes, &chunk_names, &chunk_slots, &chunk_conditions,
                      &chunk_watchpoint_id, &chunk_parameters, &chunk_error_codes, &chunk_exec_orders,
                      &chunk_time_stamp, &chunk_tensor_byte_size, &chunk_device_id, &chunk_root_graph_id, device_id,
                      root_graph_id);

  auto t2 = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double, std::milli> ms_double = t2 - t1;
  MS_LOG(INFO) << "tensor_list byte size is " << tensor_list_byte_size / pow(10.0, 6.0) << " MB";
  MS_LOG(INFO) << "CheckWatchpoints Took: " << ms_double.count() / 1000 << "s";
}
497 
// Merges the per-chunk (per worker thread) watchpoint-hit results into single,
// globally ordered output vectors. Waits for every async chunk task, then
// inserts each chunk entry into the output vectors at a common sorted
// position: by execution order in online mode, by timestamp in offline mode.
// The per-chunk vectors are swapped empty afterwards to release their memory,
// and the total tensor byte size is accumulated into *tensor_list_byte_size.
// All pointer parameters are required outputs except device_id/root_graph_id,
// which may be nullptr when the caller does not track them.
void DebugServices::SortWatchpointsInfo(
  std::vector<std::future<void>> *const tensor_future_vec, std::vector<int> *const exec_order,
  std::vector<std::string> *const time_stamps, uint64_t *const tensor_list_byte_size,
  std::vector<std::string> *const name, std::vector<std::string> *const slot, std::vector<int> *const condition,
  std::vector<unsigned int> *const watchpoint_id, std::vector<std::vector<parameter_t>> *const parameters,
  std::vector<int32_t> *const error_codes, partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  partitioned_numbers *const chunk_exec_orders, partitioned_names *const chunk_time_stamp,
  std::vector<uint64_t> *const chunk_tensor_byte_size, partitioned_id *const chunk_device_id,
  partitioned_id *const chunk_root_graph_id, std::vector<unsigned int> *const device_id,
  std::vector<unsigned int> *const root_graph_id) {
  for (unsigned int i = 0; i < (*tensor_future_vec).size(); i++) {
    // Block until chunk i has finished; get() also rethrows any stored exception.
    (*tensor_future_vec)[i].wait();
    (*tensor_future_vec)[i].get();
    // NOTE(review): the loop bound uses chunk_exec_orders[i].size() even in
    // offline mode, which indexes chunk_time_stamp[i][j] — assumes both
    // per-chunk vectors are filled in lockstep; confirm at the fill site.
    for (unsigned int j = 0; j < (*chunk_exec_orders)[i].size(); j++) {
#ifdef ONLINE_DBG_MODE
      // if the execution order is repeated,inserts the new one before the others with same execution order.
      std::vector<int>::iterator iter =
        std::lower_bound(exec_order->begin(), exec_order->end(), (*chunk_exec_orders)[i][j]);
      int position = iter - exec_order->begin();
      (void)exec_order->emplace(iter, (*chunk_exec_orders)[i][j]);
#endif
#ifdef OFFLINE_DBG_MODE
      // Offline mode orders hits by file timestamp instead of execution order.
      std::vector<std::string>::iterator iter =
        std::lower_bound(time_stamps->begin(), time_stamps->end(), (*chunk_time_stamp)[i][j]);
      int position = iter - time_stamps->begin();
      (void)time_stamps->emplace(iter, (*chunk_time_stamp)[i][j]);
#endif
      // Insert every parallel attribute at the same index so all output
      // vectors stay aligned with the sort key inserted above.
      (void)name->emplace(name->begin() + position, (*chunk_names)[i][j]);
      (void)slot->emplace(slot->begin() + position, (*chunk_slots)[i][j]);
      (void)condition->emplace(condition->begin() + position, (*chunk_conditions)[i][j]);
      (void)watchpoint_id->emplace(watchpoint_id->begin() + position, (*chunk_watchpoint_id)[i][j]);
      if (device_id != nullptr) {
        (void)device_id->emplace(device_id->begin() + position, (*chunk_device_id)[i][j]);
      }
      if (root_graph_id != nullptr) {
        (void)root_graph_id->emplace(root_graph_id->begin() + position, (*chunk_root_graph_id)[i][j]);
      }
      (void)parameters->emplace(parameters->begin() + position, (*chunk_parameters)[i][j]);
      (void)error_codes->emplace(error_codes->begin() + position, (*chunk_error_codes)[i][j]);
    }
    // free the memory for used vectors
    std::vector<int>().swap((*chunk_exec_orders)[i]);
    std::vector<std::string>().swap((*chunk_time_stamp)[i]);
    std::vector<std::string>().swap((*chunk_names)[i]);
    std::vector<std::string>().swap((*chunk_slots)[i]);
    std::vector<int>().swap((*chunk_conditions)[i]);
    std::vector<unsigned int>().swap((*chunk_watchpoint_id)[i]);
    std::vector<std::vector<parameter_t>>().swap((*chunk_parameters)[i]);
    std::vector<int32_t>().swap((*chunk_error_codes)[i]);
    std::vector<unsigned int>().swap((*chunk_device_id)[i]);
    std::vector<unsigned int>().swap((*chunk_root_graph_id)[i]);
    (*tensor_list_byte_size) += (*chunk_tensor_byte_size)[i];
  }
}
554 
555 #ifdef OFFLINE_DBG_MODE
// Parses a NumPy .npy file on disk and loads its payload into host memory.
// Outputs: *tensor_type = the two-character dtype portion of the header's
// "descr" field (e.g. "f4"), *shape = the parsed shape tuple, *size = payload
// byte size, *data_buffer = a heap-allocated buffer the CALLER owns and must
// delete, *no_mem_to_read = true when the memory-control check refuses the
// load. On any parse/read failure the function logs and returns early,
// leaving the outputs untouched.
void DebugServices::ReadTensorFromNpy(const std::string &tensor_name, const std::string &file_name,
                                      std::string *const tensor_type, std::size_t *const size,
                                      std::vector<int64_t> *const shape, std::vector<char> **const data_buffer,
                                      bool *no_mem_to_read) {
  std::ifstream infile;
  std::string file_path = file_name;
  MS_LOG(INFO) << "Reading in file: " << file_path;
  infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
  if (!infile.is_open()) {
    MS_LOG(ERROR) << "Failed to open file (In ReadTensorFromNpy) " << file_path << " Errno:" << errno;
    const int kMaxFilenameLength = 128;
    char err_info[kMaxFilenameLength];
    // strerror_r: thread-safe errno-to-text conversion for the log.
    auto ret = strerror_r(errno, err_info, sizeof(err_info));
    if (ret != nullptr) {
      MS_LOG(ERROR) << " ErrInfo:" << ret;
    }
    return;
  }
  // npy layout constants: 6-byte magic + 2-byte version, then a 2-byte header
  // length at offset 8, then the header text (npy format version 1.0).
  const int substr_len = 2;
  const int header_len_offset = 8;
  const int header_offset = 9;
  const int header_len_buffer_size = 2;
  const int type_offset = 10;
  // get header length
  (void)infile.seekg(0, std::ios::beg);
  auto header_len_buffer = std::make_unique<std::vector<char>>(header_len_offset + header_len_buffer_size);
  if (!infile.read(header_len_buffer->data(), header_len_offset + header_len_buffer_size)) {
    MS_LOG(ERROR) << "Failed to parse header length from " << file_path;
    return;
  }
  // NOTE(review): reinterpret_cast of the 2 length bytes assumes a
  // little-endian host (the npy field is little-endian) — confirm target
  // platforms are LE.
  uint16_t header_len = *reinterpret_cast<uint16_t *>(header_len_buffer->data() + header_len_offset);
  header_len_buffer.reset();
  // read in header
  (void)infile.seekg(0, std::ios::beg);
  auto header_buffer = std::make_unique<std::vector<char>>(header_len_offset + header_len);
  if (!infile.read(header_buffer->data(), header_len_offset + header_len)) {
    MS_LOG(ERROR) << "Failed to read header from " << file_path;
    return;
  }
  std::string header(header_buffer->data() + header_offset, header_len);
  header_buffer.reset();
  // The dtype string sits type_offset characters past "descr" in the header
  // dictionary, e.g. {'descr': '<f4', ...} — the two chars taken are "f4".
  std::size_t type_i = header.find("descr") + type_offset;
  if (header.length() < type_i + substr_len) {
    MS_LOG(ERROR) << "Cannot get tensor_type, header length is " << header.length();
    return;
  }
  *tensor_type = header.substr(type_i, substr_len);
  // Shape is the comma-separated tuple between the first '(' and ')'.
  std::size_t shape_i_open = header.find("(");
  std::size_t shape_i_close = header.find(")");
  std::string shape_str = header.substr(shape_i_open + 1, shape_i_close - shape_i_open - 1);
  std::string intermediate;
  std::stringstream check_shape(shape_str);
  MS_LOG(INFO) << "Shape of " << file_name << " is: [" << shape_str << "]";
  while (getline(check_shape, intermediate, ',')) {
    shape->push_back(std::stoi(intermediate));
  }
  // Element width is the digit in the dtype string (e.g. the '4' in "f4").
  std::size_t word_size = std::stoul(std::string(1, (*tensor_type)[1]));
  std::size_t data_len = std::accumulate(shape->begin(), shape->end(), 1, std::multiplies<uint64_t>());
  std::size_t data_size = data_len * word_size;
  if (!data_size) {
    return;
  }
  // Check memory available before loading tensor into host.
  bool has_enough_memory = true;
  if (tensor_loader_->EnableMemoryControl()) {
    has_enough_memory = tensor_loader_->CheckMemoryAvailable(tensor_name, data_size);
  }
  if (!has_enough_memory) {
    MS_LOG(ERROR) << "No enough memory available for loading " << tensor_name << " into host memory.";
    *no_mem_to_read = true;
  } else {
    // Payload starts right after the header: 8-byte preamble + 2-byte length
    // field + header_len bytes (type_offset == 10 covers the first two parts).
    (void)infile.seekg(header_len + type_offset);
    *data_buffer = new std::vector<char>(data_size);
    if ((*data_buffer) == nullptr || !infile.read((*data_buffer)->data(), data_size)) {
      MS_LOG(ERROR) << "Unable to get tensor data from npy";
    }
    *size = data_size;
  }
}
635 
ConvertToHostFormat(const std::map<std::string,std::vector<std::string>> & dir_to_files_map,std::vector<std::string> * const result_list)636 void DebugServices::ConvertToHostFormat(const std::map<std::string, std::vector<std::string>> &dir_to_files_map,
637                                         std::vector<std::string> *const result_list) {
638   std::string file_format = "npy";
639   for (auto const &d : dir_to_files_map) {
640     std::vector<std::string> files_to_convert_in_dir;
641     std::vector<std::string> files_after_convert_in_dir;
642     std::string dump_key = d.first;
643     for (auto const &file_name : d.second) {
644       bool already_converted = false;
645       // Remove scope from the file_name for matching files converted by mindinsight tool.
646       std::size_t found_first_dot = file_name.find(".");
647       std::size_t found_last_underscore = file_name.find_last_of("_");
648       std::string file_name_without_scope = file_name;
649       if (found_last_underscore != std::string::npos && found_last_underscore > found_first_dot) {
650         file_name_without_scope =
651           file_name_without_scope.erase(found_first_dot + 1, found_last_underscore - found_first_dot);
652       }
653       for (std::string &file_found : *result_list) {
654         if (file_found.find(file_name_without_scope) != std::string::npos) {
655           already_converted = true;
656           break;
657         }
658       }
659       if (!already_converted) {
660         (void)files_to_convert_in_dir.emplace_back(dump_key + "/" + file_name);
661         (void)files_after_convert_in_dir.emplace_back(dump_key + "/" + file_name_without_scope);
662       }
663     }
664     MS_LOG(INFO) << "Number of files to convert: " << files_to_convert_in_dir.size();
665     if (!files_to_convert_in_dir.empty()) {
666       // Look for the installation path to the conver_async package. If not found, throw exception and terminate the
667       // later task.
668       try {
669         auto pkg = pybind11::module::import("mindspore.offline_debug.convert_async");
670         auto convert_obj = pkg.attr("AsyncDumpConverter")(pybind11::cast(files_to_convert_in_dir), dump_key);
671         (void)convert_obj.attr("convert_files")();
672       } catch (pybind11::error_already_set &e) {
673         MS_LOG(EXCEPTION) << "Failed to convert async dump data: " << e.what();
674       }
675       ProcessConvertToHostFormat(files_after_convert_in_dir, dump_key, result_list, file_format);
676     }
677   }
678 }
679 
ProcessConvertToHostFormat(const std::vector<std::string> & files_after_convert_in_dir,const std::string & dump_key,std::vector<std::string> * const result_list,const std::string & file_format)680 void DebugServices::ProcessConvertToHostFormat(const std::vector<std::string> &files_after_convert_in_dir,
681                                                const std::string &dump_key, std::vector<std::string> *const result_list,
682                                                const std::string &file_format) {
683   std::string real_dump_iter_dir = RealPath(dump_key);
684   DIR *d_handle = opendir(real_dump_iter_dir.c_str());
685   if (d_handle == nullptr) {
686     MS_LOG(ERROR) << "Directory does not exit in ConvertToHostFormat.";
687     return;
688   }
689   struct dirent *dir = nullptr;
690   while ((dir = readdir(d_handle)) != nullptr) {
691     if (dir->d_type == DT_REG) {
692       std::string candidate = dir->d_name;
693       for (const std::string &file_to_find : files_after_convert_in_dir) {
694         std::string file_n = file_to_find;
695         auto last_slash_pos = file_to_find.find_last_of("\\/");
696         if (last_slash_pos != std::string::npos) {
697           file_n = file_to_find.substr(last_slash_pos + 1);
698         }
699         if (candidate.find(file_n) != std::string::npos && candidate.rfind(file_format) != std::string::npos) {
700           // we found a converted file for this op
701           std::string found_file = dump_key + "/" + candidate;
702           if (std::find(result_list->begin(), result_list->end(), found_file) == result_list->end()) {
703             result_list->push_back(found_file);
704           }
705         }
706       }
707     }
708   }
709   (void)closedir(d_handle);
710 }
711 
// Returns the final component of a scoped node name, i.e. everything after
// the last '/' separator. A name with no separator is returned unchanged;
// an empty name yields "".
std::string GetNodeNameWithoutScope(const std::string &dump_style_name) {
  if (dump_style_name.empty()) {
    return "";
  }
  const std::string scope_delim = "/";
  const std::size_t delim_pos = dump_style_name.rfind(scope_delim);
  return (delim_pos == std::string::npos) ? dump_style_name
                                          : dump_style_name.substr(delim_pos + scope_delim.size());
}
724 
ConvertReadTensors(std::vector<std::string> backend_name,std::vector<size_t> slot,std::vector<unsigned int> device_id,std::vector<unsigned int> iteration,std::vector<unsigned int> root_graph_id,std::vector<std::string> * const result_list)725 void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, std::vector<size_t> slot,
726                                        std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
727                                        std::vector<unsigned int> root_graph_id,
728                                        std::vector<std::string> *const result_list) {
729   std::string file_format = "npy";
730   std::map<std::string, std::vector<std::string>> dir_to_files_map;
731   for (unsigned int i = 0; i < backend_name.size(); i++) {
732     // form prefix of the tensor file to read from graph pb node name
733     std::string dump_style_kernel_name = backend_name[i];
734 
735     // remove slot from name
736     std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
737     dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
738 
739     std::string prefix_dump_file_name = GetNodeNameWithoutScope(dump_style_kernel_name);
740 
741     std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
742                                     std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
743 
744     // search files in dir for the one that meets the filename prefix and read the file into memory
745     std::string abspath = RealPath(specific_dump_dir);
746     DIR *d = opendir(abspath.c_str());
747     if (d == nullptr) {
748       MS_LOG(ERROR) << "Directory does not exist in ConvertReadTensors.";
749       return;
750     }
751     ProcessConvertList(prefix_dump_file_name, file_format, specific_dump_dir, &dir_to_files_map, result_list);
752     (void)closedir(d);
753   }
754   ConvertToHostFormat(dir_to_files_map, result_list);
755 }
756 
ConvertWatchPointNodes(const std::vector<std::tuple<std::string,std::string>> & proto_dump,const std::string & specific_dump_dir,std::vector<std::string> * const result_list)757 void DebugServices::ConvertWatchPointNodes(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
758                                            const std::string &specific_dump_dir,
759                                            std::vector<std::string> *const result_list) {
760   std::string file_format = "npy";
761   std::map<std::string, std::vector<std::string>> dir_to_files_map;
762   for (const auto &node : proto_dump) {
763     std::string dump_name = std::get<1>(node);
764     dump_name = dump_name.substr(0, dump_name.rfind("."));
765     // search files in dir for the one that meets the filename prefix and read the file into memory
766     std::string abspath = RealPath(specific_dump_dir);
767     DIR *d = opendir(abspath.c_str());
768     if (d == nullptr) {
769       MS_LOG(ERROR) << "Directory " << specific_dump_dir.c_str() << " does not exist in ConvertWatchPointNodes.";
770       return;
771     }
772     ProcessConvertList(dump_name, file_format, specific_dump_dir, &dir_to_files_map, result_list);
773     (void)closedir(d);
774   }
775   ConvertToHostFormat(dir_to_files_map, result_list);
776 }
777 
ProcessConvertList(const std::string & prefix_dump_file_name,const std::string & file_format,const std::string & specific_dump_dir,std::map<std::string,std::vector<std::string>> * dir_to_files_map,std::vector<std::string> * const result_list)778 void DebugServices::ProcessConvertList(const std::string &prefix_dump_file_name, const std::string &file_format,
779                                        const std::string &specific_dump_dir,
780                                        std::map<std::string, std::vector<std::string>> *dir_to_files_map,
781                                        std::vector<std::string> *const result_list) {
782   DIR *d = opendir(specific_dump_dir.c_str());
783   struct dirent *dir = nullptr;
784   while ((dir = readdir(d)) != nullptr) {
785     if (dir->d_type != DT_REG) {
786       continue;
787     }
788     std::string file_name = dir->d_name;
789     std::string file_name_w_o_perfix = file_name;
790     auto type_pos = file_name.find('.');
791     if (type_pos == std::string::npos || file_name.find(prefix_dump_file_name, type_pos + 1) == std::string::npos) {
792       continue;
793     }
794     if (file_name.rfind(file_format) == std::string::npos) {
795       // if file matches prefix and is in device format add to candidate files to convert.
796       (*dir_to_files_map)[specific_dump_dir].push_back(file_name);
797     } else {
798       // otherwise, if file matches prefix and already has been converted to host format
799       // add to result of converted files.
800       std::string found_file = specific_dump_dir + "/" + file_name;
801       if (std::find(result_list->begin(), result_list->end(), found_file) == result_list->end()) {
802         result_list->push_back(found_file);
803       }
804     }
805   }
806   (void)closedir(d);
807 }
808 
// Builds placeholder TensorData entries (no payload: DataPtr nullptr,
// ByteSize 0, empty type/shape) for every slot of every watched node that has
// a converted dump file in async_file_pool. The actual tensor bytes are read
// later, on demand.
void DebugServices::GetTensorDataInfoAsync(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
                                           const std::string &specific_dump_dir, uint32_t iteration, uint32_t device_id,
                                           uint32_t root_graph_id, const std::vector<std::string> &async_file_pool,
                                           std::vector<std::shared_ptr<TensorData>> *const tensor_list) {
  for (auto &node : proto_dump) {
    std::vector<size_t> slot_list;
    std::string dump_style_name = std::get<1>(node);
    // Get dump_name and output_str from the second element of tuple
    std::size_t found_dot = dump_style_name.rfind(".");
    std::string dump_name = dump_style_name.substr(0, found_dot);
    std::string output_str = dump_style_name.substr(found_dot + 1);
    bool output_flag = (output_str == "output");

    for (const std::string &file_name : async_file_pool) {
      std::size_t found = file_name.find(dump_name);
      std::size_t found_out = file_name.find(output_str);
      // NOTE(review): find(".", found_dot_start) starts AT found_dot_start, so
      // found_dot_end == found_dot_start and the substr length below
      // underflows to npos (substr runs to end of string). std::stoul still
      // yields the slot because it stops at the first non-digit — looks
      // accidental but load-bearing; confirm before changing.
      std::size_t found_dot_start = file_name.find(".", found_out);
      std::size_t found_dot_end = file_name.find(".", found_dot_start);

      // NOTE(review): found_dot_start is not checked against npos; a pool
      // entry matching dump_name/output_str but with no trailing dot would
      // make stoul parse from position 0 and possibly throw — verify pool
      // entries always follow the "...<io>.<slot>..." naming.
      if (file_name.find(specific_dump_dir) != std::string::npos && found != std::string::npos &&
          found_out != std::string::npos) {
        slot_list.push_back(std::stoul(file_name.substr(found_dot_start + 1, found_dot_end - found_dot_start - 1)));
      }
    }
    for (auto slot : slot_list) {
      // add a TensorData entry (data will be read when needed)
      std::vector<int64_t> shape;
      std::string orig_name = std::get<0>(node);
      auto tensor_data = std::make_shared<TensorData>();
      tensor_data->SetName(orig_name);
      tensor_data->SetExecutionOrder(0);
      tensor_data->SetSlot(slot);
      tensor_data->SetIteration(iteration);
      tensor_data->SetDeviceId(device_id);
      tensor_data->SetRootGraphId(root_graph_id);
      tensor_data->SetDataPtr(nullptr);
      tensor_data->SetByteSize(0);
      tensor_data->SetType("");
      tensor_data->SetShape(shape);
      tensor_data->SetIsOutput(output_flag);

      tensor_list->push_back(tensor_data);
    }
  }
}
854 
AddToTensorData(const std::string & backend_name,const std::string & time_stamp,const std::size_t slot,const unsigned int iteration,const unsigned int device_id,const unsigned int root_graph_id,const bool is_output,const std::size_t data_size,const std::string & type_name,const std::vector<int64_t> & shape,std::vector<char> * buffer,std::vector<std::shared_ptr<TensorData>> * const result_list)855 void DebugServices::AddToTensorData(const std::string &backend_name, const std::string &time_stamp,
856                                     const std::size_t slot, const unsigned int iteration, const unsigned int device_id,
857                                     const unsigned int root_graph_id, const bool is_output, const std::size_t data_size,
858                                     const std::string &type_name, const std::vector<int64_t> &shape,
859                                     std::vector<char> *buffer,
860                                     std::vector<std::shared_ptr<TensorData>> *const result_list) {
861   // call LoadNewTensor to store tensor in internal cache
862   auto tensor_data = std::make_shared<TensorData>();
863   tensor_data->SetName(backend_name);
864   tensor_data->SetExecutionOrder(0);
865   tensor_data->SetSlot(slot);
866   tensor_data->SetIteration(iteration);
867   tensor_data->SetDeviceId(device_id);
868   tensor_data->SetRootGraphId(root_graph_id);
869   tensor_data->SetIsOutput(is_output);
870   if (buffer != nullptr) {
871     tensor_data->SetDataPtr(buffer->data());
872   } else {
873     tensor_data->SetDataPtr(nullptr);
874   }
875   tensor_data->SetByteSize(data_size);
876   tensor_data->SetType(type_name);
877   tensor_data->SetShape(shape);
878   tensor_data->SetTimeStamp(time_stamp);
879   if (data_size) {
880     (void)tensor_loader_->LoadNewTensor(tensor_data, false);
881   }
882 
883   // add to result_list
884   result_list->push_back(tensor_data);
885 }
886 
SetPrefixToCheck(std::string * const prefix_dump_file_name,std::string * const slot_string_to_check,std::string * const dump_style_kernel_name,size_t slot,bool is_output)887 void DebugServices::SetPrefixToCheck(std::string *const prefix_dump_file_name, std::string *const slot_string_to_check,
888                                      std::string *const dump_style_kernel_name, size_t slot, bool is_output) {
889   std::string dump_style_name_part = *dump_style_kernel_name;
890   dump_style_name_part = GetNodeNameWithoutScope(dump_style_name_part);
891   std::string slot_str;
892   if (is_output) {
893     slot_str = ".output." + std::to_string(slot);
894   } else {
895     slot_str = ".input." + std::to_string(slot);
896   }
897   dump_style_name_part += slot_str;
898   *prefix_dump_file_name = dump_style_name_part;
899   *slot_string_to_check = slot_str;
900 }
901 
// Returns the file with the newest timestamp from the list, or "" when the
// list is empty. Dump file names embed their timestamp, so (as the original
// sort-then-take-back relied on) the lexicographically greatest entry is the
// newest; max_element finds it in O(n) instead of sorting in O(n log n).
std::string GetNewestFilePath(std::vector<std::string> file_list) {
  if (file_list.empty()) {
    return "";
  }
  return *std::max_element(file_list.begin(), file_list.end());
}
910 
// Extracts the timestamp token from a dump file path. The file name is the
// component after the last '/', and the timestamp is the token between the
// fifth and fourth '.' separators counted from the end of that name.
// Returns "" when the name does not contain enough separators.
std::string GetTimeStampStr(std::string file_path) {
  // get the file_name from file_path (rfind == npos wraps to substr(0)).
  const std::string file_name = file_path.substr(file_path.rfind("/") + 1);
  // Locate the last five '.' separators, walking from the back.
  const int kNumDots = 5;
  std::size_t dot_pos[kNumDots];
  std::size_t search_from = std::string::npos;
  for (int k = 0; k < kNumDots; k++) {
    dot_pos[k] = file_name.rfind(".", search_from);
    search_from = dot_pos[k] - 1;
  }
  const std::size_t fourth_dot = dot_pos[3];
  const std::size_t fifth_dot = dot_pos[4];
  if (fourth_dot != std::string::npos && fifth_dot != std::string::npos && fourth_dot > fifth_dot) {
    return file_name.substr(fifth_dot + 1, fourth_dot - fifth_dot - 1);
  }
  return "";
}
926 
ReadDumpedTensor(std::vector<std::string> backend_name,std::vector<size_t> slot,std::vector<unsigned int> device_id,std::vector<unsigned int> iteration,std::vector<unsigned int> root_graph_id,const std::vector<bool> & is_output,const std::vector<std::string> & async_file_pool,std::vector<std::shared_ptr<TensorData>> * const result_list,bool * no_mem_to_read)927 void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std::vector<size_t> slot,
928                                      std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
929                                      std::vector<unsigned int> root_graph_id, const std::vector<bool> &is_output,
930                                      const std::vector<std::string> &async_file_pool,
931                                      std::vector<std::shared_ptr<TensorData>> *const result_list,
932                                      bool *no_mem_to_read) {
933   for (unsigned int i = 0; i < backend_name.size(); i++) {
934     // form prefix of the tensor file to read from graph pb node name
935     std::string dump_style_kernel_name = backend_name[i];
936 
937     // remove slot from name
938     std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
939     dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);
940 
941     std::string slot_string_to_check;
942     std::string prefix_dump_file_name;
943     SetPrefixToCheck(&prefix_dump_file_name, &slot_string_to_check, &dump_style_kernel_name, slot[i], is_output[i]);
944     std::string prefix_dump_to_check = GetNodeNameWithoutScope(dump_style_kernel_name);
945 
946     std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
947                                     std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);
948 
949     // search files in dir for the one that meets the filename prefix and read the file into memory
950     if (is_sync_mode_) {
951       ReadDumpedTensorSync(prefix_dump_file_name, specific_dump_dir, backend_name[i], slot[i], device_id[i],
952                            iteration[i], root_graph_id[i], is_output[i], result_list, no_mem_to_read);
953     } else {
954       ReadDumpedTensorAsync(specific_dump_dir, prefix_dump_to_check, slot_string_to_check, backend_name[i], slot[i],
955                             device_id[i], iteration[i], root_graph_id[i], is_output[i], async_file_pool, result_list,
956                             no_mem_to_read);
957     }
958   }
959 }
960 
ReadFileAndAddToTensor(const bool found,const std::vector<std::string> & matched_paths,const std::string & backend_name,const unsigned int device_id,const unsigned int root_graph_id,const bool & is_output,size_t slot,bool * no_mem_to_read,unsigned int iteration,std::vector<std::shared_ptr<TensorData>> * result_list)961 void DebugServices::ReadFileAndAddToTensor(const bool found, const std::vector<std::string> &matched_paths,
962                                            const std::string &backend_name, const unsigned int device_id,
963                                            const unsigned int root_graph_id, const bool &is_output, size_t slot,
964                                            bool *no_mem_to_read, unsigned int iteration,
965                                            std::vector<std::shared_ptr<TensorData>> *result_list) {
966   std::string time_stamp = "";
967   std::string type_name = "";
968   uint64_t data_size = 0;
969   std::vector<int64_t> shape;
970   std::vector<char> *buffer = nullptr;
971   if (found) {
972     std::string result_path = GetNewestFilePath(matched_paths);
973     time_stamp = GetTimeStampStr(result_path);
974     std::string key_name_in_cache = backend_name + ":" + std::to_string(device_id) + ":" +
975                                     std::to_string(root_graph_id) + ":" + std::to_string(is_output) + ":" +
976                                     std::to_string(slot);
977     ReadTensorFromNpy(key_name_in_cache, result_path, &type_name, &data_size, &shape, &buffer, no_mem_to_read);
978     AddToTensorData(backend_name, time_stamp, slot, iteration, device_id, root_graph_id, is_output, data_size,
979                     type_name, shape, buffer, result_list);
980   } else {
981     AddToTensorData(backend_name, time_stamp, slot, iteration, device_id, root_graph_id, is_output, 0, type_name, shape,
982                     buffer, result_list);
983     MS_LOG(INFO) << "Target tensor has not been found.";
984   }
985 }
986 
ReadDumpedTensorSync(const std::string & prefix_dump_file_name,const std::string & specific_dump_dir,const std::string & backend_name,size_t slot,const unsigned int device_id,unsigned int iteration,unsigned int root_graph_id,const bool & is_output,std::vector<std::shared_ptr<TensorData>> * result_list,bool * no_mem_to_read)987 void DebugServices::ReadDumpedTensorSync(const std::string &prefix_dump_file_name, const std::string &specific_dump_dir,
988                                          const std::string &backend_name, size_t slot, const unsigned int device_id,
989                                          unsigned int iteration, unsigned int root_graph_id, const bool &is_output,
990                                          std::vector<std::shared_ptr<TensorData>> *result_list, bool *no_mem_to_read) {
991   std::string abspath = RealPath(specific_dump_dir);
992   DIR *d = opendir(abspath.c_str());
993   bool found_file = false;
994   std::vector<std::string> matched_paths;
995   if (d == nullptr) {
996     MS_LOG(INFO) << "Directory " << specific_dump_dir << " does not exist!";
997   } else {
998     struct dirent *dir = nullptr;
999     while ((dir = readdir(d)) != nullptr) {
1000       if (dir->d_type == DT_REG) {
1001         std::string file_name = dir->d_name;
1002         std::string stripped_file_name = GetStrippedFilename(file_name);
1003         if (stripped_file_name.empty()) {
1004           continue;
1005         }
1006         std::size_t found = stripped_file_name.rfind(prefix_dump_file_name, 0);
1007         if (found != 0) {
1008           continue;
1009         }
1010         std::string full_path = specific_dump_dir + "/" + file_name;
1011         matched_paths.push_back(full_path);
1012         found_file = true;
1013       }
1014     }
1015     (void)closedir(d);
1016   }
1017   ReadFileAndAddToTensor(found_file, matched_paths, backend_name, device_id, root_graph_id, is_output, slot,
1018                          no_mem_to_read, iteration, result_list);
1019 }
1020 
ReadDumpedTensorAsync(const std::string & specific_dump_dir,const std::string & prefix_dump_to_check,const std::string & slot_string_to_check,const std::string & backend_name,size_t slot,unsigned int device_id,unsigned int iteration,unsigned int root_graph_id,const bool & is_output,const std::vector<std::string> & async_file_pool,std::vector<std::shared_ptr<TensorData>> * result_list,bool * no_mem_to_read)1021 void DebugServices::ReadDumpedTensorAsync(const std::string &specific_dump_dir, const std::string &prefix_dump_to_check,
1022                                           const std::string &slot_string_to_check, const std::string &backend_name,
1023                                           size_t slot, unsigned int device_id, unsigned int iteration,
1024                                           unsigned int root_graph_id, const bool &is_output,
1025                                           const std::vector<std::string> &async_file_pool,
1026                                           std::vector<std::shared_ptr<TensorData>> *result_list, bool *no_mem_to_read) {
1027   bool found = false;
1028   std::vector<std::string> matched_paths;
1029   // if async mode
1030   for (const std::string &file_path : async_file_pool) {
1031     if (file_path.find(specific_dump_dir) != std::string::npos &&
1032         file_path.find(prefix_dump_to_check) != std::string::npos &&
1033         file_path.find(slot_string_to_check) != std::string::npos) {
1034       matched_paths.push_back(file_path);
1035       found = true;
1036     }
1037   }
1038   ReadFileAndAddToTensor(found, matched_paths, backend_name, device_id, root_graph_id, is_output, slot, no_mem_to_read,
1039                          iteration, result_list);
1040 }
1041 
GetStrippedFilename(const std::string & file_name)1042 std::string DebugServices::GetStrippedFilename(const std::string &file_name) {
1043   // strip off the task_id, stream_id, and timestamp, then compare
1044   size_t first_dot = file_name.find(".");
1045   size_t seventh_dot = file_name.rfind(".", file_name.rfind(".") - 1);
1046   size_t fifth_dot = file_name.rfind(".", file_name.rfind(".", seventh_dot - 1) - 1);
1047 
1048   if (fifth_dot == std::string::npos || fifth_dot <= first_dot) {
1049     return std::string();
1050   }
1051 
1052   // Look for the second dot's position from the back to avoid issue due to dots in the node name.
1053   size_t second_dot = fifth_dot;
1054   const int8_t kSecondDotPosition = 2;
1055   for (int8_t pos = 5; pos > kSecondDotPosition; pos--) {
1056     second_dot = file_name.rfind(".", second_dot - 1);
1057   }
1058 
1059   if (second_dot == std::string::npos || second_dot <= first_dot) {
1060     return std::string();
1061   }
1062 
1063   std::string start_string = file_name.substr(first_dot + 1, second_dot - first_dot - 1);
1064   std::string end_string = file_name.substr(fifth_dot, seventh_dot - fifth_dot);
1065   std::string stripped_file_name = start_string + end_string;
1066   return stripped_file_name;
1067 }
1068 
ReadNeededDumpedTensors(unsigned int iteration,std::vector<std::string> * const async_file_pool)1069 std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
1070   unsigned int iteration, std::vector<std::string> *const async_file_pool) {
1071   // get a list of nodes and the devices they are on to monitor
1072   std::vector<std::shared_ptr<TensorData>> tensor_list;
1073   std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> device_and_graph_to_nodes;
1074   for (auto w_table_item : watchpoint_table_) {
1075     auto wp = std::get<1>(w_table_item);
1076     unsigned int index = 0;
1077     for (auto check_node : wp.check_node_list) {
1078       std::vector<uint32_t> devices = std::get<1>(wp.check_node_device_list[index]);
1079       std::vector<uint32_t> graphs = std::get<1>(wp.check_node_graph_list[index]);
1080       for (auto device : devices) {
1081         for (auto graph : graphs) {
1082           std::tuple<uint32_t, uint32_t> key(device, graph);
1083           device_and_graph_to_nodes[key].push_back(check_node);
1084         }
1085       }
1086 
1087       index++;
1088     }
1089   }
1090 
1091   // scan each device/iteration dir for the watched nodes for each device, and add to tensor_list
1092   // as they are found
1093   for (auto const &device_and_graph_item : device_and_graph_to_nodes) {
1094     std::tuple<uint32_t, uint32_t> device_and_graph = device_and_graph_item.first;
1095     uint32_t device_id = std::get<0>(device_and_graph);
1096     uint32_t root_graph_id = std::get<1>(device_and_graph);
1097     std::vector<std::tuple<std::string, bool>> wp_nodes = device_and_graph_item.second;
1098     std::vector<std::tuple<std::string, std::string>> proto_to_dump;
1099 
1100     std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id) + "/" + net_name_ + "/" +
1101                                     std::to_string(root_graph_id) + "/" + IterationString(iteration);
1102 
1103     // convert node names to dump style
1104     for (auto node : wp_nodes) {
1105       std::string orig_name = std::get<0>(node);
1106       // Remove the scope from the fully qualified name to compare for both sync and async case.
1107       std::string dump_style_name = GetNodeNameWithoutScope(orig_name);
1108 
1109       bool node_is_out = std::get<1>(node);
1110       if (node_is_out) {
1111         dump_style_name += ".output";
1112       } else {
1113         dump_style_name += ".input";
1114       }
1115       if (std::find(proto_to_dump.begin(), proto_to_dump.end(),
1116                     std::tuple<std::string, std::string>(orig_name, dump_style_name)) == proto_to_dump.end()) {
1117         proto_to_dump.push_back(std::tuple<std::string, std::string>(orig_name, dump_style_name));
1118       }
1119     }
1120 
1121     if (is_sync_mode_) {
1122       std::string abspath = RealPath(specific_dump_dir);
1123       ProcessTensorDataSync(proto_to_dump, abspath, specific_dump_dir, iteration, device_id, root_graph_id,
1124                             &tensor_list);
1125     } else {
1126       // convert all files in proto_to_dump to npy and add to pool of async file names
1127       ConvertWatchPointNodes(proto_to_dump, specific_dump_dir, async_file_pool);
1128       GetTensorDataInfoAsync(proto_to_dump, specific_dump_dir, iteration, device_id, root_graph_id, *async_file_pool,
1129                              &tensor_list);
1130     }
1131   }
1132 
1133   return tensor_list;
1134 }
1135 
ProcessTensorDataSync(const std::vector<std::tuple<std::string,std::string>> & proto_to_dump,const std::string & abspath,const std::string & specific_dump_dir,unsigned int iteration,unsigned int device_id,unsigned int root_graph_id,std::vector<std::shared_ptr<TensorData>> * const tensor_list)1136 void DebugServices::ProcessTensorDataSync(const std::vector<std::tuple<std::string, std::string>> &proto_to_dump,
1137                                           const std::string &abspath, const std::string &specific_dump_dir,
1138                                           unsigned int iteration, unsigned int device_id, unsigned int root_graph_id,
1139                                           std::vector<std::shared_ptr<TensorData>> *const tensor_list) {
1140   DIR *d = opendir(abspath.c_str());
1141   if (d == nullptr) {
1142     MS_LOG(ERROR) << "Directory " << specific_dump_dir.c_str() << " does not exist in ReadNeededDumpedTensors.";
1143   } else {
1144     struct dirent *dir = nullptr;
1145     while ((dir = readdir(d)) != nullptr) {
1146       if (dir->d_type == DT_REG) {
1147         std::string file_name = dir->d_name;
1148         for (auto &node : proto_to_dump) {
1149           std::string dump_name = std::get<1>(node);
1150 
1151           std::string stripped_file_name = GetStrippedFilename(file_name);
1152           if (stripped_file_name.empty() || stripped_file_name.length() <= dump_name.length()) {
1153             continue;
1154           }
1155           std::size_t found = stripped_file_name.rfind(dump_name, 0);
1156           if (found == 0) {
1157             size_t slot = std::stoul(stripped_file_name.substr(dump_name.length() + 1));
1158             std::vector<int64_t> shape;
1159             std::string orig_name = std::get<0>(node);
1160             std::string output_str = dump_name.substr(dump_name.rfind(".") + 1);
1161             bool output_flag = (output_str == "output");
1162 
1163             AddToTensorData(orig_name, "", slot, iteration, device_id, root_graph_id, output_flag, 0, "", shape,
1164                             nullptr, tensor_list);
1165             break;
1166           }
1167         }
1168       }
1169     }
1170     (void)closedir(d);
1171   }
1172 }
1173 
IterationString(unsigned int iteration)1174 std::string DebugServices::IterationString(unsigned int iteration) {
1175   std::string iteration_string;
1176   bool init_dbg_suspend = (iteration == UINT_MAX);
1177   if (init_dbg_suspend) {
1178     iteration_string = "init";
1179   } else {
1180     iteration_string = std::to_string(iteration);
1181   }
1182   return iteration_string;
1183 }
1184 #endif
1185 
ReadNodesTensors(const std::vector<std::string> & name,std::vector<std::string> * const ret_name,std::vector<const char * > * const data_ptr,std::vector<ssize_t> * const data_size,std::vector<unsigned int> * const dtype,std::vector<std::vector<int64_t>> * const shape)1186 void DebugServices::ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *const ret_name,
1187                                      std::vector<const char *> *const data_ptr, std::vector<ssize_t> *const data_size,
1188                                      std::vector<unsigned int> *const dtype,
1189                                      std::vector<std::vector<int64_t>> *const shape) {
1190   std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
1191   tensor_loader_->SearchTensors(name, &result_list);
1192 
1193   for (auto result : result_list) {
1194     if (std::get<1>(result) == nullptr) {
1195       continue;
1196     }
1197     (void)ret_name->emplace_back(std::get<0>(result));
1198     (void)data_ptr->emplace_back(reinterpret_cast<const char *>(std::get<1>(result)->GetDataPtr()));
1199     (void)data_size->emplace_back(std::get<1>(result)->GetByteSize());
1200     (void)dtype->emplace_back(std::get<1>(result)->GetType());
1201     (void)shape->emplace_back(std::get<1>(result)->GetShape());
1202   }
1203 }
1204 
SearchNodesTensors(const std::vector<std::string> & name,std::vector<std::tuple<std::string,std::shared_ptr<TensorData>>> * result_list)1205 void DebugServices::SearchNodesTensors(const std::vector<std::string> &name,
1206                                        std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> *result_list) {
1207   if (result_list == nullptr) {
1208     MS_LOG(DEBUG) << "result_list is nullptr.";
1209     return;
1210   }
1211   tensor_loader_->SearchTensors(name, result_list);
1212 }
1213 
1214 #ifdef ONLINE_DBG_MODE
IsWatchPoint(const std::string & kernel_name,const CNodePtr & kernel) const1215 bool DebugServices::IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel) const {
1216   bool ret = false;
1217   for (auto w_table_item : watchpoint_table_) {
1218     auto check_node_list = std::get<1>(w_table_item).check_node_list;
1219     for (auto check_node : check_node_list) {
1220       std::string w_name = std::get<0>(check_node);
1221       bool w_type = std::get<1>(check_node);
1222       if ((w_type == true &&
1223            ((kernel_name.find(w_name) != string::npos && kernel_name.rfind(w_name, 0) == 0) || w_name == "*")) ||
1224           (w_type == false && (kernel_name == w_name || IsWatchPointNodeInput(w_name, kernel)))) {
1225         ret = true;
1226         return ret;
1227       }
1228     }
1229   }
1230   return ret;
1231 }
1232 
IsWatchPointNodeInput(const std::string & w_name,const CNodePtr & kernel) const1233 bool DebugServices::IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const {
1234   if (kernel != nullptr && w_name.length() > 0) {
1235     auto input_size = AnfAlgo::GetInputTensorNum(kernel);
1236     for (size_t j = 0; j < input_size; ++j) {
1237       auto input_kernel = kernel->input(j + 1);
1238       std::string input_kernel_name = GetKernelNodeName(input_kernel);
1239       auto found = w_name.find_last_of('/');
1240       if (found != std::string::npos && w_name.size() > found && w_name.substr(found + 1) == input_kernel_name)
1241         return true;
1242     }
1243     return false;
1244   } else {
1245     return false;
1246   }
1247 }
1248 #endif
1249 
// Returns all tensors currently held by the tensor loader.
std::vector<std::shared_ptr<TensorData>> DebugServices::GetTensor() const { return tensor_loader_->GetTensor(); }
1251 
// Clears the tensor loader's current-iteration tensor map.
void DebugServices::EmptyCurrentTensor() { tensor_loader_->EmptyCurrentTensor(); }
1253 
1254 #ifdef ONLINE_DBG_MODE
// Delegates to the tensor loader to write the named tensor to a file.
// trans_flag selects whether the host (trans-formatted) layout is dumped;
// host_fmt/host_shape/host_type describe the host-side layout, device_type and
// addr_format the device-side one, and slot selects the output slot.
// Returns the tensor loader's success/failure result.
bool DebugServices::DumpTensorToFile(const std::string &tensor_name, bool trans_flag, const std::string &filepath,
                                     const std::string &host_fmt, const std::vector<int64_t> &host_shape,
                                     TypeId host_type, TypeId device_type, const std::string &addr_format,
                                     size_t slot) const {
  return tensor_loader_->DumpTensorToFile(tensor_name, trans_flag, filepath, host_fmt, host_shape, host_type,
                                          device_type, addr_format, slot);
}
1262 #endif
1263 
// Registers a newly read tensor with the tensor loader; keep_prev controls
// whether the previous iteration's copy is retained. Returns the loader's
// success/failure result.
bool DebugServices::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev) {
  return tensor_loader_->LoadNewTensor(tensor, keep_prev);
}
1267 
// Resets per-iteration debugger state: clears the watchpoint id cache and the
// cached overflow results, and rotates the tensor loader's current/previous
// tensor maps. The call order below matters: parameters are moved to the prev
// map before the current map is emptied, then the maps are swapped.
void DebugServices::ResetLoadedTensors() {
  wp_id_cache_.clear();
  MS_LOG(INFO) << "Resetting loaded tensors";
  tensor_loader_->MoveParametersCurrentToPrev();
  tensor_loader_->EmptyCurrentTensor();
  // will move parameters from previous to current map
  tensor_loader_->SwapCurrentPrev();
  overflow_ops_.clear();
}
1277 
1278 #ifdef ONLINE_DBG_MODE
GetNodeTensor(const CNodePtr & kernel)1279 std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensor(const CNodePtr &kernel) {
1280   MS_EXCEPTION_IF_NULL(kernel);
1281   std::vector<std::shared_ptr<TensorData>> result;
1282   auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
1283   auto kernel_name = GetKernelNodeName(kernel);
1284   for (size_t j = 0; j < output_size; ++j) {
1285     auto tensor_name_with_slot = kernel_name + ":" + std::to_string(j);
1286     auto tensor = tensor_loader_->GetTensor(tensor_name_with_slot);
1287     if (tensor != nullptr) {
1288       result.push_back(tensor);
1289     }
1290   }
1291   return result;
1292 }
1293 #endif
1294 
CheckOpOverflow(std::string node_name_to_find,unsigned int device_id,unsigned int root_graph_id,unsigned int iteration)1295 bool DebugServices::CheckOpOverflow(std::string node_name_to_find, unsigned int device_id, unsigned int root_graph_id,
1296                                     unsigned int iteration) {
1297   std::replace(node_name_to_find.begin(), node_name_to_find.end(), '/', '_');
1298   std::vector<std::string> op_names;
1299   std::string overflow_bin_path;
1300 
1301 #ifdef ONLINE_DBG_MODE
1302   if (DumpJsonParser::GetInstance().path().empty()) {
1303     // Dump config is not set.
1304     return false;
1305   }
1306   auto debugger = Debugger::GetInstance();
1307   overflow_bin_path = DumpJsonParser::GetInstance().GetOpOverflowBinPath(debugger->GetGraphPtr()->root_graph_id());
1308   auto realpath = FileUtils::GetRealPath(overflow_bin_path.c_str());
1309   if (!realpath.has_value()) {
1310     MS_LOG(INFO) << "Get real path failed for overflow_bin_path.";
1311     return false;
1312   }
1313   overflow_bin_path = realpath.value() + '/';
1314 #else
1315   overflow_bin_path = dump_dir_ + "/rank_" + std::to_string(device_id) + "/" + net_name_ + "/" +
1316                       std::to_string(root_graph_id) + "/" + IterationString(iteration) + "/";
1317   overflow_bin_path = RealPath(overflow_bin_path);
1318 #endif
1319 
1320   overflow_wp_lock_.lock();
1321 
1322   MS_LOG(INFO) << "Searching for overflow in node " << node_name_to_find;
1323   auto found_overflows = overflow_ops_.find(overflow_bin_path);
1324   if (found_overflows != overflow_ops_.end()) {
1325     MS_LOG(INFO) << "Found already computed overflows for " << overflow_bin_path;
1326     op_names = overflow_ops_[overflow_bin_path];
1327   } else {
1328     std::map<std::pair<uint64_t, uint64_t>, std::string> task_stream_to_opname;
1329     std::vector<std::pair<uint64_t, uint64_t>> task_stream_hit;
1330     const std::string overflow_file_prefix = "Opdebug.Node_OpDebug.";
1331 
1332     MS_LOG(INFO) << "Processing bin file path " << overflow_bin_path;
1333 
1334     std::string abspath = RealPath(overflow_bin_path);
1335     DIR *d = opendir(abspath.c_str());
1336     if (d == nullptr) {
1337       MS_LOG(ERROR) << "OverFlow bin directory does not exist!";
1338     } else {
1339       struct dirent *dir = nullptr;
1340       while ((dir = readdir(d)) != nullptr) {
1341         if (dir->d_type == DT_REG) {
1342           // form fully qualified  filename
1343           std::string file_path = overflow_bin_path;
1344           std::string file_name = dir->d_name;
1345           (void)file_path.append(file_name);
1346           // attempt to read the file
1347           std::ifstream infile;
1348           infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
1349           if (!infile.is_open()) {
1350             MS_LOG(ERROR) << "Failed to open overflow bin file " << file_name << " Errno:" << errno;
1351             continue;
1352           }
1353 
1354           std::string node_name;
1355           uint64_t task_id = 0;
1356           uint64_t stream_id = 0;
1357           // detect overflow bin file
1358           if (file_name.rfind(overflow_file_prefix, 0) == 0) {
1359             // start of op overflow data in bin file
1360             const uint32_t offset = 321;
1361             (void)infile.seekg(offset, std::ios::beg);
1362             std::vector<char> buffer;
1363             // size of op overflow info section
1364             const size_t buf_size = 256;
1365             buffer.resize(buf_size);
1366             (void)infile.read(buffer.data(), buf_size);
1367             if (infile.gcount() != buf_size) {
1368               MS_LOG(ERROR) << "The file: " << file_path << "may be damaged!";
1369               continue;
1370             }
1371             const uint8_t stream_id_offset = 16;
1372             const uint8_t task_id_offset = 24;
1373             // The stream_id and task_id in the dump file are 8 byte fields for extensibility purpose, but only hold 4
1374             // byte values currently.
1375             stream_id = BytestoUInt64(std::vector<char>(buffer.begin() + stream_id_offset, buffer.end()));
1376             task_id = BytestoUInt64(std::vector<char>(buffer.begin() + task_id_offset, buffer.end()));
1377             MS_LOG(INFO) << "Overflow bin file " << file_name << ", task_id " << task_id << ", stream_id " << stream_id
1378                          << ".";
1379             task_stream_hit.push_back(std::make_pair(task_id, stream_id));
1380           } else {
1381             // regular bin file
1382             bool success_parse = GetAttrsFromAsyncFilename(file_name, &node_name, &task_id, &stream_id);
1383             if (success_parse) {
1384               task_stream_to_opname[std::make_pair(task_id, stream_id)] = node_name;
1385             }
1386           }
1387           infile.close();
1388         }
1389       }
1390       (void)closedir(d);
1391     }
1392 
1393     // find the op_names with an overflow hit
1394     for (auto &task_stream : task_stream_hit) {
1395       auto op_name = task_stream_to_opname[task_stream];
1396       if (!op_name.empty()) {
1397         MS_LOG(INFO) << "Operation overflow detected in " << op_name;
1398         op_names.push_back(op_name);
1399       }
1400     }
1401 
1402     overflow_ops_[overflow_bin_path] = op_names;
1403   }
1404 
1405   overflow_wp_lock_.unlock();
1406 
1407   // determine if overflow wp has been triggered for node_name_to_find
1408   if (find(op_names.begin(), op_names.end(), node_name_to_find) != op_names.end()) {
1409     MS_LOG(INFO) << "Operation overflow watchpoint triggered for  " << node_name_to_find;
1410     return true;
1411   }
1412 
1413   return false;
1414 }
1415 
// Parses an async dump file name of the form
//   node_type.node_name.task_id.stream_id.timestamp
// and extracts node_name, task_id and stream_id through the out-parameters.
// Returns false when the name does not have that shape (missing dots, a .npy
// extension, or a numeric field that does not parse with stoull).
// WARNING: node_name may have dots in it, so the last three separators are
// located from the back of the string.
bool DebugServices::GetAttrsFromAsyncFilename(const std::string &file_name, std::string *const node_name,
                                              uint64_t *task_id, uint64_t *stream_id) {
  // Walk backwards from the last dot to find the separators in front of the
  // timestamp, stream_id and task_id fields.
  size_t fourth_dot = file_name.rfind(".");
  size_t third_dot = file_name.rfind(".", fourth_dot - 1);
  size_t second_dot = file_name.rfind(".", third_dot - 1);
  size_t first_dot = file_name.find(".");

  // check if dots were found
  if (first_dot == std::string::npos || second_dot == std::string::npos || third_dot == std::string::npos ||
      fourth_dot == std::string::npos) {
    return false;
  }

  // check if its not an async bin file (.npy files are already converted)
  if (file_name.substr(fourth_dot) == ".npy") {
    return false;
  }

  // get node_name (between the first and the second dot)
  if (first_dot < second_dot) {
    *node_name = file_name.substr(first_dot + 1, second_dot - first_dot - 1);
  } else {
    MS_LOG(ERROR) << "Async filename parse error to get node_name.";
    return false;
  }

  // get task id (between the second and the third dot)
  if (second_dot < third_dot) {
    std::string extracted_task_id = file_name.substr(second_dot + 1, third_dot - second_dot - 1);
    try {
      *task_id = std::stoull(extracted_task_id);
    } catch (std::invalid_argument &e) {
      MS_LOG(ERROR) << "stoull failed on extracted_task_id to get task_id, invalid argument.";
      return false;
    } catch (std::out_of_range &e) {
      MS_LOG(ERROR) << "stoull failed on extracted_task_id to get task_id, out of range.";
      return false;
    }
  } else {
    MS_LOG(ERROR) << "Async filename parse error to get task_id.";
    return false;
  }

  // get stream id (between the third and the fourth dot)
  if (third_dot < fourth_dot) {
    std::string extracted_stream_id = file_name.substr(third_dot + 1, fourth_dot - third_dot - 1);
    try {
      *stream_id = std::stoull(extracted_stream_id);
    } catch (std::invalid_argument &e) {
      MS_LOG(ERROR) << "stoull failed on extracted_stream_id to get stream_id, invalid argument.";
      return false;
    } catch (std::out_of_range &e) {
      MS_LOG(ERROR) << "stoull failed on extracted_stream_id to get stream_id, out of range.";
      return false;
    }
  } else {
    MS_LOG(ERROR) << "Async filename parse error to get stream_id.";
    return false;
  }

  return true;
}
1481 
// Resolves input_path to an absolute, canonical path.
// If input_path contains a '/', only the directory prefix must exist: the
// prefix is canonicalized with realpath(3) and the file name is re-appended
// (so the file itself may not exist yet); "" is returned when the directory
// does not exist. If input_path is a bare file name, realpath is attempted on
// it directly; on failure the zero-initialized buffer yields an empty string.
// Raises (MS_LOG(EXCEPTION)) when the path or file name exceeds PATH_MAX /
// NAME_MAX.
std::string DebugServices::RealPath(const std::string &input_path) {
  if (input_path.length() >= PATH_MAX) {
    MS_LOG(EXCEPTION) << "The length of path: " << input_path << " exceeds limit: " << PATH_MAX;
  }

  size_t path_split_pos = input_path.find_last_of('/');

  // get real path
  char real_path[PATH_MAX] = {0};

  // input_path is dir + file_name
  if (path_split_pos != std::string::npos) {
    std::string prefix_path = input_path.substr(0, path_split_pos);
    // file_name keeps the leading '/' so it can be appended directly.
    std::string file_name = input_path.substr(path_split_pos);

    if (file_name.length() > NAME_MAX) {
      MS_LOG(EXCEPTION) << "The length of file name : " << file_name.length() << " exceeds limit: " << NAME_MAX;
    }
    if (realpath(prefix_path.c_str(), real_path) == nullptr) {
      MS_LOG(ERROR) << "The dir " << prefix_path << " does not exist.";
      return "";
    }

    return std::string(real_path) + file_name;
  }

  // input_path is only file_name
  if (input_path.length() > NAME_MAX) {
    MS_LOG(EXCEPTION) << "The length of file name : " << input_path.length() << " exceeds limit: " << NAME_MAX;
  }
  if (realpath(input_path.c_str(), real_path) == nullptr) {
    // Deliberately not an error: the file may be created later. real_path
    // stays zero-initialized, so an empty string is returned below.
    MS_LOG(INFO) << "The file " << input_path << " does not exist, it will be created.";
  }

  return std::string(real_path);
}
1518 
BytestoUInt64(const std::vector<char> & buffer)1519 uint64_t DebugServices::BytestoUInt64(const std::vector<char> &buffer) {
1520   return le64toh(*reinterpret_cast<const uint64_t *>(buffer.data()));
1521 }
1522 
// Returns true if the named tensor is present in the loader's current map.
bool DebugServices::TensorExistsInCurrent(const std::string &tensor_name) {
  return tensor_loader_->TensorExistsInCurrent(tensor_name);
}
// Moves the named tensor from the loader's current map to its previous map.
void DebugServices::MoveTensorCurrentToPrev(const std::string &tensor_name) {
  tensor_loader_->MoveTensorCurrentToPrev(tensor_name);
}
1529 
// Queues the named tensor for cache eviction, but only when the tensor
// loader's memory-control feature is enabled; otherwise this is a no-op.
void DebugServices::AppendToCacheEvictQueue(const std::string &tensor_name) {
  if (tensor_loader_->EnableMemoryControl()) {
    tensor_loader_->AppendToCacheEvictQueue(tensor_name);
  }
}
1535 
// Sets the network name used when composing dump directory paths.
void DebugServices::SetNetName(std::string net_name) { this->net_name_ = net_name; }
1537 
// Returns the configured network name.
std::string DebugServices::GetNetName() { return net_name_; }
1539 
// Sets the root directory under which dump files are searched.
void DebugServices::SetDumpDir(std::string dump_dir) { this->dump_dir_ = dump_dir; }
1541 
// Returns the configured dump root directory.
std::string DebugServices::GetDumpDir() { return dump_dir_; }
1543 
// Selects sync (true) vs async (false) dump handling.
void DebugServices::SetSyncMode(bool is_sync_mode) { this->is_sync_mode_ = is_sync_mode; }
1545 
// Returns true when sync dump handling is selected.
bool DebugServices::GetSyncMode() { return is_sync_mode_; }
1547 
// Forwards the tensor-cache memory budget (in bytes) to the tensor loader.
void DebugServices::SetMemLimit(uint64_t max_mem_size) { tensor_loader_->SetMemTotal(max_mem_size); }
1549 
1550 #ifdef ONLINE_DBG_MODE
1551 }  // namespace mindspore
1552 #endif
1553