• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020-2023 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #ifndef MINDSPORE_CCSRC_DEBUG_DEBUG_SERVICES_H_
17 #define MINDSPORE_CCSRC_DEBUG_DEBUG_SERVICES_H_
18 
19 #ifdef OFFLINE_DBG_MODE
20 #include "base/float16.h"
21 #endif
22 
#include <algorithm>
#include <atomic>
#include <cmath>
#include <future>
#include <limits>
#include <map>
#include <memory>
#include <mutex>
#include <set>
#include <sstream>
#include <string>
#include <thread>
#include <tuple>
#include <unordered_map>
#include <utility>
#include <vector>
#include "debug/tensor_load.h"
#include "include/backend/debug/tensor_data.h"
38 
39 namespace mindspore {
// Core debugger service: owns watchpoint registration and checking, tensor
// loading/caching and (in offline mode) reading of dumped tensor files.
class DebugServices {
 public:
  DebugServices();

  DebugServices(const DebugServices &other);

  DebugServices &operator=(const DebugServices &other);

  ~DebugServices() = default;
  // Indices into a tuple describing a file-name attribute match:
  // start position, end position and the matched substring.
  enum File_ATTR_MATCH { START_POS = 0, END_POS = 1, STR_POS = 2 };

  // Kinds of watchpoint conditions a user can register. The *_GT/*_LT values
  // are grouped by watchpoint_t::is_gt_wp()/is_lt_wp(); the CHANGE_*/
  // NOT_CHANGED values compare against the previous iteration's tensor
  // (see watchpoint_t::change_condition()).
  enum CONDITION_TYPE {
    HAS_NAN,
    HAS_INF,
    IS_OVERFLOW,
    MAX_GT,
    MAX_LT,
    MIN_GT,
    MIN_LT,
    MAX_MIN_GT,
    MAX_MIN_LT,
    MEAN_GT,
    MEAN_LT,
    SD_GT,
    SD_LT,
    GENERAL_OVERFLOW,
    INIT,
    TOO_LARGE,
    TOO_SMALL,
    ALL_ZERO,
    CHANGE_TOO_LARGE,
    CHANGE_TOO_SMALL,
    NOT_CHANGED,
    RANGE
  };

  // A watchpoint condition together with its single scalar parameter.
  struct condition_t {
    CONDITION_TYPE type;
    float parameter = 0;
  };
80 
81   struct parameter_t {
82     std::string name;
83     bool disabled;
84     double_t value;
85     bool hit;
86     double_t actual_value;
Evaluateparameter_t87     void Evaluate(double_t actualValue, std::string inequality_type) {
88       if (std::isnan(actualValue)) {
89         return;
90       }
91 
92       actual_value = actualValue;
93       // if cannot extract inequality type from watchpoint
94       // try extract from parameter name
95       if (inequality_type.empty()) {
96         auto pos = name.find_last_of('_');
97         if (pos != std::string::npos) {
98           inequality_type = name.substr(pos + 1);
99         }
100       }
101 
102       std::map<std::string, bool> condition_check{{"gt", actual_value > value},
103                                                   {"lt", actual_value < value},
104                                                   {"ge", actual_value >= value},
105                                                   {"le", actual_value <= value}};
106 
107       hit = condition_check[inequality_type];
108     }
109   };
  // Files found in one dump directory, split into raw .bin files and .npy
  // files grouped by op name.
  struct MappedFiles {
    std::vector<std::string> bin_files;
    // key is op_name and value is the vector of matched npy files to that op name.
    std::map<std::string, std::vector<std::string>> npy_files;
  };

  // Attributes parsed out of one dumped-tensor file name.
  struct DumpFileAttr {
    std::string file_path;
    // name_to_match is the op_name extracted from file name.
    std::string name_to_match;
    std::string time_stamp;
    // slot index encoded in the file name; presumably the op's I/O index -- TODO confirm
    uint64_t slot = 0;
    // whether the file holds an output (vs an input) of the op
    bool is_output{false};
  };
124 
125   struct ProtoDump {
126     bool operator==(const ProtoDump obj) {
127       return (origin_node_name == obj.origin_node_name && dump_name == obj.dump_name && is_output == obj.is_output);
128     }
129     // name_to_match is the op_name between first and second dot in file_name
130     std::string origin_node_name;
131     std::string dump_name;
132     bool is_output{false};
133   };
134 
135   typedef std::vector<std::vector<int>> partitioned_numbers;
136   typedef std::vector<std::vector<std::string>> partitioned_names;
137   typedef std::vector<std::vector<std::vector<parameter_t>>> partitioned_parameters;
138   typedef std::vector<std::vector<int32_t>> partitioned_error_code;
139   typedef std::vector<std::vector<unsigned int>> partitioned_id;
140   typedef std::set<std::string> NPYFilePool;
141   typedef std::map<std::string, std::vector<std::tuple<std::string, std::string>>> DirMap;
142   // key is dump dir path and value is vector of bin files and map of npy files.
143   typedef std::map<std::string, DebugServices::MappedFiles> DumpFileMap;
144   typedef std::map<std::string, std::vector<DebugServices::DumpFileAttr>> ProcessedNPYFiles;
145   // bool shows if preprocess was successful, and DumpFileMap is preprocessed file result
146   typedef std::tuple<bool, DumpFileMap> AsyncPreProcessResult;
147 
148   struct watchpoint_t {
149     unsigned int id;
150     condition_t condition;
151     std::vector<std::tuple<std::string, bool>> check_node_list;
152     std::vector<std::tuple<std::string, std::vector<uint32_t>>> check_node_device_list;
153     std::vector<std::tuple<std::string, std::vector<uint32_t>>> check_node_graph_list;
154     std::vector<parameter_t> parameter_list;
155     size_t location = 0;
156 
FindQualifiedTensorNamewatchpoint_t157     std::string FindQualifiedTensorName(const std::string &tensor_name, unsigned const int &tensor_device_id,
158                                         unsigned const int &tensor_root_graph_id) const {
159       size_t indx = 0;
160       for (auto check_node : check_node_list) {
161         std::string w_name = std::get<0>(check_node);
162         bool w_type = std::get<1>(check_node);
163         auto found = w_name.find_last_of('/');
164         bool check_tensor_name = found != std::string::npos && w_name.substr(found + 1) == tensor_name;
165         bool check_node_name =
166           (w_type && (tensor_name == w_name || w_name == "*")) || (!w_type && tensor_name == w_name);
167         if (check_tensor_name || check_node_name) {
168           // online debugger only support single card
169           if (check_node_device_list.empty()) {
170             return w_name;
171           }
172           auto device_vec = std::get<1>(check_node_device_list[indx]);
173           auto root_graph_vec = std::get<1>(check_node_graph_list[indx]);
174           auto iter1 = std::find(device_vec.begin(), device_vec.end(), tensor_device_id);
175           auto iter2 = std::find(root_graph_vec.begin(), root_graph_vec.end(), tensor_root_graph_id);
176           if (iter1 != device_vec.end() && iter2 != root_graph_vec.end()) {
177             return w_name;
178           }
179         }
180         indx++;
181       }
182       return {};
183     }
184 
is_gt_wpwatchpoint_t185     bool is_gt_wp() const {
186       return condition.type == MAX_GT || condition.type == MIN_GT || condition.type == MEAN_GT ||
187              condition.type == SD_GT || condition.type == MAX_MIN_GT;
188     }
189 
is_lt_wpwatchpoint_t190     bool is_lt_wp() const {
191       return condition.type == MAX_LT || condition.type == MIN_LT || condition.type == MEAN_LT ||
192              condition.type == SD_LT || condition.type == MAX_MIN_LT;
193     }
194 
195     // for parameter_list of the condition TOO_LARGE/TOO_SMALL, the meaning of parameter_list is:
196     // parameter_list[0]: the absolute mean value is set; parameter_list[1]: the max value is set;
197     // parameter_list[2]: the min is set; parameter_list[3]: the mean value is set.
198     // mean or sd related condition set
mean_sd_enabledwatchpoint_t199     bool mean_sd_enabled() const {
200       return condition.type == MEAN_LT || condition.type == MEAN_GT || condition.type == SD_LT ||
201              condition.type == SD_GT || (condition.type == TOO_LARGE && !parameter_list[3].disabled) ||
202              (condition.type == TOO_SMALL && !parameter_list[3].disabled);
203     }
abs_mean_enabledwatchpoint_t204     bool abs_mean_enabled() const {
205       return (condition.type == TOO_LARGE && !parameter_list[0].disabled) ||
206              (condition.type == TOO_SMALL && !parameter_list[0].disabled);
207     }
208 
tensor_update_ratio_mean_enabledwatchpoint_t209     bool tensor_update_ratio_mean_enabled() const {
210       return condition.type == CHANGE_TOO_LARGE || condition.type == CHANGE_TOO_SMALL;
211     }
allclose_enabledwatchpoint_t212     bool allclose_enabled() const { return condition.type == NOT_CHANGED; }
213 
214     // for parameter_list of the condition RANGE, the meaning of parameter_list is:
215     // parameter_list[0]: the elements value in range is lower than setting percentage is set;
216     // parameter_list[1]: the elements value in range is higher than setting percentage is set.
range_enabledwatchpoint_t217     bool range_enabled() const {
218       return condition.type == RANGE && (!parameter_list[0].disabled || !parameter_list[1].disabled);
219     }
220 
change_conditionwatchpoint_t221     bool change_condition() const {
222       return condition.type == CHANGE_TOO_LARGE || condition.type == CHANGE_TOO_SMALL || condition.type == NOT_CHANGED;
223     }
224   };
225 
226   struct TensorBase {
TensorBaseTensorBase227     TensorBase(uint64_t data_size, int dtype, const std::vector<int64_t> &shape)
228         : data_size(data_size), dtype(dtype), shape(shape) {}
229     TensorBase() = default;
230     uint64_t data_size = 0;
231     int dtype = 0;
232     std::vector<int64_t> shape;
233   };
234 
235   struct TensorStat {
236     TensorStat(uint64_t data_size, int dtype, const std::vector<int64_t> &shape, bool is_bool, double max_value,
237                double min_value, double avg_value, uint64_t count, uint64_t neg_zero_count, uint64_t pos_zero_count,
238                uint64_t nan_count, uint64_t neg_inf_count, uint64_t pos_inf_count, uint64_t zero_count, double l2_value,
239                std::string md5 = "")
data_sizeTensorStat240         : data_size(data_size),
241           dtype(dtype),
242           shape(shape),
243           is_bool(is_bool),
244           max_value(max_value),
245           min_value(min_value),
246           avg_value(avg_value),
247           count(count),
248           neg_zero_count(neg_zero_count),
249           pos_zero_count(pos_zero_count),
250           nan_count(nan_count),
251           neg_inf_count(neg_inf_count),
252           pos_inf_count(pos_inf_count),
253           zero_count(zero_count),
254           l2_value(l2_value),
255           md5(md5) {}
256 
257     TensorStat() = default;
258 
259     uint64_t data_size = 0;
260     int dtype = 0;
261     std::vector<int64_t> shape;
262     bool is_bool = false;
263     double max_value = std::numeric_limits<double>::lowest();
264     double min_value = std::numeric_limits<double>::max();
265     double avg_value = 0.0;
266     uint64_t count = 0;
267     uint64_t neg_zero_count = 0;
268     uint64_t pos_zero_count = 0;
269     uint64_t nan_count = 0;
270     uint64_t neg_inf_count = 0;
271     uint64_t pos_inf_count = 0;
272     uint64_t zero_count = 0;
273     double l2_value = 0.0;
274     std::string md5 = "";
275     std::map<std::string, std::string> header_item_map;
DoubleToStringTensorStat276     std::string DoubleToString(double value) {
277       std::ostringstream ss;
278       ss << value;
279       return ss.str();
280     }
UpdateHeaderItemMapTensorStat281     void UpdateHeaderItemMap() {
282       header_item_map = {{"max", DoubleToString(max_value)},
283                          {"min", DoubleToString(min_value)},
284                          {"avg", DoubleToString(avg_value)},
285                          {"count", std::to_string(count)},
286                          {"negative zero count", std::to_string(neg_zero_count)},
287                          {"positive zero count", std::to_string(pos_zero_count)},
288                          {"nan count", std::to_string(nan_count)},
289                          {"negative inf count", std::to_string(neg_inf_count)},
290                          {"positive inf count", std::to_string(pos_inf_count)},
291                          {"zero count", std::to_string(zero_count)},
292                          {"l2norm", DoubleToString(l2_value)},
293                          {"md5", md5}};
294     }
295   };
296 
  // Watchpoint-hit results gathered by the parallel check; the outer vector
  // of every partitioned_* member is indexed by chunk id, and entries at the
  // same inner index describe the same hit.
  struct ChunkData {
    partitioned_names chunk_names;            // hit tensor names
    partitioned_names chunk_slots;            // hit tensor slot strings
    partitioned_numbers chunk_conditions;     // condition type per hit
    partitioned_id chunk_watchpoint_id;       // id of the watchpoint that fired
    partitioned_parameters chunk_parameters;  // evaluated parameters per hit
    partitioned_error_code chunk_error_codes; // read/check error codes per hit
    partitioned_numbers chunk_exec_orders;    // execution order of each hit tensor
    partitioned_id chunk_device_id;
    partitioned_id chunk_root_graph_id;
    std::vector<uint64_t> chunk_tensor_byte_size;  // bytes processed per chunk
    partitioned_names chunk_time_stamp;
  };
310 
  // Computes summary statistics (max/min/avg, special-value counts, l2 norm,
  // optional md5) over the given tensor's data.
  static TensorStat GetTensorStatistics(const std::shared_ptr<TensorData> &tensor);

  // ---- Watchpoint registration and checking ----

  // Registers watchpoint `id` with condition `watch_condition` over the nodes
  // in check_node_list, optionally restricted per device / per root graph.
  void AddWatchpoint(
    int id, int watch_condition, float parameter, const std::vector<std::tuple<std::string, bool>> &check_node_list,
    const std::vector<parameter_t> &parameter_list,
    const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_device_list = nullptr,
    const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_graph_list = nullptr);

  void RemoveWatchpoint(unsigned int id);

#ifdef OFFLINE_DBG_MODE
  // Records error codes into chunk_data when a tensor's value could not be
  // read (out of memory, or no value found and error_on_no_value is set).
  void CheckOutofMemoryandNoValue(const bool no_mem_to_read, const bool error_on_no_value,
                                  const std::vector<watchpoint_t> watchpoints_to_check, const int chunk_id,
                                  ChunkData *chunk_data, std::vector<unsigned int> *const device_id,
                                  std::vector<unsigned int> *const root_graph_id, const int exec_order,
                                  const std::string time_stamp, const std::string &qualified_tensor_name,
                                  const std::string &tensor_slot, const unsigned int device_id_val,
                                  const unsigned int root_graph_id_val,
                                  const std::vector<parameter_t> &parameter_list) const;
#endif

  // Returns a raw pointer to the previous iteration's value of `tensor_name`,
  // reporting its element count through prev_num_elements.
  const void *PreparePrevTensor(uint64_t *prev_num_elements, const std::string &tensor_name);

  void CheckHistoryErrorCode(int *error_code, bool history_not_found) const;

  // Checks tensors [begin, end) of tensor_list against the registered
  // watchpoints, storing results for chunk `chunk_id` into chunk_data.
  void CheckWatchpointsForTensor(ChunkData *chunk_data, ProcessedNPYFiles *const processed_npy_files,
                                 std::vector<std::shared_ptr<TensorData>> *const tensor_list, int begin, int end,
                                 int chunk_id, const bool init_dbg_suspend, const bool step_end, const bool recheck,
                                 std::vector<unsigned int> *device_id, std::vector<unsigned int> *root_graph_id,
                                 bool error_on_no_value = false);

  // ---- Operator-overflow lookups: (task id, stream id) <-> op names ----

  void GetOverflowTaskStreamId(const std::string &overflow_bin_path,
                               std::vector<std::pair<uint64_t, uint64_t>> *task_stream_hits) const;

  void GetTaskStreamIdNodeMap(const std::string &tensor_path,
                              std::map<std::pair<uint64_t, uint64_t>, std::string> *task_stream_to_opnames) const;

  void AddOpOverflowOpNames(const std::string &overflow_bin_path, const std::string &tensors_path,
                            std::vector<std::string> *op_names) const;

  // Main watchpoint-check entry point. Hit results are returned through the
  // parallel out-parameter vectors (same index = same hit).
  void CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<int> *condition,
                        std::vector<unsigned int> *const watchpoint_id,
                        std::vector<std::vector<parameter_t>> *parameters, std::vector<int32_t> *error_code,
                        ProcessedNPYFiles *const processed_npy_files,
                        std::vector<std::shared_ptr<TensorData>> *tensor_list, bool init_dbg_suspend,
                        const bool step_end, const bool recheck, std::vector<unsigned int> *device_id = nullptr,
                        std::vector<unsigned int> *root_graph_id = nullptr, bool error_on_no_value = false);

  // Merges the per-chunk results in chunk_data into the flat out-parameter
  // vectors of CheckWatchpoints (presumably ordered by execution order --
  // TODO confirm against the .cc implementation).
  void SortWatchpointsInfo(std::vector<std::future<void>> *const tensor_future_vec, std::vector<int> *exec_order,
                           std::vector<std::string> *time_stamps, uint64_t *tensor_list_byte_size,
                           std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<int> *condition,
                           std::vector<unsigned int> *const watchpoint_id,
                           std::vector<std::vector<parameter_t>> *parameters, std::vector<int32_t> *error_codes,
                           ChunkData *chunk_data, std::vector<unsigned int> *device_id,
                           std::vector<unsigned int> *root_graph_id) const;
#ifdef OFFLINE_DBG_MODE
  void SetTensorToNotInUse(const std::shared_ptr<TensorData> &tensor, const void *previous_tensor_ptr);
#endif

  // Collects the watchpoints applicable to `tensor` for this checking pass.
  void AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck,
                             const std::shared_ptr<TensorData> &tensor, bool *previous_iter_tensor_needed,
                             std::string *qualified_tensor_name, std::vector<watchpoint_t> *watchpoints_to_check);

  // Appends one hit (or error) for watchpoint `wp` to chunk `chunk_id`.
  void SetCheckWatchpointsResult(const int chunk_id, ChunkData *chunk_data, std::vector<unsigned int> *device_id,
                                 std::vector<unsigned int> *root_graph_id, const int exec_order,
                                 const std::string time_stamp, const std::string &qualified_tensor_name,
                                 const std::string &tensor_slot, const watchpoint_t &wp,
                                 const unsigned int device_id_val, const unsigned int root_graph_id_val,
                                 const std::vector<parameter_t> &parameter_list, const int32_t error_code) const;
#ifdef OFFLINE_DBG_MODE
  // ---- Offline mode: reading dumped tensors from disk ----

  // Wraps a raw dump buffer into a TensorData and appends it to result_list.
  void AddToTensorData(const std::string &backend_name, const std::string &time_stamp, const std::size_t slot,
                       const unsigned int iteration, const unsigned int device_id, const unsigned int root_graph_id,
                       const bool is_output, const std::size_t data_size, const std::string &type_name,
                       const std::vector<int64_t> &shape, char *buffer,
                       std::vector<std::shared_ptr<TensorData>> *const result_list);

  void SetPrefixToCheck(std::string *const prefix_dump_file_name, std::string *const slot_string_to_check,
                        std::string *const dump_style_kernel_name, size_t slot, bool is_output);

  // Reads the dumped tensors identified by the parallel backend_name/slot/
  // device_id/iteration/root_graph_id/is_output vectors into result_list.
  void ReadDumpedTensor(std::vector<std::string> backend_name, std::vector<size_t> slot,
                        std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
                        std::vector<unsigned int> root_graph_id, const std::vector<bool> &is_output,
                        ProcessedNPYFiles *const processed_npy_files,
                        std::vector<std::shared_ptr<TensorData>> *const result_list, bool is_base_request,
                        bool *no_mem_to_read = nullptr);

  void ProcessTensorDataSync(const std::vector<ProtoDump> &proto_to_dump, const std::string &specific_dump_dir,
                             ProcessedNPYFiles processed_npy_files, unsigned int iteration, unsigned int device_id,
                             unsigned int root_graph_id, std::vector<std::shared_ptr<TensorData>> *const tensor_list,
                             bool error_on_no_value = false);

  void ReadFileAndAddToTensor(const bool found, const std::vector<std::string> &matched_paths,
                              const std::vector<std::string> &matched_time_stamps, const std::string &backend_name,
                              const unsigned int device_id, const unsigned int root_graph_id, bool is_output,
                              size_t slot, bool *no_mem_to_read, unsigned int iteration,
                              std::vector<std::shared_ptr<TensorData>> *result_list, bool is_base_request = false);

  void ReadDumpedTensorSync(const std::string &prefix_dump_file_name, const std::string &specific_dump_dir,
                            const std::string &backend_name, size_t slot, unsigned int device_id,
                            unsigned int iteration, unsigned int root_graph_id, const bool &is_output,
                            std::vector<std::shared_ptr<TensorData>> *result_list, bool *no_mem_to_read);

  void ReadDumpedTensorUtils(const std::string &specific_dump_dir, const std::string &prefix_dump_to_check,
                             const std::string &backend_name, size_t slot, unsigned int device_id,
                             unsigned int iteration, unsigned int root_graph_id, bool is_output,
                             const ProcessedNPYFiles &processed_npy_files,
                             std::vector<std::shared_ptr<TensorData>> *result_list, bool *no_mem_to_read,
                             bool is_base_request = false);

  // Reads every tensor any watchpoint needs for `iteration`.
  std::vector<std::shared_ptr<TensorData>> ReadNeededDumpedTensors(unsigned int iteration,
                                                                   ProcessedNPYFiles *const processed_npy_files,
                                                                   bool error_on_no_value = false);

  const void *GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed,
                            uint64_t *prev_num_elements, bool *history_not_found);

  // Parses an .npy file, returning dtype string, byte size, shape and a
  // heap buffer with the data through the out parameters.
  void ReadTensorFromNpy(const std::string &tensor_name, const std::string &file_name, std::string *const tensor_type,
                         std::size_t *const size, std::vector<int64_t> *const shape, char **const data_buffer,
                         bool *no_mem_to_read, bool is_base_request = false);

  // ---- Dump-directory scanning / format conversion ----

  AsyncPreProcessResult PreProcessDumpDirAsync(const std::string &specific_dump_dir) const;

  DebugServices::NPYFilePool PreProcessDumpDirSync(const std::string &specific_dump_dir) const;

  ProcessedNPYFiles ProcessNPYFilePool(const NPYFilePool &npy_file_pool) const;

  void ConvertToHostFormat(const DirMap &dir_to_files_map, NPYFilePool *const result_list) const;

  void ProcessConvertToHostFormat(const std::vector<std::string> &files_after_convert_in_dir,
                                  const std::string &dump_key, NPYFilePool *const result_list) const;

  void ConvertReadTensors(std::vector<std::string> backend_name, std::vector<size_t> slot,
                          std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
                          std::vector<unsigned int> root_graph_id, NPYFilePool *const result_list);

  void ConvertWatchPointNodes(const DumpFileMap &dump_dir_mapped_files, const std::vector<ProtoDump> &proto_dump,
                              const std::string &specific_dump_dir, NPYFilePool *const result_list) const;

  void ProcessConvertList(const DumpFileMap &dump_dir_mapped_files, const std::string &prefix_dump_file_name,
                          const std::string &specific_dump_dir, DirMap *dir_to_files_map,
                          NPYFilePool *const result_list) const;

  void GetTensorDataInfoAsync(const std::vector<ProtoDump> &proto_dump, const std::string &specific_dump_dir,
                              uint32_t iteration, uint32_t device_id, uint32_t root_graph_id,
                              const ProcessedNPYFiles &processed_async_files,
                              std::vector<std::shared_ptr<TensorData>> *const tensor_list);

  // ---- Graph run-history bookkeeping ----

  void SetGraphsHistory();

  std::vector<uint32_t> GetDumpRankIdList();

  void CheckDumpGraphIdList(std::vector<uint32_t> rank_id_list);

  void ReadGraphsHistory(uint32_t rank_id, uint32_t root_graph_id);

  // Returns, per (rank, graph), the union of nodes any watchpoint checks.
  std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> GetAllWpNodes();

  void ReadGraphRunIter(std::string file_path, std::tuple<uint32_t, uint32_t> rank_and_graph);

  std::string IterationString(unsigned int iteration) const;
#endif
  // Looks up the named tensors and returns their raw data through the
  // parallel out-parameter vectors.
  void ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *ret_name,
                        std::vector<const char *> *data_ptr, std::vector<ssize_t> *data_size,
                        std::vector<unsigned int> *dtype, std::vector<std::vector<int64_t>> *const shape);

  void SearchNodesTensors(const std::vector<std::string> &name,
                          std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> *result_list);
#ifndef OFFLINE_DBG_MODE
  // ---- Online mode only ----

  bool IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel = nullptr) const;

  bool IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const;

  bool CompareCurrentRootGraph(uint32_t id) const;
#endif

  // ---- Tensor cache access ----

  std::vector<std::shared_ptr<TensorData>> GetTensor() const;

  std::shared_ptr<TensorData> GetTensor(const std::string &tensor_name) const;

  void AddAnalyzedTensorToCache(const bool recheck, const unsigned int id, const std::string &tensor_name);

  void EmptyCurrentTensor();

#ifndef OFFLINE_DBG_MODE
  bool DumpTensorToFile(const std::string &filepath, const std::string &tensor_name, size_t slot) const;
#endif

  bool LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev);

  uint32_t GetPrevIteration(const std::shared_ptr<TensorData> &tensor);

  void ResetLoadedTensors();
#ifndef OFFLINE_DBG_MODE
  std::vector<std::shared_ptr<TensorData>> GetNodeTensor(const CNodePtr &kernel);
#endif

  // Find if any operation overflow happened on a particular node name
  bool CheckOpOverflow(std::string node_name_to_find, unsigned int device_id = 0, unsigned int root_graph_id = 0,
                       unsigned int iteration = 0);

  std::string RemoveKernelGraphPrefix(std::string node_name_to_find) const;

  // Extracts task id and stream id from an overflow debug file name.
  bool GetTaskIdStreamId(std::string file_name, std::string overflow_file_prefix, uint64_t *const task_id,
                         uint64_t *const stream_id) const;

  bool GetAttrsFromFilename(const std::string &file_name, std::string *const node_name, uint64_t *const task_id,
                            uint64_t *const stream_id) const;

  std::string RealPath(const std::string &input_path) const;

  bool TensorExistsInCurrent(const std::string &tensor_name);

  void MoveTensorCurrentToPrev(const std::string &tensor_name);

  void AppendToCacheEvictQueue(const std::string &tensor_name);

  // ---- Simple configuration accessors ----

  void SetNetName(std::string net_name);

  std::string GetNetName();

  void SetDumpDir(std::string dump_dir);

  std::string GetDumpDir();

  void SetSyncMode(bool is_sync_mode);

  bool GetSyncMode() const;

  void SetMemLimit(uint64_t max_mem_size);

  void CheckWatchpointProgress(size_t tensor_list_size);

  // Number of tensors processed so far by the current watchpoint check.
  size_t GetProcessedTensorCount() const { return tensor_processed_count_; }

 private:
  std::mutex lock_;
  std::mutex wp_lock_;
  std::mutex overflow_wp_lock_;

  // to keep track of watchpoints that have been checked already for a tensor in current step
  std::unordered_map<std::string, std::set<int32_t>> wp_id_cache_;
  // all registered watchpoints, keyed by watchpoint id
  std::unordered_map<unsigned int, watchpoint_t> watchpoint_table_;
  // key is the iteration path, value is vector of op_names which have overflowed
  std::unordered_map<std::string, std::vector<std::string>> overflow_ops_;
  std::string net_name_;
  std::string dump_dir_;
  // store history of graphs that have been run (rank_id, graph_id)
  std::map<std::tuple<uint32_t, uint32_t>, std::vector<uint32_t>> graphs_run_history_;
  bool is_sync_mode_{false};
  // processed tensors in checkwatchpoint function
  std::atomic<size_t> tensor_processed_count_{0};
  bool wp_progress_enabled_{false};
  std::unique_ptr<std::thread> wp_progress_thread_;
  std::shared_ptr<TensorLoader> tensor_loader_;
};
566 }  // namespace mindspore
567 
568 #endif  // MINDSPORE_CCSRC_DEBUG_DEBUG_SERVICES_H_
569