1 /** 2 * Copyright 2020-2023 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #ifndef MINDSPORE_CCSRC_DEBUG_DEBUG_SERVICES_H_ 17 #define MINDSPORE_CCSRC_DEBUG_DEBUG_SERVICES_H_ 18 19 #ifdef OFFLINE_DBG_MODE 20 #include "base/float16.h" 21 #endif 22 23 #include <cmath> 24 #include <vector> 25 #include <future> 26 #include <string> 27 #include <memory> 28 #include <tuple> 29 #include <unordered_map> 30 #include <set> 31 #include <mutex> 32 #include <map> 33 #include <limits> 34 #include <sstream> 35 #include <utility> 36 #include "debug/tensor_load.h" 37 #include "include/backend/debug/tensor_data.h" 38 39 namespace mindspore { 40 class DebugServices { 41 public: 42 DebugServices(); 43 44 DebugServices(const DebugServices &other); 45 46 DebugServices &operator=(const DebugServices &other); 47 48 ~DebugServices() = default; 49 enum File_ATTR_MATCH { START_POS = 0, END_POS = 1, STR_POS = 2 }; 50 51 enum CONDITION_TYPE { 52 HAS_NAN, 53 HAS_INF, 54 IS_OVERFLOW, 55 MAX_GT, 56 MAX_LT, 57 MIN_GT, 58 MIN_LT, 59 MAX_MIN_GT, 60 MAX_MIN_LT, 61 MEAN_GT, 62 MEAN_LT, 63 SD_GT, 64 SD_LT, 65 GENERAL_OVERFLOW, 66 INIT, 67 TOO_LARGE, 68 TOO_SMALL, 69 ALL_ZERO, 70 CHANGE_TOO_LARGE, 71 CHANGE_TOO_SMALL, 72 NOT_CHANGED, 73 RANGE 74 }; 75 76 struct condition_t { 77 CONDITION_TYPE type; 78 float parameter = 0; 79 }; 80 81 struct parameter_t { 82 std::string name; 83 bool disabled; 84 double_t value; 85 bool hit; 86 double_t actual_value; Evaluateparameter_t87 void Evaluate(double_t actualValue, std::string inequality_type) { 88 if (std::isnan(actualValue)) { 89 return; 90 } 91 92 actual_value = actualValue; 93 // if cannot extract inequality type from watchpoint 94 // try extract from parameter name 95 if (inequality_type.empty()) { 96 auto pos = name.find_last_of('_'); 97 if (pos != std::string::npos) { 98 inequality_type = name.substr(pos + 1); 99 } 100 } 101 102 std::map<std::string, bool> condition_check{{"gt", actual_value > value}, 103 {"lt", actual_value < value}, 104 {"ge", actual_value >= value}, 105 {"le", actual_value <= value}}; 106 107 hit = condition_check[inequality_type]; 108 } 109 }; 110 struct MappedFiles { 111 std::vector<std::string> bin_files; 112 // key is op_name and value is the vector of matched npy files to that op name. 113 std::map<std::string, std::vector<std::string>> npy_files; 114 }; 115 116 struct DumpFileAttr { 117 std::string file_path; 118 // name_to_match is the op_name extracted from file name. 119 std::string name_to_match; 120 std::string time_stamp; 121 uint64_t slot = 0; 122 bool is_output{false}; 123 }; 124 125 struct ProtoDump { 126 bool operator==(const ProtoDump obj) { 127 return (origin_node_name == obj.origin_node_name && dump_name == obj.dump_name && is_output == obj.is_output); 128 } 129 // name_to_match is the op_name between first and second dot in file_name 130 std::string origin_node_name; 131 std::string dump_name; 132 bool is_output{false}; 133 }; 134 135 typedef std::vector<std::vector<int>> partitioned_numbers; 136 typedef std::vector<std::vector<std::string>> partitioned_names; 137 typedef std::vector<std::vector<std::vector<parameter_t>>> partitioned_parameters; 138 typedef std::vector<std::vector<int32_t>> partitioned_error_code; 139 typedef std::vector<std::vector<unsigned int>> partitioned_id; 140 typedef std::set<std::string> NPYFilePool; 141 typedef std::map<std::string, std::vector<std::tuple<std::string, std::string>>> DirMap; 142 // key is dump dir path and value is vector of bin files and map of npy files. 143 typedef std::map<std::string, DebugServices::MappedFiles> DumpFileMap; 144 typedef std::map<std::string, std::vector<DebugServices::DumpFileAttr>> ProcessedNPYFiles; 145 // bool shows if preprocess was successful, and DumpFileMap is preprocessed file result 146 typedef std::tuple<bool, DumpFileMap> AsyncPreProcessResult; 147 148 struct watchpoint_t { 149 unsigned int id; 150 condition_t condition; 151 std::vector<std::tuple<std::string, bool>> check_node_list; 152 std::vector<std::tuple<std::string, std::vector<uint32_t>>> check_node_device_list; 153 std::vector<std::tuple<std::string, std::vector<uint32_t>>> check_node_graph_list; 154 std::vector<parameter_t> parameter_list; 155 size_t location = 0; 156 FindQualifiedTensorNamewatchpoint_t157 std::string FindQualifiedTensorName(const std::string &tensor_name, unsigned const int &tensor_device_id, 158 unsigned const int &tensor_root_graph_id) const { 159 size_t indx = 0; 160 for (auto check_node : check_node_list) { 161 std::string w_name = std::get<0>(check_node); 162 bool w_type = std::get<1>(check_node); 163 auto found = w_name.find_last_of('/'); 164 bool check_tensor_name = found != std::string::npos && w_name.substr(found + 1) == tensor_name; 165 bool check_node_name = 166 (w_type && (tensor_name == w_name || w_name == "*")) || (!w_type && tensor_name == w_name); 167 if (check_tensor_name || check_node_name) { 168 // online debugger only support single card 169 if (check_node_device_list.empty()) { 170 return w_name; 171 } 172 auto device_vec = std::get<1>(check_node_device_list[indx]); 173 auto root_graph_vec = std::get<1>(check_node_graph_list[indx]); 174 auto iter1 = std::find(device_vec.begin(), device_vec.end(), tensor_device_id); 175 auto iter2 = std::find(root_graph_vec.begin(), root_graph_vec.end(), tensor_root_graph_id); 176 if (iter1 != device_vec.end() && iter2 != root_graph_vec.end()) { 177 return w_name; 178 } 179 } 180 indx++; 181 } 182 return {}; 183 } 184 is_gt_wpwatchpoint_t185 bool is_gt_wp() const { 186 return condition.type == MAX_GT || condition.type == MIN_GT || condition.type == MEAN_GT || 187 condition.type == SD_GT || condition.type == MAX_MIN_GT; 188 } 189 is_lt_wpwatchpoint_t190 bool is_lt_wp() const { 191 return condition.type == MAX_LT || condition.type == MIN_LT || condition.type == MEAN_LT || 192 condition.type == SD_LT || condition.type == MAX_MIN_LT; 193 } 194 195 // for parameter_list of the condition TOO_LARGE/TOO_SMALL, the meaning of parameter_list is: 196 // parameter_list[0]: the absolute mean value is set; parameter_list[1]: the max value is set; 197 // parameter_list[2]: the min is set; parameter_list[3]: the mean value is set. 198 // mean or sd related condition set mean_sd_enabledwatchpoint_t199 bool mean_sd_enabled() const { 200 return condition.type == MEAN_LT || condition.type == MEAN_GT || condition.type == SD_LT || 201 condition.type == SD_GT || (condition.type == TOO_LARGE && !parameter_list[3].disabled) || 202 (condition.type == TOO_SMALL && !parameter_list[3].disabled); 203 } abs_mean_enabledwatchpoint_t204 bool abs_mean_enabled() const { 205 return (condition.type == TOO_LARGE && !parameter_list[0].disabled) || 206 (condition.type == TOO_SMALL && !parameter_list[0].disabled); 207 } 208 tensor_update_ratio_mean_enabledwatchpoint_t209 bool tensor_update_ratio_mean_enabled() const { 210 return condition.type == CHANGE_TOO_LARGE || condition.type == CHANGE_TOO_SMALL; 211 } allclose_enabledwatchpoint_t212 bool allclose_enabled() const { return condition.type == NOT_CHANGED; } 213 214 // for parameter_list of the condition RANGE, the meaning of parameter_list is: 215 // parameter_list[0]: the elements value in range is lower than setting percentage is set; 216 // parameter_list[1]: the elements value in range is higher than setting percentage is set. range_enabledwatchpoint_t217 bool range_enabled() const { 218 return condition.type == RANGE && (!parameter_list[0].disabled || !parameter_list[1].disabled); 219 } 220 change_conditionwatchpoint_t221 bool change_condition() const { 222 return condition.type == CHANGE_TOO_LARGE || condition.type == CHANGE_TOO_SMALL || condition.type == NOT_CHANGED; 223 } 224 }; 225 226 struct TensorBase { TensorBaseTensorBase227 TensorBase(uint64_t data_size, int dtype, const std::vector<int64_t> &shape) 228 : data_size(data_size), dtype(dtype), shape(shape) {} 229 TensorBase() = default; 230 uint64_t data_size = 0; 231 int dtype = 0; 232 std::vector<int64_t> shape; 233 }; 234 235 struct TensorStat { 236 TensorStat(uint64_t data_size, int dtype, const std::vector<int64_t> &shape, bool is_bool, double max_value, 237 double min_value, double avg_value, uint64_t count, uint64_t neg_zero_count, uint64_t pos_zero_count, 238 uint64_t nan_count, uint64_t neg_inf_count, uint64_t pos_inf_count, uint64_t zero_count, double l2_value, 239 std::string md5 = "") data_sizeTensorStat240 : data_size(data_size), 241 dtype(dtype), 242 shape(shape), 243 is_bool(is_bool), 244 max_value(max_value), 245 min_value(min_value), 246 avg_value(avg_value), 247 count(count), 248 neg_zero_count(neg_zero_count), 249 pos_zero_count(pos_zero_count), 250 nan_count(nan_count), 251 neg_inf_count(neg_inf_count), 252 pos_inf_count(pos_inf_count), 253 zero_count(zero_count), 254 l2_value(l2_value), 255 md5(md5) {} 256 257 TensorStat() = default; 258 259 uint64_t data_size = 0; 260 int dtype = 0; 261 std::vector<int64_t> shape; 262 bool is_bool = false; 263 double max_value = std::numeric_limits<double>::lowest(); 264 double min_value = std::numeric_limits<double>::max(); 265 double avg_value = 0.0; 266 uint64_t count = 0; 267 uint64_t neg_zero_count = 0; 268 uint64_t pos_zero_count = 0; 269 uint64_t nan_count = 0; 270 uint64_t neg_inf_count = 0; 271 uint64_t pos_inf_count = 0; 272 uint64_t zero_count = 0; 273 double l2_value = 0.0; 274 std::string md5 = ""; 275 std::map<std::string, std::string> header_item_map; DoubleToStringTensorStat276 std::string DoubleToString(double value) { 277 std::ostringstream ss; 278 ss << value; 279 return ss.str(); 280 } UpdateHeaderItemMapTensorStat281 void UpdateHeaderItemMap() { 282 header_item_map = {{"max", DoubleToString(max_value)}, 283 {"min", DoubleToString(min_value)}, 284 {"avg", DoubleToString(avg_value)}, 285 {"count", std::to_string(count)}, 286 {"negative zero count", std::to_string(neg_zero_count)}, 287 {"positive zero count", std::to_string(pos_zero_count)}, 288 {"nan count", std::to_string(nan_count)}, 289 {"negative inf count", std::to_string(neg_inf_count)}, 290 {"positive inf count", std::to_string(pos_inf_count)}, 291 {"zero count", std::to_string(zero_count)}, 292 {"l2norm", DoubleToString(l2_value)}, 293 {"md5", md5}}; 294 } 295 }; 296 297 struct ChunkData { 298 partitioned_names chunk_names; 299 partitioned_names chunk_slots; 300 partitioned_numbers chunk_conditions; 301 partitioned_id chunk_watchpoint_id; 302 partitioned_parameters chunk_parameters; 303 partitioned_error_code chunk_error_codes; 304 partitioned_numbers chunk_exec_orders; 305 partitioned_id chunk_device_id; 306 partitioned_id chunk_root_graph_id; 307 std::vector<uint64_t> chunk_tensor_byte_size; 308 partitioned_names chunk_time_stamp; 309 }; 310 311 static TensorStat GetTensorStatistics(const std::shared_ptr<TensorData> &tensor); 312 313 void AddWatchpoint( 314 int id, int watch_condition, float parameter, const std::vector<std::tuple<std::string, bool>> &check_node_list, 315 const std::vector<parameter_t> ¶meter_list, 316 const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_device_list = nullptr, 317 const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_graph_list = nullptr); 318 319 void RemoveWatchpoint(unsigned int id); 320 321 #ifdef OFFLINE_DBG_MODE 322 void CheckOutofMemoryandNoValue(const bool no_mem_to_read, const bool error_on_no_value, 323 const std::vector<watchpoint_t> watchpoints_to_check, const int chunk_id, 324 ChunkData *chunk_data, std::vector<unsigned int> *const device_id, 325 std::vector<unsigned int> *const root_graph_id, const int exec_order, 326 const std::string time_stamp, const std::string &qualified_tensor_name, 327 const std::string &tensor_slot, const unsigned int device_id_val, 328 const unsigned int root_graph_id_val, 329 const std::vector<parameter_t> ¶meter_list) const; 330 #endif 331 332 const void *PreparePrevTensor(uint64_t *prev_num_elements, const std::string &tensor_name); 333 334 void CheckHistoryErrorCode(int *error_code, bool history_not_found) const; 335 336 void CheckWatchpointsForTensor(ChunkData *chunk_data, ProcessedNPYFiles *const processed_npy_files, 337 std::vector<std::shared_ptr<TensorData>> *const tensor_list, int begin, int end, 338 int chunk_id, const bool init_dbg_suspend, const bool step_end, const bool recheck, 339 std::vector<unsigned int> *device_id, std::vector<unsigned int> *root_graph_id, 340 bool error_on_no_value = false); 341 342 void GetOverflowTaskStreamId(const std::string &overflow_bin_path, 343 std::vector<std::pair<uint64_t, uint64_t>> *task_stream_hits) const; 344 345 void GetTaskStreamIdNodeMap(const std::string &tensor_path, 346 std::map<std::pair<uint64_t, uint64_t>, std::string> *task_stream_to_opnames) const; 347 348 void AddOpOverflowOpNames(const std::string &overflow_bin_path, const std::string &tensors_path, 349 std::vector<std::string> *op_names) const; 350 351 void CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<int> *condition, 352 std::vector<unsigned int> *const watchpoint_id, 353 std::vector<std::vector<parameter_t>> *parameters, std::vector<int32_t> *error_code, 354 ProcessedNPYFiles *const processed_npy_files, 355 std::vector<std::shared_ptr<TensorData>> *tensor_list, bool init_dbg_suspend, 356 const bool step_end, const bool recheck, std::vector<unsigned int> *device_id = nullptr, 357 std::vector<unsigned int> *root_graph_id = nullptr, bool error_on_no_value = false); 358 359 void SortWatchpointsInfo(std::vector<std::future<void>> *const tensor_future_vec, std::vector<int> *exec_order, 360 std::vector<std::string> *time_stamps, uint64_t *tensor_list_byte_size, 361 std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<int> *condition, 362 std::vector<unsigned int> *const watchpoint_id, 363 std::vector<std::vector<parameter_t>> *parameters, std::vector<int32_t> *error_codes, 364 ChunkData *chunk_data, std::vector<unsigned int> *device_id, 365 std::vector<unsigned int> *root_graph_id) const; 366 #ifdef OFFLINE_DBG_MODE 367 void SetTensorToNotInUse(const std::shared_ptr<TensorData> &tensor, const void *previous_tensor_ptr); 368 #endif 369 370 void AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck, 371 const std::shared_ptr<TensorData> &tensor, bool *previous_iter_tensor_needed, 372 std::string *qualified_tensor_name, std::vector<watchpoint_t> *watchpoints_to_check); 373 374 void SetCheckWatchpointsResult(const int chunk_id, ChunkData *chunk_data, std::vector<unsigned int> *device_id, 375 std::vector<unsigned int> *root_graph_id, const int exec_order, 376 const std::string time_stamp, const std::string &qualified_tensor_name, 377 const std::string &tensor_slot, const watchpoint_t &wp, 378 const unsigned int device_id_val, const unsigned int root_graph_id_val, 379 const std::vector<parameter_t> ¶meter_list, const int32_t error_code) const; 380 #ifdef OFFLINE_DBG_MODE 381 void AddToTensorData(const std::string &backend_name, const std::string &time_stamp, const std::size_t slot, 382 const unsigned int iteration, const unsigned int device_id, const unsigned int root_graph_id, 383 const bool is_output, const std::size_t data_size, const std::string &type_name, 384 const std::vector<int64_t> &shape, char *buffer, 385 std::vector<std::shared_ptr<TensorData>> *const result_list); 386 387 void SetPrefixToCheck(std::string *const prefix_dump_file_name, std::string *const slot_string_to_check, 388 std::string *const dump_style_kernel_name, size_t slot, bool is_output); 389 390 void ReadDumpedTensor(std::vector<std::string> backend_name, std::vector<size_t> slot, 391 std::vector<unsigned int> device_id, std::vector<unsigned int> iteration, 392 std::vector<unsigned int> root_graph_id, const std::vector<bool> &is_output, 393 ProcessedNPYFiles *const processed_npy_files, 394 std::vector<std::shared_ptr<TensorData>> *const result_list, bool is_base_request, 395 bool *no_mem_to_read = nullptr); 396 397 void ProcessTensorDataSync(const std::vector<ProtoDump> &proto_to_dump, const std::string &specific_dump_dir, 398 ProcessedNPYFiles processed_npy_files, unsigned int iteration, unsigned int device_id, 399 unsigned int root_graph_id, std::vector<std::shared_ptr<TensorData>> *const tensor_list, 400 bool error_on_no_value = false); 401 402 void ReadFileAndAddToTensor(const bool found, const std::vector<std::string> &matched_paths, 403 const std::vector<std::string> &matched_time_stamps, const std::string &backend_name, 404 const unsigned int device_id, const unsigned int root_graph_id, bool is_output, 405 size_t slot, bool *no_mem_to_read, unsigned int iteration, 406 std::vector<std::shared_ptr<TensorData>> *result_list, bool is_base_request = false); 407 408 void ReadDumpedTensorSync(const std::string &prefix_dump_file_name, const std::string &specific_dump_dir, 409 const std::string &backend_name, size_t slot, unsigned int device_id, 410 unsigned int iteration, unsigned int root_graph_id, const bool &is_output, 411 std::vector<std::shared_ptr<TensorData>> *result_list, bool *no_mem_to_read); 412 413 void ReadDumpedTensorUtils(const std::string &specific_dump_dir, const std::string &prefix_dump_to_check, 414 const std::string &backend_name, size_t slot, unsigned int device_id, 415 unsigned int iteration, unsigned int root_graph_id, bool is_output, 416 const ProcessedNPYFiles &processed_npy_files, 417 std::vector<std::shared_ptr<TensorData>> *result_list, bool *no_mem_to_read, 418 bool is_base_request = false); 419 420 std::vector<std::shared_ptr<TensorData>> ReadNeededDumpedTensors(unsigned int iteration, 421 ProcessedNPYFiles *const processed_npy_files, 422 bool error_on_no_value = false); 423 424 const void *GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed, 425 uint64_t *prev_num_elements, bool *history_not_found); 426 427 void ReadTensorFromNpy(const std::string &tensor_name, const std::string &file_name, std::string *const tensor_type, 428 std::size_t *const size, std::vector<int64_t> *const shape, char **const data_buffer, 429 bool *no_mem_to_read, bool is_base_request = false); 430 431 AsyncPreProcessResult PreProcessDumpDirAsync(const std::string &specific_dump_dir) const; 432 433 DebugServices::NPYFilePool PreProcessDumpDirSync(const std::string &specific_dump_dir) const; 434 435 ProcessedNPYFiles ProcessNPYFilePool(const NPYFilePool &npy_file_pool) const; 436 437 void ConvertToHostFormat(const DirMap &dir_to_files_map, NPYFilePool *const result_list) const; 438 439 void ProcessConvertToHostFormat(const std::vector<std::string> &files_after_convert_in_dir, 440 const std::string &dump_key, NPYFilePool *const result_list) const; 441 442 void ConvertReadTensors(std::vector<std::string> backend_name, std::vector<size_t> slot, 443 std::vector<unsigned int> device_id, std::vector<unsigned int> iteration, 444 std::vector<unsigned int> root_graph_id, NPYFilePool *const result_list); 445 446 void ConvertWatchPointNodes(const DumpFileMap &dump_dir_mapped_files, const std::vector<ProtoDump> &proto_dump, 447 const std::string &specific_dump_dir, NPYFilePool *const result_list) const; 448 449 void ProcessConvertList(const DumpFileMap &dump_dir_mapped_files, const std::string &prefix_dump_file_name, 450 const std::string &specific_dump_dir, DirMap *dir_to_files_map, 451 NPYFilePool *const result_list) const; 452 453 void GetTensorDataInfoAsync(const std::vector<ProtoDump> &proto_dump, const std::string &specific_dump_dir, 454 uint32_t iteration, uint32_t device_id, uint32_t root_graph_id, 455 const ProcessedNPYFiles &processed_async_files, 456 std::vector<std::shared_ptr<TensorData>> *const tensor_list); 457 458 void SetGraphsHistory(); 459 460 std::vector<uint32_t> GetDumpRankIdList(); 461 462 void CheckDumpGraphIdList(std::vector<uint32_t> rank_id_list); 463 464 void ReadGraphsHistory(uint32_t rank_id, uint32_t root_graph_id); 465 466 std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> GetAllWpNodes(); 467 468 void ReadGraphRunIter(std::string file_path, std::tuple<uint32_t, uint32_t> rank_and_graph); 469 470 std::string IterationString(unsigned int iteration) const; 471 #endif 472 void ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *ret_name, 473 std::vector<const char *> *data_ptr, std::vector<ssize_t> *data_size, 474 std::vector<unsigned int> *dtype, std::vector<std::vector<int64_t>> *const shape); 475 476 void SearchNodesTensors(const std::vector<std::string> &name, 477 std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> *result_list); 478 #ifndef OFFLINE_DBG_MODE 479 bool IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel = nullptr) const; 480 481 bool IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const; 482 483 bool CompareCurrentRootGraph(uint32_t id) const; 484 #endif 485 486 std::vector<std::shared_ptr<TensorData>> GetTensor() const; 487 488 std::shared_ptr<TensorData> GetTensor(const std::string &tensor_name) const; 489 490 void AddAnalyzedTensorToCache(const bool recheck, const unsigned int id, const std::string &tensor_name); 491 492 void EmptyCurrentTensor(); 493 494 #ifndef OFFLINE_DBG_MODE 495 bool DumpTensorToFile(const std::string &filepath, const std::string &tensor_name, size_t slot) const; 496 #endif 497 498 bool LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev); 499 500 uint32_t GetPrevIteration(const std::shared_ptr<TensorData> &tensor); 501 502 void ResetLoadedTensors(); 503 #ifndef OFFLINE_DBG_MODE 504 std::vector<std::shared_ptr<TensorData>> GetNodeTensor(const CNodePtr &kernel); 505 #endif 506 507 // Find if any operation overflow happened on a particular node name 508 bool CheckOpOverflow(std::string node_name_to_find, unsigned int device_id = 0, unsigned int root_graph_id = 0, 509 unsigned int iteration = 0); 510 511 std::string RemoveKernelGraphPrefix(std::string node_name_to_find) const; 512 513 bool GetTaskIdStreamId(std::string file_name, std::string overflow_file_prefix, uint64_t *const task_id, 514 uint64_t *const stream_id) const; 515 516 bool GetAttrsFromFilename(const std::string &file_name, std::string *const node_name, uint64_t *const task_id, 517 uint64_t *const stream_id) const; 518 519 std::string RealPath(const std::string &input_path) const; 520 521 bool TensorExistsInCurrent(const std::string &tensor_name); 522 523 void MoveTensorCurrentToPrev(const std::string &tensor_name); 524 525 void AppendToCacheEvictQueue(const std::string &tensor_name); 526 527 void SetNetName(std::string net_name); 528 529 std::string GetNetName(); 530 531 void SetDumpDir(std::string dump_dir); 532 533 std::string GetDumpDir(); 534 535 void SetSyncMode(bool is_sync_mode); 536 537 bool GetSyncMode() const; 538 539 void SetMemLimit(uint64_t max_mem_size); 540 541 void CheckWatchpointProgress(size_t tensor_list_size); 542 GetProcessedTensorCount()543 size_t GetProcessedTensorCount() const { return tensor_processed_count_; } 544 545 private: 546 std::mutex lock_; 547 std::mutex wp_lock_; 548 std::mutex overflow_wp_lock_; 549 550 // to keep track of watchpoints that have been checked already for a tensor in current step 551 std::unordered_map<std::string, std::set<int32_t>> wp_id_cache_; 552 std::unordered_map<unsigned int, watchpoint_t> watchpoint_table_; 553 // key is the iteration path, value is vector of op_names which have overflowed 554 std::unordered_map<std::string, std::vector<std::string>> overflow_ops_; 555 std::string net_name_; 556 std::string dump_dir_; 557 // store history of graphs that have been run (rank_id, graph_id) 558 std::map<std::tuple<uint32_t, uint32_t>, std::vector<uint32_t>> graphs_run_history_; 559 bool is_sync_mode_{false}; 560 // processed tensors in checkwatchpoint function 561 std::atomic<size_t> tensor_processed_count_{0}; 562 bool wp_progress_enabled_{false}; 563 std::unique_ptr<std::thread> wp_progress_thread_; 564 std::shared_ptr<TensorLoader> tensor_loader_; 565 }; 566 } // namespace mindspore 567 568 #endif // MINDSPORE_CCSRC_DEBUG_DEBUG_SERVICES_H_ 569