/**
 * Copyright 2020-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_CCSRC_DEBUG_DEBUG_SERVICES_H_
#define MINDSPORE_CCSRC_DEBUG_DEBUG_SERVICES_H_

#ifndef OFFLINE_DBG_MODE
#define ONLINE_DBG_MODE
#endif

#ifdef OFFLINE_DBG_MODE
#include "base/float16.h"
#endif

#include <cmath>
#include <algorithm>
#include <vector>
#include <future>
#include <string>
#include <memory>
#include <tuple>
#include <unordered_map>
#include <set>
#include <mutex>
#include <map>
#include <limits>
#include <sstream>
#include "debug/tensor_load.h"
#include "debug/tensor_data.h"

#ifdef ONLINE_DBG_MODE
namespace mindspore {
#endif
class DebugServices {
 public:
  DebugServices();

  DebugServices(const DebugServices &other);

  DebugServices &operator=(const DebugServices &other);

  ~DebugServices() = default;

  enum CONDITION_TYPE {
    HAS_NAN,
    HAS_INF,
    IS_OVERFLOW,
    MAX_GT,
    MAX_LT,
    MIN_GT,
    MIN_LT,
    MAX_MIN_GT,
    MAX_MIN_LT,
    MEAN_GT,
    MEAN_LT,
    SD_GT,
    SD_LT,
    GENERAL_OVERFLOW,
    INIT,
    TOO_LARGE,
    TOO_SMALL,
    ALL_ZERO,
    CHANGE_TOO_LARGE,
    CHANGE_TOO_SMALL,
    NOT_CHANGED,
    RANGE
  };

  struct condition_t {
    CONDITION_TYPE type;
    float parameter = 0;
  };

  struct parameter_t {
    std::string name;
    bool disabled;
    double_t value;
    bool hit;
    double_t actual_value;
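    // Compares `actualValue` against the configured threshold `value` and records the result in `hit`.
    // NaN values are ignored. If `inequality_type` is empty, the comparison ("gt", "lt", "ge" or "le")
    // is inferred from the parameter name's suffix after its last '_', e.g. a name ending in "_gt" uses >.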
    void Evaluate(double_t actualValue, std::string inequality_type) {
      if (std::isnan(actualValue)) {
        return;
      }

      actual_value = actualValue;
      // if the inequality type cannot be extracted from the watchpoint,
      // try to extract it from the parameter name
      if (inequality_type.empty()) {
        auto pos = name.find_last_of('_');
        if (pos != std::string::npos) {
          inequality_type = name.substr(pos + 1);
        }
      }

      std::map<std::string, bool> condition_check{{"gt", actual_value > value},
                                                  {"lt", actual_value < value},
                                                  {"ge", actual_value >= value},
                                                  {"le", actual_value <= value}};

      hit = condition_check[inequality_type];
    }
  };

  typedef std::vector<std::vector<int>> partitioned_numbers;
  typedef std::vector<std::vector<std::string>> partitioned_names;
  typedef std::vector<std::vector<std::vector<parameter_t>>> partitioned_parameters;
  typedef std::vector<std::vector<int32_t>> partitioned_error_code;
  typedef std::vector<std::vector<unsigned int>> partitioned_id;

  struct watchpoint_t {
    unsigned int id;
    condition_t condition;
    std::vector<std::tuple<std::string, bool>> check_node_list;
    std::vector<std::tuple<std::string, std::vector<uint32_t>>> check_node_device_list;
    std::vector<std::tuple<std::string, std::vector<uint32_t>>> check_node_graph_list;
    std::vector<parameter_t> parameter_list;
    size_t location = 0;
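    // Returns the check_node_list entry that matches the given tensor: either the watch node's trailing
    // name after the last '/' equals tensor_name, or the full name matches ("*" is accepted as a wildcard
    // for node-type entries). When per-entry device/graph lists are set, the tensor's device id and root
    // graph id must also be listed. Returns an empty string if nothing matches.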
    std::string FindQualifiedTensorName(const std::string &tensor_name, unsigned const int &tensor_device_id,
                                        unsigned const int &tensor_root_graph_id) const {
      int indx = 0;
      for (auto check_node : check_node_list) {
        std::string w_name = std::get<0>(check_node);
        bool w_type = std::get<1>(check_node);
        auto found = w_name.find_last_of('/');
        bool check_tensor_name = found != std::string::npos && w_name.substr(found + 1) == tensor_name;
        bool check_node_name =
          (w_type && (tensor_name == w_name || w_name == "*")) || (!w_type && tensor_name == w_name);
        if (check_tensor_name || check_node_name) {
          // the online debugger only supports a single card
          if (check_node_device_list.empty()) {
            return w_name;
          }
          auto device_vec = std::get<1>(check_node_device_list[indx]);
          auto root_graph_vec = std::get<1>(check_node_graph_list[indx]);
          auto iter1 = std::find(device_vec.begin(), device_vec.end(), tensor_device_id);
          auto iter2 = std::find(root_graph_vec.begin(), root_graph_vec.end(), tensor_root_graph_id);
          if (iter1 != device_vec.end() && iter2 != root_graph_vec.end()) {
            return w_name;
          }
        }
        indx++;
      }
      return {};
    }

    bool is_gt_wp() const {
      return condition.type == MAX_GT || condition.type == MIN_GT || condition.type == MEAN_GT ||
             condition.type == SD_GT || condition.type == MAX_MIN_GT;
    }

    bool is_lt_wp() const {
      return condition.type == MAX_LT || condition.type == MIN_LT || condition.type == MEAN_LT ||
             condition.type == SD_LT || condition.type == MAX_MIN_LT;
    }

    // mean or sd related condition is set
    bool mean_sd_enabled() const {
      return condition.type == MEAN_LT || condition.type == MEAN_GT || condition.type == SD_LT ||
             condition.type == SD_GT || (condition.type == TOO_LARGE && !parameter_list[3].disabled) ||
             (condition.type == TOO_SMALL && !parameter_list[3].disabled);
    }

    bool abs_mean_enabled() const {
      return (condition.type == TOO_LARGE && !parameter_list[0].disabled) ||
             (condition.type == TOO_SMALL && !parameter_list[0].disabled);
    }

    bool tensor_update_ratio_mean_enabled() const {
      return condition.type == CHANGE_TOO_LARGE || condition.type == CHANGE_TOO_SMALL;
    }

    bool allclose_enabled() const { return condition.type == NOT_CHANGED; }

    bool range_enabled() const {
      return condition.type == RANGE && (!parameter_list[0].disabled || !parameter_list[1].disabled);
    }

    bool change_condition() const {
      return condition.type == CHANGE_TOO_LARGE || condition.type == CHANGE_TOO_SMALL || condition.type == NOT_CHANGED;
    }
  };

  struct TensorBase {
    TensorBase(uint64_t data_size, int dtype, const std::vector<int64_t> &shape)
        : data_size(data_size), dtype(dtype), shape(shape) {}
    TensorBase() = default;
    uint64_t data_size = 0;
    int dtype = 0;
    std::vector<int64_t> shape;
  };

  struct TensorStat {
    TensorStat(uint64_t data_size, int dtype, const std::vector<int64_t> &shape, bool is_bool, double max_value,
               double min_value, double avg_value, int count, int neg_zero_count, int pos_zero_count, int nan_count,
               int neg_inf_count, int pos_inf_count, int zero_count)
        : data_size(data_size),
          dtype(dtype),
          shape(shape),
          is_bool(is_bool),
          max_value(max_value),
          min_value(min_value),
          avg_value(avg_value),
          count(count),
          neg_zero_count(neg_zero_count),
          pos_zero_count(pos_zero_count),
          nan_count(nan_count),
          neg_inf_count(neg_inf_count),
          pos_inf_count(pos_inf_count),
          zero_count(zero_count) {}

    TensorStat() = default;

    uint64_t data_size = 0;
    int dtype = 0;
    std::vector<int64_t> shape;
    bool is_bool = false;
    double max_value = std::numeric_limits<double>::lowest();
    double min_value = std::numeric_limits<double>::max();
    double avg_value = 0.0;
    int count = 0;
    int neg_zero_count = 0;
    int pos_zero_count = 0;
    int nan_count = 0;
    int neg_inf_count = 0;
    int pos_inf_count = 0;
    int zero_count = 0;
  };

  TensorStat GetTensorStatistics(const std::shared_ptr<TensorData> &tensor);
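  // Registers watchpoint `id` with the given condition and threshold parameters. check_node_list holds
  // (node name, node-type flag) pairs to watch; the optional device and graph lists restrict each entry
  // to specific device ids and root graph ids when provided.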
  void AddWatchpoint(
    unsigned int id, unsigned int watch_condition, float parameter,
    const std::vector<std::tuple<std::string, bool>> &check_node_list, const std::vector<parameter_t> &parameter_list,
    const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_device_list = nullptr,
    const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_graph_list = nullptr);

  void RemoveWatchpoint(unsigned int id);

#ifdef OFFLINE_DBG_MODE
  void ProcessCheckpointsOutofMemory(
    const bool no_mem_to_read, const std::vector<watchpoint_t> watchpoints_to_check, const int chunk_id,
    partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
    partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
    partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
    partitioned_numbers *const chunk_exec_orders, partitioned_names *const chunk_time_stamp,
    partitioned_id *const chunk_device_id, partitioned_id *const chunk_root_graph_id,
    std::vector<unsigned int> *const device_id, std::vector<unsigned int> *const root_graph_id, const int exec_order,
    const std::string time_stamp, const std::string &qualified_tensor_name, const std::string &tensor_slot,
    const unsigned int device_id_val, const unsigned int root_graph_id_val,
    const std::vector<parameter_t> &parameter_list);
#endif

  void CheckWatchpointsForTensor(partitioned_names *chunk_names, partitioned_names *chunk_slots,
                                 partitioned_numbers *chunk_conditions, partitioned_id *const chunk_watchpoint_id,
                                 partitioned_parameters *chunk_parameters, partitioned_error_code *chunk_error_codes,
                                 const std::vector<std::string> &op_overflows,
                                 const std::vector<std::string> &async_file_pool,
                                 partitioned_numbers *chunk_exec_orders,
                                 std::vector<std::shared_ptr<TensorData>> *tensor_list, int begin, int end,
                                 int chunk_id, const bool init_dbg_suspend, const bool step_end, const bool recheck,
                                 partitioned_id *chunk_device_id, partitioned_id *chunk_root_graph_id,
                                 std::vector<uint64_t> *chunk_tensor_byte_size, partitioned_names *chunk_time_stamp,
                                 std::vector<unsigned int> *device_id, std::vector<unsigned int> *root_graph_id);
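  // Checks the given tensors against the registered watchpoints and appends every hit to the output
  // vectors (name, slot, condition, watchpoint id, parameters, error code). The work is partitioned into
  // chunks handled by CheckWatchpointsForTensor, and the per-chunk results are merged by SortWatchpointsInfo.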
  void CheckWatchpoints(std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<int> *condition,
                        std::vector<unsigned int> *const watchpoint_id,
                        std::vector<std::vector<parameter_t>> *parameters, std::vector<int32_t> *error_code,
                        const std::vector<std::string> &op_overflows, const std::vector<std::string> &async_file_pool,
                        std::vector<std::shared_ptr<TensorData>> *tensor_list, bool init_dbg_suspend,
                        const bool step_end, const bool recheck, std::vector<unsigned int> *device_id = nullptr,
                        std::vector<unsigned int> *root_graph_id = nullptr);

  void SortWatchpointsInfo(std::vector<std::future<void>> *tensor_future_vec, std::vector<int> *exec_order,
                           std::vector<std::string> *time_stamps, uint64_t *tensor_list_byte_size,
                           std::vector<std::string> *name, std::vector<std::string> *slot, std::vector<int> *condition,
                           std::vector<unsigned int> *const watchpoint_id,
                           std::vector<std::vector<parameter_t>> *parameters, std::vector<int32_t> *error_codes,
                           partitioned_names *chunk_names, partitioned_names *chunk_slots,
                           partitioned_numbers *chunk_conditions, partitioned_id *chunk_watchpoint_id,
                           partitioned_parameters *chunk_parameters, partitioned_error_code *chunk_error_codes,
                           partitioned_numbers *chunk_exec_orders, partitioned_names *chunk_time_stamp,
                           std::vector<uint64_t> *chunk_tensor_byte_size, partitioned_id *chunk_device_id,
                           partitioned_id *chunk_root_graph_id, std::vector<unsigned int> *device_id,
                           std::vector<unsigned int> *root_graph_id);

  void AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck,
                             const std::shared_ptr<TensorData> &tensor, bool *previous_iter_tensor_needed,
                             std::string *qualified_tensor_name, std::vector<watchpoint_t> *watchpoints_to_check);

  void SetCheckWatchpointsResult(const int chunk_id, partitioned_names *chunk_names, partitioned_names *chunk_slots,
                                 partitioned_numbers *chunk_conditions, partitioned_id *chunk_watchpoint_id,
                                 partitioned_parameters *chunk_parameters, partitioned_error_code *chunk_error_codes,
                                 partitioned_numbers *chunk_exec_orders, partitioned_names *chunk_time_stamp,
                                 partitioned_id *chunk_device_id, partitioned_id *chunk_root_graph_id,
                                 std::vector<unsigned int> *device_id, std::vector<unsigned int> *root_graph_id,
                                 const int exec_order, const std::string time_stamp,
                                 const std::string &qualified_tensor_name, const std::string &tensor_slot,
                                 const watchpoint_t &wp, const unsigned int device_id_val,
                                 const unsigned int root_graph_id_val, const std::vector<parameter_t> &parameter_list,
                                 const int32_t error_code);
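  // Offline-debugger helpers: locate dumped tensor files for a given iteration, convert async dump files
  // to host format where needed, and read the tensor data (e.g. from .npy files) into TensorData objects.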
#ifdef OFFLINE_DBG_MODE
  void AddToTensorData(const std::string &backend_name, const std::string &time_stamp, const std::size_t slot,
                       const unsigned int iteration, const unsigned int device_id, const unsigned int root_graph_id,
                       const bool is_output, const std::size_t data_size, const std::string &type_name,
                       const std::vector<int64_t> &shape, std::vector<char> *buffer,
                       std::vector<std::shared_ptr<TensorData>> *const result_list);

  void SetPrefixToCheck(std::string *const prefix_dump_file_name, std::string *const slot_string_to_check,
                        std::string *const dump_style_kernel_name, size_t slot, bool is_output);

  void ReadDumpedTensor(std::vector<std::string> backend_name, std::vector<size_t> slot,
                        std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
                        std::vector<unsigned int> root_graph_id, const std::vector<bool> &is_output,
                        const std::vector<std::string> &async_file_pool,
                        std::vector<std::shared_ptr<TensorData>> *const result_list, bool *no_mem_to_read = nullptr);

  void ProcessTensorDataSync(const std::vector<std::tuple<std::string, std::string>> &proto_to_dump,
                             const std::string &abspath, const std::string &specific_dump_dir, unsigned int iteration,
                             unsigned int device_id, unsigned int root_graph_id,
                             std::vector<std::shared_ptr<TensorData>> *const tensor_list);

  void ReadFileAndAddToTensor(const bool found, const std::vector<std::string> &matched_paths,
                              const std::string &backend_name, const unsigned int device_id,
                              const unsigned int root_graph_id, const bool &is_output, size_t slot,
                              bool *no_mem_to_read, unsigned int iteration,
                              std::vector<std::shared_ptr<TensorData>> *result_list);

  void ReadDumpedTensorSync(const std::string &prefix_dump_file_name, const std::string &specific_dump_dir,
                            const std::string &backend_name, size_t slot, unsigned int device_id,
                            unsigned int iteration, unsigned int root_graph_id, const bool &is_output,
                            std::vector<std::shared_ptr<TensorData>> *result_list, bool *no_mem_to_read);

  void ReadDumpedTensorAsync(const std::string &specific_dump_dir, const std::string &prefix_dump_to_check,
                             const std::string &slot_string_to_check, const std::string &backend_name, size_t slot,
                             unsigned int device_id, unsigned int iteration, unsigned int root_graph_id,
                             const bool &is_output, const std::vector<std::string> &async_file_pool,
                             std::vector<std::shared_ptr<TensorData>> *result_list, bool *no_mem_to_read);

  std::vector<std::shared_ptr<TensorData>> ReadNeededDumpedTensors(unsigned int iteration,
                                                                   std::vector<std::string> *const async_file_pool);

  const void *GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed,
                            uint32_t *prev_num_elements);

  void ReadTensorFromNpy(const std::string &tensor_name, const std::string &file_name, std::string *const tensor_type,
                         std::size_t *const size, std::vector<int64_t> *const shape,
                         std::vector<char> **const data_buffer, bool *no_mem_to_read);

  void ConvertToHostFormat(const std::map<std::string, std::vector<std::string>> &dir_to_files_map,
                           std::vector<std::string> *const result_list);

  void ProcessConvertToHostFormat(const std::vector<std::string> &files_after_convert_in_dir,
                                  const std::string &dump_key, std::vector<std::string> *const result_list,
                                  const std::string &file_format);

  void ConvertReadTensors(std::vector<std::string> backend_name, std::vector<size_t> slot,
                          std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
                          std::vector<unsigned int> root_graph_id, std::vector<std::string> *const result_list);

  void ConvertWatchPointNodes(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
                              const std::string &specific_dump_dir, std::vector<std::string> *const result_list);

  void ProcessConvertList(const std::string &prefix_dump_file_name, const std::string &file_format,
                          const std::string &specific_dump_dir,
                          std::map<std::string, std::vector<std::string>> *dir_to_files_map,
                          std::vector<std::string> *const result_list);

  void GetTensorDataInfoAsync(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
                              const std::string &specific_dump_dir, uint32_t iteration, uint32_t device_id,
                              uint32_t root_graph_id, const std::vector<std::string> &async_file_pool,
                              std::vector<std::shared_ptr<TensorData>> *const tensor_list);

  std::string GetStrippedFilename(const std::string &file_name);

  std::string IterationString(unsigned int iteration);
#endif
  void ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *ret_name,
                        std::vector<const char *> *data_ptr, std::vector<ssize_t> *data_size,
                        std::vector<unsigned int> *dtype, std::vector<std::vector<int64_t>> *const shape);

  void SearchNodesTensors(const std::vector<std::string> &name,
                          std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> *result_list);
#ifdef ONLINE_DBG_MODE
  bool IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel = nullptr) const;

  bool IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const;
#endif

  std::vector<std::shared_ptr<TensorData>> GetTensor() const;

  void AddAnalyzedTensorToCache(const bool recheck, const unsigned int id, const std::string &tensor_name);

  void EmptyCurrentTensor();

#ifdef ONLINE_DBG_MODE
  bool DumpTensorToFile(const std::string &tensor_name, bool trans_flag, const std::string &filepath,
                        const std::string &host_fmt, const std::vector<int64_t> &host_shape, TypeId host_type,
                        TypeId device_type, const std::string &addr_format, size_t slot) const;
#endif

  bool LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev);

  void ResetLoadedTensors();
#ifdef ONLINE_DBG_MODE
  std::vector<std::shared_ptr<TensorData>> GetNodeTensor(const CNodePtr &kernel);
#endif

  // Check whether an operation overflow happened on a particular node
  bool CheckOpOverflow(std::string node_name_to_find, unsigned int device_id = 0, unsigned int root_graph_id = 0,
                       unsigned int iteration = 0);

  bool GetAttrsFromAsyncFilename(const std::string &file_name, std::string *const node_name, uint64_t *task_id,
                                 uint64_t *stream_id);

  std::string RealPath(const std::string &input_path);

  uint64_t BytestoUInt64(const std::vector<char> &buffer);

  bool TensorExistsInCurrent(const std::string &tensor_name);

  void MoveTensorCurrentToPrev(const std::string &tensor_name);

  void AppendToCacheEvictQueue(const std::string &tensor_name);

  void SetNetName(std::string net_name);

  std::string GetNetName();

  void SetDumpDir(std::string dump_dir);

  std::string GetDumpDir();

  void SetSyncMode(bool is_sync_mode);

  bool GetSyncMode();

  void SetMemLimit(uint64_t max_mem_size);

 private:
  std::mutex lock_;
  std::mutex wp_lock_;
  std::mutex overflow_wp_lock_;

  // keeps track of the watchpoints that have already been checked for a tensor in the current step
  std::unordered_map<std::string, std::set<int32_t>> wp_id_cache_;
  std::unordered_map<unsigned int, watchpoint_t> watchpoint_table_;
  // key is the iteration path, value is the vector of op names that have overflowed
  std::unordered_map<std::string, std::vector<std::string>> overflow_ops_;
  std::string net_name_;
  std::string dump_dir_;
  bool is_sync_mode_{false};

  std::shared_ptr<TensorLoader> tensor_loader_;
};
#ifdef ONLINE_DBG_MODE
}  // namespace mindspore
#endif

#endif  // MINDSPORE_CCSRC_DEBUG_DEBUG_SERVICES_H_