1 /** 2 * Copyright 2021 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #ifndef DEBUG_DBG_SERVICES_H_ 17 #define DEBUG_DBG_SERVICES_H_ 18 19 #include <vector> 20 #include <string> 21 #include <map> 22 #include <memory> 23 #include <tuple> 24 #include <iostream> 25 #include <variant> 26 #include "pybind11/pybind11.h" 27 #include "pybind11/stl.h" 28 #include "pybind11/stl_bind.h" 29 30 #include "utils/ms_utils.h" 31 #include "debug/debug_services.h" 32 namespace py = pybind11; 33 namespace common = mindspore::common; 34 35 struct parameter_t { parameter_tparameter_t36 parameter_t(const std::string &name, bool disabled, double value, bool hit, double actual_value) 37 : name(name), disabled(disabled), value(value), hit(hit), actual_value(actual_value) {} get_nameparameter_t38 const std::string get_name() const { return name; } get_disabledparameter_t39 const bool get_disabled() const { return disabled; } get_valueparameter_t40 const double get_value() const { return value; } get_hitparameter_t41 const bool get_hit() const { return hit; } get_actual_valueparameter_t42 const double get_actual_value() const { return actual_value; } 43 std::string name; 44 bool disabled; 45 double value; 46 bool hit; 47 double actual_value; 48 }; 49 50 struct watchpoint_hit_t { watchpoint_hit_twatchpoint_hit_t51 watchpoint_hit_t(const std::string &name, uint32_t slot, int condition, uint32_t watchpoint_id, 52 const std::vector<parameter_t> ¶meters, int32_t error_code, uint32_t rank_id, 53 uint32_t root_graph_id) 54 : name(name), 55 slot(slot), 56 condition(condition), 57 watchpoint_id(watchpoint_id), 58 parameters(parameters), 59 error_code(error_code), 60 rank_id(rank_id), 61 root_graph_id(root_graph_id) {} get_namewatchpoint_hit_t62 const std::string get_name() const { return name; } get_slotwatchpoint_hit_t63 const uint32_t get_slot() const { return slot; } get_conditionwatchpoint_hit_t64 const int get_condition() const { return condition; } get_watchpoint_idwatchpoint_hit_t65 const uint32_t get_watchpoint_id() const { return watchpoint_id; } get_parameterswatchpoint_hit_t66 const std::vector<parameter_t> get_parameters() const { return parameters; } get_error_codewatchpoint_hit_t67 const int32_t get_error_code() const { return error_code; } get_rank_idwatchpoint_hit_t68 const uint32_t get_rank_id() const { return rank_id; } get_root_graph_idwatchpoint_hit_t69 const uint32_t get_root_graph_id() const { return root_graph_id; } 70 std::string name; 71 uint32_t slot; 72 int condition; 73 uint32_t watchpoint_id; 74 std::vector<parameter_t> parameters; 75 int32_t error_code; 76 uint32_t rank_id; 77 uint32_t root_graph_id; 78 }; 79 80 struct tensor_info_t { tensor_info_ttensor_info_t81 tensor_info_t(const std::string &node_name, uint32_t slot, uint32_t iteration, uint32_t rank_id, 82 uint32_t root_graph_id, bool is_output) 83 : node_name(node_name), 84 slot(slot), 85 iteration(iteration), 86 rank_id(rank_id), 87 root_graph_id(root_graph_id), 88 is_output(is_output) {} get_node_nametensor_info_t89 const std::string get_node_name() const { return node_name; } get_slottensor_info_t90 const uint32_t get_slot() const { return slot; } get_iterationtensor_info_t91 const uint32_t get_iteration() const { return iteration; } get_rank_idtensor_info_t92 const uint32_t get_rank_id() const { return rank_id; } get_root_graph_idtensor_info_t93 const uint32_t get_root_graph_id() const { return root_graph_id; } get_is_outputtensor_info_t94 const bool get_is_output() const { return is_output; } 95 std::string node_name; 96 uint32_t slot; 97 uint32_t iteration; 98 uint32_t rank_id; 99 uint32_t root_graph_id; 100 bool is_output; 101 }; 102 103 struct tensor_data_t { tensor_data_ttensor_data_t104 tensor_data_t(const char *data_ptr, uint64_t data_size, int dtype, const std::vector<int64_t> &shape) 105 : data_size(data_size), dtype(dtype), shape(shape) { 106 if (data_ptr != nullptr) { 107 this->data_ptr = py::bytes(data_ptr, data_size); 108 } else { 109 this->data_ptr = py::bytes(); 110 } 111 } get_data_ptrtensor_data_t112 const py::bytes get_data_ptr() const { return data_ptr; } get_data_sizetensor_data_t113 const uint64_t get_data_size() const { return data_size; } get_dtypetensor_data_t114 const int get_dtype() const { return dtype; } get_shapetensor_data_t115 const std::vector<int64_t> &get_shape() const { return shape; } 116 py::bytes data_ptr; 117 uint64_t data_size; 118 int dtype; 119 std::vector<int64_t> shape; 120 }; 121 122 struct TensorBaseData { TensorBaseDataTensorBaseData123 TensorBaseData(uint64_t data_size, int dtype, const std::vector<int64_t> &shape) 124 : data_size_(data_size), dtype_(dtype), shape_(shape) {} 125 data_sizeTensorBaseData126 const uint64_t data_size() const { return data_size_; } dtypeTensorBaseData127 const int dtype() const { return dtype_; } shapeTensorBaseData128 const std::vector<int64_t> &shape() const { return shape_; } 129 uint64_t data_size_; 130 int dtype_; 131 std::vector<int64_t> shape_; 132 }; 133 134 struct TensorStatData { TensorStatDataTensorStatData135 TensorStatData(uint64_t data_size, int dtype, const std::vector<int64_t> &shape, bool is_bool, double max_value, 136 double min_value, double avg_value, int count, int neg_zero_count, int pos_zero_count, int nan_count, 137 int neg_inf_count, int pos_inf_count, int zero_count) 138 : data_size_(data_size), 139 dtype_(dtype), 140 shape_(shape), 141 is_bool_(is_bool), 142 max_value_(max_value), 143 min_value_(min_value), 144 avg_value_(avg_value), 145 count_(count), 146 neg_zero_count_(neg_zero_count), 147 pos_zero_count_(pos_zero_count), 148 nan_count_(nan_count), 149 neg_inf_count_(neg_inf_count), 150 pos_inf_count_(pos_inf_count), 151 zero_count_(zero_count) {} 152 data_sizeTensorStatData153 const uint64_t data_size() const { return data_size_; } dtypeTensorStatData154 const int dtype() const { return dtype_; } shapeTensorStatData155 const std::vector<int64_t> &shape() const { return shape_; } is_boolTensorStatData156 const bool is_bool() const { return is_bool_; } max_valueTensorStatData157 const double max_value() const { return max_value_; } min_valueTensorStatData158 const double min_value() const { return min_value_; } avg_valueTensorStatData159 const double avg_value() const { return avg_value_; } countTensorStatData160 const int count() const { return count_; } neg_zero_countTensorStatData161 const int neg_zero_count() const { return neg_zero_count_; } pos_zero_countTensorStatData162 const int pos_zero_count() const { return pos_zero_count_; } nan_countTensorStatData163 const int nan_count() const { return nan_count_; } neg_inf_countTensorStatData164 const int neg_inf_count() const { return neg_inf_count_; } pos_inf_countTensorStatData165 const int pos_inf_count() const { return pos_inf_count_; } zero_countTensorStatData166 const int zero_count() const { return zero_count_; } 167 168 uint64_t data_size_; 169 int dtype_; 170 std::vector<int64_t> shape_; 171 bool is_bool_; 172 double max_value_; 173 double min_value_; 174 double avg_value_; 175 int count_; 176 int neg_zero_count_; 177 int pos_zero_count_; 178 int nan_count_; 179 int neg_inf_count_; 180 int pos_inf_count_; 181 int zero_count_; 182 }; 183 184 class DbgServices { 185 public: 186 DbgServices(); 187 188 DbgServices(const DbgServices &other); 189 190 DbgServices &operator=(const DbgServices &other); 191 192 ~DbgServices(); 193 194 int32_t Initialize(const std::string net_name, const std::string dump_folder_path, bool is_sync_mode, 195 uint64_t max_mem_usage); 196 197 int32_t AddWatchpoint( 198 unsigned int id, unsigned int watch_condition, 199 std::map<std::string, std::map<std::string, std::variant<bool, std::vector<std::string>>>> check_nodes, 200 std::vector<parameter_t> parameter_list); 201 202 int32_t RemoveWatchpoint(unsigned int id); 203 204 std::vector<watchpoint_hit_t> CheckWatchpoints(unsigned int iteration); 205 206 std::vector<std::shared_ptr<TensorData>> ReadTensorsUtil(std::vector<tensor_info_t> info); 207 208 std::vector<tensor_data_t> ReadTensors(const std::vector<tensor_info_t> info); 209 210 std::vector<TensorBaseData> ReadTensorsBase(const std::vector<tensor_info_t> info); 211 212 std::vector<TensorStatData> ReadTensorsStat(const std::vector<tensor_info_t> info); 213 214 std::string GetVersion() const; 215 216 private: 217 std::shared_ptr<DebugServices> debug_services_ = nullptr; 218 }; 219 220 #endif // DEBUG_DBG_SERVICES_H_ 221