1 /**
2 * Copyright 2021 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "debugger/offline_debug/dbg_services.h"
17
18 #include <algorithm>
19 #include <chrono>
20
DbgServices()21 DbgServices::DbgServices() { debug_services_ = std::make_shared<DebugServices>(); }
22
// Copy constructor: the new object takes a copy of other's backend pointer, so both
// objects refer to the same DebugServices instance afterwards.
DbgServices::DbgServices(const DbgServices &other) : debug_services_(other.debug_services_) {
  MS_LOG(INFO) << "cpp DbgServices object is created via copy";
}
27
// Copy assignment: adopts other's backend pointer. The assignment is logged even when
// it turns out to be a self-assignment, in which case nothing else happens.
DbgServices &DbgServices::operator=(const DbgServices &other) {
  MS_LOG(INFO) << "cpp DbgServices object is being assigned a different state";
  if (this == &other) {
    return *this;
  }
  debug_services_ = other.debug_services_;
  return *this;
}
35
~DbgServices()36 DbgServices::~DbgServices() noexcept {
37 MS_LOG(INFO) << "cpp DbgServices object is deleted";
38 debug_services_ = nullptr;
39 }
40
GetVersion() const41 std::string DbgServices::GetVersion() const {
42 MS_LOG(INFO) << "get version is called";
43 return "1.5.0";
44 }
45
Initialize(const std::string net_name,const std::string dump_folder_path,bool is_sync_mode,uint64_t max_mem_usage)46 int32_t DbgServices::Initialize(const std::string net_name, const std::string dump_folder_path, bool is_sync_mode,
47 uint64_t max_mem_usage) {
48 MS_LOG(INFO) << "cpp DbgServices initialize network name " << net_name;
49 MS_LOG(INFO) << "cpp DbgServices initialize dump folder path " << dump_folder_path;
50 MS_LOG(INFO) << "cpp DbgServices initialize sync mode " << is_sync_mode;
51 MS_LOG(INFO) << "cpp DbgServices initialize maximum memory size for debugger internal cache " << max_mem_usage
52 << "MB.";
53 if (debug_services_ == nullptr) {
54 MS_LOG(EXCEPTION) << "Debugger services initialize failed as occur null pointer error,"
55 << "may be due to memory allocation failure, check as: top";
56 }
57 debug_services_->SetNetName(net_name);
58 debug_services_->SetDumpDir(dump_folder_path);
59 debug_services_->SetSyncMode(is_sync_mode);
60 // Set the memory ratio used by tensor cache. Leave 50% for other debugger backend usage.
61 const uint64_t kMegabytesToBytes = 1048576; // max_mem_usage will be bytes in unit in debugger backend.
62 const uint64_t ratio_inversion = 2;
63 const uint64_t memlimit = max_mem_usage * kMegabytesToBytes / ratio_inversion;
64 debug_services_->SetMemLimit(memlimit);
65 return 0;
66 }
67
AddWatchpoint(unsigned int id,unsigned int watch_condition,std::map<std::string,std::map<std::string,std::variant<bool,std::vector<std::string>>>> check_nodes,std::vector<parameter_t> parameter_list)68 int32_t DbgServices::AddWatchpoint(
69 unsigned int id, unsigned int watch_condition,
70 std::map<std::string, std::map<std::string, std::variant<bool, std::vector<std::string>>>> check_nodes,
71 std::vector<parameter_t> parameter_list) {
72 MS_LOG(INFO) << "cpp DbgServices start AddWatchpoint";
73
74 MS_LOG(INFO) << "cpp DbgServices AddWatchpoint id " << id;
75 MS_LOG(INFO) << "cpp DbgServices AddWatchpoint watch_condition " << watch_condition;
76 for (auto const &node : check_nodes) {
77 MS_LOG(DEBUG) << "cpp DbgServices AddWatchpoint name " << node.first;
78 auto attr_map = node.second;
79
80 bool is_output = std::get<bool>(attr_map["is_output"]);
81 MS_LOG(DEBUG) << "cpp DbgServices AddWatchpoint is_output " << is_output;
82
83 std::vector<std::string> rank_id_str = std::get<std::vector<std::string>>(attr_map["rank_id"]);
84 std::vector<std::uint32_t> rank_id;
85 (void)std::transform(
86 rank_id_str.begin(), rank_id_str.end(), std::back_inserter(rank_id),
87 [](std::string &id_str) -> std::uint32_t { return static_cast<uint32_t>(std::stoul(id_str)); });
88 MS_LOG(DEBUG) << "cpp DbgServices AddWatchpoint rank_id: ";
89 for (auto const &i : rank_id) {
90 MS_LOG(DEBUG) << i << " ";
91 }
92
93 std::vector<std::string> root_graph_id_str = std::get<std::vector<std::string>>(attr_map["root_graph_id"]);
94 std::vector<std::uint32_t> root_graph_id;
95 (void)std::transform(
96 root_graph_id_str.begin(), root_graph_id_str.end(), std::back_inserter(root_graph_id),
97 [](std::string &graph_str) -> std::uint32_t { return static_cast<uint32_t>(std::stoul(graph_str)); });
98 MS_LOG(DEBUG) << "cpp DbgServices AddWatchpoint root_graph_id: ";
99 for (auto const &j : root_graph_id) {
100 MS_LOG(DEBUG) << j << " ";
101 }
102 }
103
104 for (auto const ¶meter : parameter_list) {
105 MS_LOG(INFO) << "cpp DbgServices AddWatchpoint parameter name " << parameter.name;
106 MS_LOG(INFO) << "cpp DbgServices AddWatchpoint parameter disabled " << parameter.disabled;
107 MS_LOG(INFO) << "cpp DbgServices AddWatchpoint parameter value " << parameter.value;
108 MS_LOG(INFO) << "cpp DbgServices AddWatchpoint parameter hit " << parameter.hit;
109 MS_LOG(INFO) << "cpp DbgServices AddWatchpoint parameter actual_value " << parameter.actual_value;
110 }
111
112 std::vector<std::tuple<std::string, bool>> check_node_list;
113 std::vector<std::tuple<std::string, std::vector<uint32_t>>> check_node_device_list;
114 std::vector<std::tuple<std::string, std::vector<uint32_t>>> check_node_graph_list;
115 std::vector<DebugServices::parameter_t> parameter_list_backend;
116
117 (void)std::transform(check_nodes.begin(), check_nodes.end(), std::back_inserter(check_node_list),
118 [](auto &node) -> std::tuple<std::string, bool> {
119 auto attr_map = node.second;
120 return std::make_tuple(node.first, std::get<bool>(attr_map["is_output"]));
121 });
122
123 (void)std::transform(check_nodes.begin(), check_nodes.end(), std::back_inserter(check_node_device_list),
124 [](auto &node) -> std::tuple<std::string, std::vector<uint32_t>> {
125 auto attr_map = node.second;
126 std::vector<std::string> rank_id_str = std::get<std::vector<std::string>>(attr_map["rank_id"]);
127 std::vector<std::uint32_t> rank_id;
128 (void)std::transform(rank_id_str.begin(), rank_id_str.end(), std::back_inserter(rank_id),
129 [](std::string &id_str) -> std::uint32_t {
130 return static_cast<uint32_t>(std::stoul(id_str));
131 });
132 return std::make_tuple(node.first, rank_id);
133 });
134
135 (void)std::transform(
136 check_nodes.begin(), check_nodes.end(), std::back_inserter(check_node_graph_list),
137 [](auto &node) -> std::tuple<std::string, std::vector<uint32_t>> {
138 auto attr_map = node.second;
139 std::vector<std::string> root_graph_id_str = std::get<std::vector<std::string>>(attr_map["root_graph_id"]);
140 std::vector<std::uint32_t> root_graph_id;
141 (void)std::transform(
142 root_graph_id_str.begin(), root_graph_id_str.end(), std::back_inserter(root_graph_id),
143 [](std::string &graph_str) -> std::uint32_t { return static_cast<uint32_t>(std::stoul(graph_str)); });
144 return std::make_tuple(node.first, root_graph_id);
145 });
146
147 (void)std::transform(
148 parameter_list.begin(), parameter_list.end(), std::back_inserter(parameter_list_backend),
149 [](const parameter_t ¶meter) -> DebugServices::parameter_t {
150 return DebugServices::parameter_t{parameter.name, parameter.disabled, parameter.value, parameter.hit};
151 });
152
153 debug_services_->AddWatchpoint(id, watch_condition, 0, check_node_list, parameter_list_backend,
154 &check_node_device_list, &check_node_graph_list);
155 MS_LOG(INFO) << "cpp DbgServices end AddWatchpoint";
156 return 0;
157 }
158
RemoveWatchpoint(unsigned int id)159 int32_t DbgServices::RemoveWatchpoint(unsigned int id) {
160 MS_LOG(INFO) << "cpp DbgServices RemoveWatchpoint id " << id;
161 debug_services_->RemoveWatchpoint(id);
162 return 0;
163 }
164
CheckWatchpoints(unsigned int iteration)165 std::vector<watchpoint_hit_t> DbgServices::CheckWatchpoints(unsigned int iteration) {
166 MS_LOG(INFO) << "cpp DbgServices CheckWatchpoint iteration " << iteration;
167
168 std::vector<std::string> name;
169 std::vector<std::string> slot;
170 std::vector<int> condition;
171 std::vector<unsigned int> watchpoint_id;
172 std::vector<std::string> overflow_ops;
173 std::vector<std::vector<DebugServices::parameter_t>> parameters;
174 std::vector<int32_t> error_codes;
175 std::vector<unsigned int> rank_id;
176 std::vector<unsigned int> root_graph_id;
177 std::vector<std::shared_ptr<TensorData>> tensor_list;
178 std::vector<std::string> file_paths;
179
180 const bool init_dbg_suspend = (iteration == UINT_MAX);
181
182 tensor_list = debug_services_->ReadNeededDumpedTensors(iteration, &file_paths);
183
184 debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, ¶meters, &error_codes, overflow_ops,
185 file_paths, &tensor_list, init_dbg_suspend, true, true, &rank_id, &root_graph_id);
186
187 std::vector<watchpoint_hit_t> hits;
188 for (unsigned int i = 0; i < name.size(); i++) {
189 std::vector<DebugServices::parameter_t> ¶meter = parameters[i];
190 std::vector<parameter_t> api_parameter_vector;
191 for (const auto &p : parameter) {
192 parameter_t api_parameter(p.name, p.disabled, p.value, p.hit, p.actual_value);
193 api_parameter_vector.push_back(api_parameter);
194 }
195 watchpoint_hit_t hit(name[i], std::stoi(slot[i]), condition[i], watchpoint_id[i], api_parameter_vector,
196 error_codes[i], rank_id[i], root_graph_id[i]);
197
198 MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t name " << hit.name;
199 MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t slot " << hit.slot;
200 MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t watchpoint_id " << hit.watchpoint_id;
201 MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t error_code " << hit.error_code;
202 MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t rank_id " << hit.rank_id;
203 MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t root_graph_id " << hit.root_graph_id;
204
205 for (auto const ¶meter_i : api_parameter_vector) {
206 MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t parameter name " << parameter_i.name;
207 MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t parameter disabled " << parameter_i.disabled;
208 MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t parameter value " << parameter_i.value;
209 MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t parameter hit " << parameter_i.hit;
210 MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t parameter actual_value " << parameter_i.actual_value;
211 }
212
213 hits.push_back(hit);
214 }
215 return hits;
216 }
217
GetTensorFullName(const tensor_info_t info)218 std::string GetTensorFullName(const tensor_info_t info) { return info.node_name + ":" + std::to_string(info.slot); }
219
GetTensorRankId(const tensor_info_t info)220 unsigned int GetTensorRankId(const tensor_info_t info) { return info.rank_id; }
221
GetTensorRootGraphId(const tensor_info_t info)222 unsigned int GetTensorRootGraphId(const tensor_info_t info) { return info.root_graph_id; }
223
GetTensorIteration(const tensor_info_t info)224 unsigned int GetTensorIteration(const tensor_info_t info) { return info.iteration; }
225
GetTensorSlot(const tensor_info_t info)226 unsigned int GetTensorSlot(const tensor_info_t info) { return info.slot; }
227
GetTensorIsOutput(const tensor_info_t info)228 bool GetTensorIsOutput(const tensor_info_t info) { return info.is_output; }
229
ReadTensorsUtil(std::vector<tensor_info_t> info)230 std::vector<std::shared_ptr<TensorData>> DbgServices::ReadTensorsUtil(std::vector<tensor_info_t> info) {
231 for (auto i : info) {
232 MS_LOG(INFO) << "cpp DbgServices ReadTensor info name " << i.node_name << ", slot " << i.slot << ", iteration "
233 << i.iteration << ", rank_id " << i.rank_id << ", root_graph_id " << i.root_graph_id << ", is_output "
234 << i.is_output;
235 }
236 std::vector<std::string> backend_name;
237 std::vector<unsigned int> rank_id;
238 std::vector<unsigned int> root_graph_id;
239 std::vector<unsigned int> iteration;
240 std::vector<size_t> slot;
241 std::vector<std::shared_ptr<TensorData>> result_list;
242 std::vector<bool> is_output;
243
244 (void)std::transform(info.begin(), info.end(), std::back_inserter(backend_name), GetTensorFullName);
245 (void)std::transform(info.begin(), info.end(), std::back_inserter(slot), GetTensorSlot);
246 (void)std::transform(info.begin(), info.end(), std::back_inserter(rank_id), GetTensorRankId);
247 (void)std::transform(info.begin(), info.end(), std::back_inserter(root_graph_id), GetTensorRootGraphId);
248 (void)std::transform(info.begin(), info.end(), std::back_inserter(iteration), GetTensorIteration);
249 (void)std::transform(info.begin(), info.end(), std::back_inserter(is_output), GetTensorIsOutput);
250
251 MS_LOG(INFO) << "cpp before";
252 std::vector<std::string> file_paths;
253 auto t1 = std::chrono::high_resolution_clock::now();
254 // Convert the dumped data to npy format if it's async mode.
255 if (!debug_services_->GetSyncMode()) {
256 debug_services_->ConvertReadTensors(backend_name, slot, rank_id, iteration, root_graph_id, &file_paths);
257 }
258 debug_services_->ReadDumpedTensor(backend_name, slot, rank_id, iteration, root_graph_id, is_output, file_paths,
259 &result_list);
260 for (auto result : result_list) {
261 std::string output = "0";
262 if (result->GetIsOutput()) {
263 output = "1";
264 }
265 std::string key_name_in_cache = result->GetName() + ":" + std::to_string(result->GetDeviceId()) + ":" +
266 std::to_string(result->GetRootGraphId()) + ":" + output + ":" +
267 std::to_string(result->GetSlot());
268 debug_services_->AppendToCacheEvictQueue(key_name_in_cache);
269 }
270 auto t2 = std::chrono::high_resolution_clock::now();
271 /* Getting number of milliseconds as a double. */
272 std::chrono::duration<double, std::milli> ms_double = t2 - t1;
273
274 MS_LOG(INFO) << "ReadTensors Took: " << ms_double.count() / 1000 << "s";
275 MS_LOG(INFO) << "cpp after";
276
277 return result_list;
278 }
279
ReadTensors(const std::vector<tensor_info_t> info)280 std::vector<tensor_data_t> DbgServices::ReadTensors(const std::vector<tensor_info_t> info) {
281 std::vector<tensor_data_t> tensors_read;
282 std::vector<std::shared_ptr<TensorData>> result_list;
283 result_list = ReadTensorsUtil(info);
284 for (auto result : result_list) {
285 tensor_data_t tensor_data_item(result->GetDataPtr(), result->GetByteSize(), result->GetType(), result->GetShape());
286 tensors_read.push_back(tensor_data_item);
287 }
288 return tensors_read;
289 }
290
ReadTensorsBase(const std::vector<tensor_info_t> info)291 std::vector<TensorBaseData> DbgServices::ReadTensorsBase(const std::vector<tensor_info_t> info) {
292 std::vector<TensorBaseData> tensors_read_base;
293 std::vector<std::shared_ptr<TensorData>> result_list;
294 result_list = ReadTensorsUtil(info);
295 for (auto result : result_list) {
296 if (!result->GetByteSize()) {
297 // tensor not found, adding empty tensor base.
298 TensorBaseData tensor_data_item(0, 0, {});
299 tensors_read_base.push_back(tensor_data_item);
300 continue;
301 }
302 TensorBaseData tensor_data_item(result->GetByteSize(), result->GetType(), result->GetShape());
303 tensors_read_base.push_back(tensor_data_item);
304 }
305 return tensors_read_base;
306 }
307
AddTensorStatInfo(const DebugServices::TensorStat & tensor_statistics,std::vector<TensorStatData> * const tensors_read_stat)308 void AddTensorStatInfo(const DebugServices::TensorStat &tensor_statistics,
309 std::vector<TensorStatData> *const tensors_read_stat) {
310 if (tensors_read_stat == nullptr) {
311 MS_LOG(DEBUG) << "tensors_read_stat is nullptr.";
312 return;
313 }
314 TensorStatData tensor_data_item(
315 tensor_statistics.data_size, tensor_statistics.dtype, tensor_statistics.shape, tensor_statistics.is_bool,
316 tensor_statistics.max_value, tensor_statistics.min_value, tensor_statistics.avg_value, tensor_statistics.count,
317 tensor_statistics.neg_zero_count, tensor_statistics.pos_zero_count, tensor_statistics.nan_count,
318 tensor_statistics.neg_inf_count, tensor_statistics.pos_inf_count, tensor_statistics.zero_count);
319 tensors_read_stat->push_back(tensor_data_item);
320 }
321
ReadTensorsStat(const std::vector<tensor_info_t> info)322 std::vector<TensorStatData> DbgServices::ReadTensorsStat(const std::vector<tensor_info_t> info) {
323 std::vector<TensorStatData> tensors_read_stat;
324 std::vector<std::shared_ptr<TensorData>> result_list;
325 result_list = ReadTensorsUtil(info);
326 for (auto result : result_list) {
327 if (!result->GetByteSize()) {
328 DebugServices::TensorStat tensor_statistics;
329 AddTensorStatInfo(tensor_statistics, &tensors_read_stat);
330 continue;
331 }
332 DebugServices::TensorStat tensor_statistics = debug_services_->GetTensorStatistics(result);
333 AddTensorStatInfo(tensor_statistics, &tensors_read_stat);
334 }
335
336 return tensors_read_stat;
337 }
338