• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "debugger/offline_debug/dbg_services.h"

#include <algorithm>
#include <chrono>
#include <limits>
#include <utility>
20 
DbgServices()21 DbgServices::DbgServices() { debug_services_ = std::make_shared<DebugServices>(); }
22 
// Copy constructor: the new object takes a copy of other's backend handle (debug_services_),
// so no new DebugServices object is allocated here.
DbgServices::DbgServices(const DbgServices &other) : debug_services_(other.debug_services_) {
  MS_LOG(INFO) << "cpp DbgServices object is created via copy";
}
27 
// Copy assignment: re-points this object at other's backend handle. Self-assignment only logs.
DbgServices &DbgServices::operator=(const DbgServices &other) {
  MS_LOG(INFO) << "cpp DbgServices object is being assigned a different state";
  if (this == &other) {
    return *this;
  }
  debug_services_ = other.debug_services_;
  return *this;
}
35 
~DbgServices()36 DbgServices::~DbgServices() noexcept {
37   MS_LOG(INFO) << "cpp DbgServices object is deleted";
38   debug_services_ = nullptr;
39 }
40 
GetVersion() const41 std::string DbgServices::GetVersion() const {
42   MS_LOG(INFO) << "get version is called";
43   return "1.5.0";
44 }
45 
Initialize(const std::string net_name,const std::string dump_folder_path,bool is_sync_mode,uint64_t max_mem_usage)46 int32_t DbgServices::Initialize(const std::string net_name, const std::string dump_folder_path, bool is_sync_mode,
47                                 uint64_t max_mem_usage) {
48   MS_LOG(INFO) << "cpp DbgServices initialize network name " << net_name;
49   MS_LOG(INFO) << "cpp DbgServices initialize dump folder path " << dump_folder_path;
50   MS_LOG(INFO) << "cpp DbgServices initialize sync mode " << is_sync_mode;
51   MS_LOG(INFO) << "cpp DbgServices initialize maximum memory size for debugger internal cache " << max_mem_usage
52                << "MB.";
53   if (debug_services_ == nullptr) {
54     MS_LOG(EXCEPTION) << "Debugger services initialize failed as occur null pointer error,"
55                       << "may be due to memory allocation failure, check as: top";
56   }
57   debug_services_->SetNetName(net_name);
58   debug_services_->SetDumpDir(dump_folder_path);
59   debug_services_->SetSyncMode(is_sync_mode);
60   // Set the memory ratio used by tensor cache. Leave 50% for other debugger backend usage.
61   const uint64_t kMegabytesToBytes = 1048576;  // max_mem_usage will be bytes in unit in debugger backend.
62   const uint64_t ratio_inversion = 2;
63   const uint64_t memlimit = max_mem_usage * kMegabytesToBytes / ratio_inversion;
64   debug_services_->SetMemLimit(memlimit);
65   return 0;
66 }
67 
AddWatchpoint(unsigned int id,unsigned int watch_condition,std::map<std::string,std::map<std::string,std::variant<bool,std::vector<std::string>>>> check_nodes,std::vector<parameter_t> parameter_list)68 int32_t DbgServices::AddWatchpoint(
69   unsigned int id, unsigned int watch_condition,
70   std::map<std::string, std::map<std::string, std::variant<bool, std::vector<std::string>>>> check_nodes,
71   std::vector<parameter_t> parameter_list) {
72   MS_LOG(INFO) << "cpp DbgServices start AddWatchpoint";
73 
74   MS_LOG(INFO) << "cpp DbgServices AddWatchpoint id " << id;
75   MS_LOG(INFO) << "cpp DbgServices AddWatchpoint watch_condition " << watch_condition;
76   for (auto const &node : check_nodes) {
77     MS_LOG(DEBUG) << "cpp DbgServices AddWatchpoint name " << node.first;
78     auto attr_map = node.second;
79 
80     bool is_output = std::get<bool>(attr_map["is_output"]);
81     MS_LOG(DEBUG) << "cpp DbgServices AddWatchpoint is_output " << is_output;
82 
83     std::vector<std::string> rank_id_str = std::get<std::vector<std::string>>(attr_map["rank_id"]);
84     std::vector<std::uint32_t> rank_id;
85     (void)std::transform(
86       rank_id_str.begin(), rank_id_str.end(), std::back_inserter(rank_id),
87       [](std::string &id_str) -> std::uint32_t { return static_cast<uint32_t>(std::stoul(id_str)); });
88     MS_LOG(DEBUG) << "cpp DbgServices AddWatchpoint rank_id: ";
89     for (auto const &i : rank_id) {
90       MS_LOG(DEBUG) << i << " ";
91     }
92 
93     std::vector<std::string> root_graph_id_str = std::get<std::vector<std::string>>(attr_map["root_graph_id"]);
94     std::vector<std::uint32_t> root_graph_id;
95     (void)std::transform(
96       root_graph_id_str.begin(), root_graph_id_str.end(), std::back_inserter(root_graph_id),
97       [](std::string &graph_str) -> std::uint32_t { return static_cast<uint32_t>(std::stoul(graph_str)); });
98     MS_LOG(DEBUG) << "cpp DbgServices AddWatchpoint root_graph_id: ";
99     for (auto const &j : root_graph_id) {
100       MS_LOG(DEBUG) << j << " ";
101     }
102   }
103 
104   for (auto const &parameter : parameter_list) {
105     MS_LOG(INFO) << "cpp DbgServices AddWatchpoint parameter name " << parameter.name;
106     MS_LOG(INFO) << "cpp DbgServices AddWatchpoint parameter disabled " << parameter.disabled;
107     MS_LOG(INFO) << "cpp DbgServices AddWatchpoint parameter value " << parameter.value;
108     MS_LOG(INFO) << "cpp DbgServices AddWatchpoint parameter hit " << parameter.hit;
109     MS_LOG(INFO) << "cpp DbgServices AddWatchpoint parameter actual_value " << parameter.actual_value;
110   }
111 
112   std::vector<std::tuple<std::string, bool>> check_node_list;
113   std::vector<std::tuple<std::string, std::vector<uint32_t>>> check_node_device_list;
114   std::vector<std::tuple<std::string, std::vector<uint32_t>>> check_node_graph_list;
115   std::vector<DebugServices::parameter_t> parameter_list_backend;
116 
117   (void)std::transform(check_nodes.begin(), check_nodes.end(), std::back_inserter(check_node_list),
118                        [](auto &node) -> std::tuple<std::string, bool> {
119                          auto attr_map = node.second;
120                          return std::make_tuple(node.first, std::get<bool>(attr_map["is_output"]));
121                        });
122 
123   (void)std::transform(check_nodes.begin(), check_nodes.end(), std::back_inserter(check_node_device_list),
124                        [](auto &node) -> std::tuple<std::string, std::vector<uint32_t>> {
125                          auto attr_map = node.second;
126                          std::vector<std::string> rank_id_str = std::get<std::vector<std::string>>(attr_map["rank_id"]);
127                          std::vector<std::uint32_t> rank_id;
128                          (void)std::transform(rank_id_str.begin(), rank_id_str.end(), std::back_inserter(rank_id),
129                                               [](std::string &id_str) -> std::uint32_t {
130                                                 return static_cast<uint32_t>(std::stoul(id_str));
131                                               });
132                          return std::make_tuple(node.first, rank_id);
133                        });
134 
135   (void)std::transform(
136     check_nodes.begin(), check_nodes.end(), std::back_inserter(check_node_graph_list),
137     [](auto &node) -> std::tuple<std::string, std::vector<uint32_t>> {
138       auto attr_map = node.second;
139       std::vector<std::string> root_graph_id_str = std::get<std::vector<std::string>>(attr_map["root_graph_id"]);
140       std::vector<std::uint32_t> root_graph_id;
141       (void)std::transform(
142         root_graph_id_str.begin(), root_graph_id_str.end(), std::back_inserter(root_graph_id),
143         [](std::string &graph_str) -> std::uint32_t { return static_cast<uint32_t>(std::stoul(graph_str)); });
144       return std::make_tuple(node.first, root_graph_id);
145     });
146 
147   (void)std::transform(
148     parameter_list.begin(), parameter_list.end(), std::back_inserter(parameter_list_backend),
149     [](const parameter_t &parameter) -> DebugServices::parameter_t {
150       return DebugServices::parameter_t{parameter.name, parameter.disabled, parameter.value, parameter.hit};
151     });
152 
153   debug_services_->AddWatchpoint(id, watch_condition, 0, check_node_list, parameter_list_backend,
154                                  &check_node_device_list, &check_node_graph_list);
155   MS_LOG(INFO) << "cpp DbgServices end AddWatchpoint";
156   return 0;
157 }
158 
RemoveWatchpoint(unsigned int id)159 int32_t DbgServices::RemoveWatchpoint(unsigned int id) {
160   MS_LOG(INFO) << "cpp DbgServices RemoveWatchpoint id " << id;
161   debug_services_->RemoveWatchpoint(id);
162   return 0;
163 }
164 
CheckWatchpoints(unsigned int iteration)165 std::vector<watchpoint_hit_t> DbgServices::CheckWatchpoints(unsigned int iteration) {
166   MS_LOG(INFO) << "cpp DbgServices CheckWatchpoint iteration " << iteration;
167 
168   std::vector<std::string> name;
169   std::vector<std::string> slot;
170   std::vector<int> condition;
171   std::vector<unsigned int> watchpoint_id;
172   std::vector<std::string> overflow_ops;
173   std::vector<std::vector<DebugServices::parameter_t>> parameters;
174   std::vector<int32_t> error_codes;
175   std::vector<unsigned int> rank_id;
176   std::vector<unsigned int> root_graph_id;
177   std::vector<std::shared_ptr<TensorData>> tensor_list;
178   std::vector<std::string> file_paths;
179 
180   const bool init_dbg_suspend = (iteration == UINT_MAX);
181 
182   tensor_list = debug_services_->ReadNeededDumpedTensors(iteration, &file_paths);
183 
184   debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, &parameters, &error_codes, overflow_ops,
185                                     file_paths, &tensor_list, init_dbg_suspend, true, true, &rank_id, &root_graph_id);
186 
187   std::vector<watchpoint_hit_t> hits;
188   for (unsigned int i = 0; i < name.size(); i++) {
189     std::vector<DebugServices::parameter_t> &parameter = parameters[i];
190     std::vector<parameter_t> api_parameter_vector;
191     for (const auto &p : parameter) {
192       parameter_t api_parameter(p.name, p.disabled, p.value, p.hit, p.actual_value);
193       api_parameter_vector.push_back(api_parameter);
194     }
195     watchpoint_hit_t hit(name[i], std::stoi(slot[i]), condition[i], watchpoint_id[i], api_parameter_vector,
196                          error_codes[i], rank_id[i], root_graph_id[i]);
197 
198     MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t name " << hit.name;
199     MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t slot " << hit.slot;
200     MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t watchpoint_id " << hit.watchpoint_id;
201     MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t error_code " << hit.error_code;
202     MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t rank_id " << hit.rank_id;
203     MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t root_graph_id " << hit.root_graph_id;
204 
205     for (auto const &parameter_i : api_parameter_vector) {
206       MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t parameter name " << parameter_i.name;
207       MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t parameter disabled " << parameter_i.disabled;
208       MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t parameter value " << parameter_i.value;
209       MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t parameter hit " << parameter_i.hit;
210       MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t parameter actual_value " << parameter_i.actual_value;
211     }
212 
213     hits.push_back(hit);
214   }
215   return hits;
216 }
217 
GetTensorFullName(const tensor_info_t info)218 std::string GetTensorFullName(const tensor_info_t info) { return info.node_name + ":" + std::to_string(info.slot); }
219 
GetTensorRankId(const tensor_info_t info)220 unsigned int GetTensorRankId(const tensor_info_t info) { return info.rank_id; }
221 
GetTensorRootGraphId(const tensor_info_t info)222 unsigned int GetTensorRootGraphId(const tensor_info_t info) { return info.root_graph_id; }
223 
GetTensorIteration(const tensor_info_t info)224 unsigned int GetTensorIteration(const tensor_info_t info) { return info.iteration; }
225 
GetTensorSlot(const tensor_info_t info)226 unsigned int GetTensorSlot(const tensor_info_t info) { return info.slot; }
227 
GetTensorIsOutput(const tensor_info_t info)228 bool GetTensorIsOutput(const tensor_info_t info) { return info.is_output; }
229 
ReadTensorsUtil(std::vector<tensor_info_t> info)230 std::vector<std::shared_ptr<TensorData>> DbgServices::ReadTensorsUtil(std::vector<tensor_info_t> info) {
231   for (auto i : info) {
232     MS_LOG(INFO) << "cpp DbgServices ReadTensor info name " << i.node_name << ", slot " << i.slot << ", iteration "
233                  << i.iteration << ", rank_id " << i.rank_id << ", root_graph_id " << i.root_graph_id << ", is_output "
234                  << i.is_output;
235   }
236   std::vector<std::string> backend_name;
237   std::vector<unsigned int> rank_id;
238   std::vector<unsigned int> root_graph_id;
239   std::vector<unsigned int> iteration;
240   std::vector<size_t> slot;
241   std::vector<std::shared_ptr<TensorData>> result_list;
242   std::vector<bool> is_output;
243 
244   (void)std::transform(info.begin(), info.end(), std::back_inserter(backend_name), GetTensorFullName);
245   (void)std::transform(info.begin(), info.end(), std::back_inserter(slot), GetTensorSlot);
246   (void)std::transform(info.begin(), info.end(), std::back_inserter(rank_id), GetTensorRankId);
247   (void)std::transform(info.begin(), info.end(), std::back_inserter(root_graph_id), GetTensorRootGraphId);
248   (void)std::transform(info.begin(), info.end(), std::back_inserter(iteration), GetTensorIteration);
249   (void)std::transform(info.begin(), info.end(), std::back_inserter(is_output), GetTensorIsOutput);
250 
251   MS_LOG(INFO) << "cpp before";
252   std::vector<std::string> file_paths;
253   auto t1 = std::chrono::high_resolution_clock::now();
254   // Convert the dumped data to npy format if it's async mode.
255   if (!debug_services_->GetSyncMode()) {
256     debug_services_->ConvertReadTensors(backend_name, slot, rank_id, iteration, root_graph_id, &file_paths);
257   }
258   debug_services_->ReadDumpedTensor(backend_name, slot, rank_id, iteration, root_graph_id, is_output, file_paths,
259                                     &result_list);
260   for (auto result : result_list) {
261     std::string output = "0";
262     if (result->GetIsOutput()) {
263       output = "1";
264     }
265     std::string key_name_in_cache = result->GetName() + ":" + std::to_string(result->GetDeviceId()) + ":" +
266                                     std::to_string(result->GetRootGraphId()) + ":" + output + ":" +
267                                     std::to_string(result->GetSlot());
268     debug_services_->AppendToCacheEvictQueue(key_name_in_cache);
269   }
270   auto t2 = std::chrono::high_resolution_clock::now();
271   /* Getting number of milliseconds as a double. */
272   std::chrono::duration<double, std::milli> ms_double = t2 - t1;
273 
274   MS_LOG(INFO) << "ReadTensors Took: " << ms_double.count() / 1000 << "s";
275   MS_LOG(INFO) << "cpp after";
276 
277   return result_list;
278 }
279 
ReadTensors(const std::vector<tensor_info_t> info)280 std::vector<tensor_data_t> DbgServices::ReadTensors(const std::vector<tensor_info_t> info) {
281   std::vector<tensor_data_t> tensors_read;
282   std::vector<std::shared_ptr<TensorData>> result_list;
283   result_list = ReadTensorsUtil(info);
284   for (auto result : result_list) {
285     tensor_data_t tensor_data_item(result->GetDataPtr(), result->GetByteSize(), result->GetType(), result->GetShape());
286     tensors_read.push_back(tensor_data_item);
287   }
288   return tensors_read;
289 }
290 
ReadTensorsBase(const std::vector<tensor_info_t> info)291 std::vector<TensorBaseData> DbgServices::ReadTensorsBase(const std::vector<tensor_info_t> info) {
292   std::vector<TensorBaseData> tensors_read_base;
293   std::vector<std::shared_ptr<TensorData>> result_list;
294   result_list = ReadTensorsUtil(info);
295   for (auto result : result_list) {
296     if (!result->GetByteSize()) {
297       // tensor not found, adding empty tensor base.
298       TensorBaseData tensor_data_item(0, 0, {});
299       tensors_read_base.push_back(tensor_data_item);
300       continue;
301     }
302     TensorBaseData tensor_data_item(result->GetByteSize(), result->GetType(), result->GetShape());
303     tensors_read_base.push_back(tensor_data_item);
304   }
305   return tensors_read_base;
306 }
307 
AddTensorStatInfo(const DebugServices::TensorStat & tensor_statistics,std::vector<TensorStatData> * const tensors_read_stat)308 void AddTensorStatInfo(const DebugServices::TensorStat &tensor_statistics,
309                        std::vector<TensorStatData> *const tensors_read_stat) {
310   if (tensors_read_stat == nullptr) {
311     MS_LOG(DEBUG) << "tensors_read_stat is nullptr.";
312     return;
313   }
314   TensorStatData tensor_data_item(
315     tensor_statistics.data_size, tensor_statistics.dtype, tensor_statistics.shape, tensor_statistics.is_bool,
316     tensor_statistics.max_value, tensor_statistics.min_value, tensor_statistics.avg_value, tensor_statistics.count,
317     tensor_statistics.neg_zero_count, tensor_statistics.pos_zero_count, tensor_statistics.nan_count,
318     tensor_statistics.neg_inf_count, tensor_statistics.pos_inf_count, tensor_statistics.zero_count);
319   tensors_read_stat->push_back(tensor_data_item);
320 }
321 
ReadTensorsStat(const std::vector<tensor_info_t> info)322 std::vector<TensorStatData> DbgServices::ReadTensorsStat(const std::vector<tensor_info_t> info) {
323   std::vector<TensorStatData> tensors_read_stat;
324   std::vector<std::shared_ptr<TensorData>> result_list;
325   result_list = ReadTensorsUtil(info);
326   for (auto result : result_list) {
327     if (!result->GetByteSize()) {
328       DebugServices::TensorStat tensor_statistics;
329       AddTensorStatInfo(tensor_statistics, &tensors_read_stat);
330       continue;
331     }
332     DebugServices::TensorStat tensor_statistics = debug_services_->GetTensorStatistics(result);
333     AddTensorStatInfo(tensor_statistics, &tensors_read_stat);
334   }
335 
336   return tensors_read_stat;
337 }
338