1 /**
2 * Copyright 2019-2021 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "debug/debug_services.h"
17 #include <dirent.h>
18 #include <algorithm>
19 #include <functional>
20 #include <fstream>
21 #include <future>
22 #include <thread>
23 #include <iterator>
24 #include <map>
25 #include <numeric>
26 #include <unordered_set>
27 #include <utility>
28 #include "pybind11/embed.h"
29 #include "pybind11/stl.h"
30 #ifdef ONLINE_DBG_MODE
31 #include "debug/common.h"
32 #include "debug/debugger/debugger.h"
33 #include "debug/anf_ir_utils.h"
34 #include "backend/session/anf_runtime_algorithm.h"
35 #endif
36 #include "debug/debugger/tensor_summary.h"
37 #include "utils/file_utils.h"
38 #ifdef ONLINE_DBG_MODE
39 namespace mindspore {
40 #endif
DebugServices()41 DebugServices::DebugServices() { tensor_loader_ = std::make_shared<TensorLoader>(); }
42
// Copy constructor: duplicates the watchpoint-analysis cache and dump
// configuration of `other`.  tensor_loader_ is a shared_ptr, so the new
// object shares the same underlying TensorLoader rather than cloning it.
DebugServices::DebugServices(const DebugServices &other) {
  wp_id_cache_ = other.wp_id_cache_;      // per-tensor set of already-analyzed watchpoint ids
  net_name_ = other.net_name_;
  dump_dir_ = other.dump_dir_;
  is_sync_mode_ = other.is_sync_mode_;
  tensor_loader_ = other.tensor_loader_;  // shallow copy: loader is shared, not cloned
  watchpoint_table_ = other.watchpoint_table_;
}
51
// Copy assignment: mirror the copy constructor so both copy operations leave
// the target in the same state.  The original only copied tensor_loader_ and
// watchpoint_table_, silently dropping wp_id_cache_, net_name_, dump_dir_ and
// is_sync_mode_, which left the target inconsistent with `other`.
DebugServices &DebugServices::operator=(const DebugServices &other) {
  if (this != &other) {
    wp_id_cache_ = other.wp_id_cache_;
    net_name_ = other.net_name_;
    dump_dir_ = other.dump_dir_;
    is_sync_mode_ = other.is_sync_mode_;
    tensor_loader_ = other.tensor_loader_;  // shallow copy: loader is shared, not cloned
    watchpoint_table_ = other.watchpoint_table_;
  }
  return *this;
}
59
AddWatchpoint(unsigned int id,unsigned int watch_condition,float parameter,const std::vector<std::tuple<std::string,bool>> & check_node_list,const std::vector<parameter_t> & parameter_list,const std::vector<std::tuple<std::string,std::vector<uint32_t>>> * check_node_device_list,const std::vector<std::tuple<std::string,std::vector<uint32_t>>> * check_node_graph_list)60 void DebugServices::AddWatchpoint(
61 unsigned int id, unsigned int watch_condition, float parameter,
62 const std::vector<std::tuple<std::string, bool>> &check_node_list, const std::vector<parameter_t> ¶meter_list,
63 const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_device_list,
64 const std::vector<std::tuple<std::string, std::vector<uint32_t>>> *check_node_graph_list) {
65 std::lock_guard<std::mutex> lg(lock_);
66
67 watchpoint_t watchpoint_item;
68 watchpoint_item.id = id;
69 watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition);
70 watchpoint_item.condition.parameter = parameter;
71 watchpoint_item.check_node_list = check_node_list;
72 if (check_node_device_list != nullptr) {
73 watchpoint_item.check_node_device_list = *check_node_device_list;
74 }
75 if (check_node_graph_list != nullptr) {
76 watchpoint_item.check_node_graph_list = *check_node_graph_list;
77 }
78 watchpoint_item.parameter_list = parameter_list;
79 watchpoint_table_[id] = watchpoint_item;
80 }
81
RemoveWatchpoint(unsigned int id)82 void DebugServices::RemoveWatchpoint(unsigned int id) {
83 std::lock_guard<std::mutex> lg(lock_);
84 (void)watchpoint_table_.erase(id);
85 }
86
GetSummaryPtr(const std::shared_ptr<TensorData> & tensor,const void * const previous_tensor_ptr,uint32_t num_elements,uint32_t prev_num_elements,int tensor_dtype)87 std::unique_ptr<ITensorSummary> GetSummaryPtr(const std::shared_ptr<TensorData> &tensor,
88 const void *const previous_tensor_ptr, uint32_t num_elements,
89 uint32_t prev_num_elements, int tensor_dtype) {
90 switch (tensor_dtype) {
91 case DbgDataType::DT_UINT8: {
92 return std::make_unique<TensorSummary<uint8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
93 prev_num_elements);
94 }
95 case DbgDataType::DT_INT8: {
96 return std::make_unique<TensorSummary<int8_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
97 prev_num_elements);
98 }
99 case DbgDataType::DT_UINT16: {
100 return std::make_unique<TensorSummary<uint16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
101 prev_num_elements);
102 }
103 case DbgDataType::DT_INT16: {
104 return std::make_unique<TensorSummary<int16_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
105 prev_num_elements);
106 }
107 case DbgDataType::DT_UINT32: {
108 return std::make_unique<TensorSummary<uint32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
109 prev_num_elements);
110 }
111 case DbgDataType::DT_INT32:
112 case DbgDataType::DT_BASE_INT: {
113 return std::make_unique<TensorSummary<int32_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
114 prev_num_elements);
115 }
116 case DbgDataType::DT_UINT64: {
117 return std::make_unique<TensorSummary<uint64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
118 prev_num_elements);
119 }
120 case DbgDataType::DT_INT64: {
121 return std::make_unique<TensorSummary<int64_t>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
122 prev_num_elements);
123 }
124 case DbgDataType::DT_FLOAT16: {
125 return std::make_unique<TensorSummary<float16>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
126 prev_num_elements);
127 }
128 case DbgDataType::DT_FLOAT32:
129 case DbgDataType::DT_BASE_FLOAT: {
130 return std::make_unique<TensorSummary<float>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
131 prev_num_elements);
132 }
133 case DbgDataType::DT_FLOAT64: {
134 return std::make_unique<TensorSummary<double>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
135 prev_num_elements);
136 }
137 case DbgDataType::DT_BOOL: {
138 return std::make_unique<TensorSummary<bool>>(tensor->GetDataPtr(), previous_tensor_ptr, num_elements,
139 prev_num_elements);
140 }
141 default:
142 MS_LOG(INFO) << "Unsupported tensor type";
143 // return a null pointer
144 return std::unique_ptr<TensorSummary<int32_t>>{};
145 }
146 }
147
GetTensorStatistics(const std::shared_ptr<TensorData> & tensor)148 DebugServices::TensorStat DebugServices::GetTensorStatistics(const std::shared_ptr<TensorData> &tensor) {
149 if (tensor == nullptr) {
150 MS_LOG(WARNING) << "Tensor is nullptr, returning empty tensor statistics.";
151 TensorStat empty_tensor_stat_data;
152 return empty_tensor_stat_data;
153 }
154 std::unique_ptr<ITensorSummary> base_summary_ptr;
155 void *previous_tensor_ptr = nullptr;
156 base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, tensor->GetNumElements(), 0, tensor->GetType());
157 if (base_summary_ptr == nullptr) {
158 MS_LOG(WARNING) << "base_summary_ptr is nullptr, returning empty tensor statistics.";
159 TensorStat empty_tensor_stat_data;
160 return empty_tensor_stat_data;
161 }
162 base_summary_ptr->TensorStatistics(tensor->GetType());
163 TensorStat tensor_stat_data(tensor->GetByteSize(), tensor->GetType(), tensor->GetShape(), base_summary_ptr->is_bool(),
164 base_summary_ptr->max_value(), base_summary_ptr->min_value(),
165 base_summary_ptr->avg_value(), base_summary_ptr->count(),
166 base_summary_ptr->neg_zero_count(), base_summary_ptr->pos_zero_count(),
167 base_summary_ptr->nan_count(), base_summary_ptr->neg_inf_count(),
168 base_summary_ptr->pos_inf_count(), base_summary_ptr->zero_count());
169
170 return tensor_stat_data;
171 }
172 #ifdef OFFLINE_DBG_MODE
GetPrevTensor(const std::shared_ptr<TensorData> & tensor,bool previous_iter_tensor_needed,uint32_t * prev_num_elements)173 const void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed,
174 uint32_t *prev_num_elements) {
175 const void *previous_tensor_ptr = nullptr;
176 std::shared_ptr<TensorData> tensor_prev;
177 if (previous_iter_tensor_needed && tensor->GetIteration() >= 1) {
178 // read data in offline mode
179 std::vector<std::string> file_paths;
180 if (!is_sync_mode_) {
181 ConvertReadTensors(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
182 std::vector<unsigned int>{tensor->GetDeviceId()},
183 std::vector<unsigned int>{tensor->GetIteration() - 1},
184 std::vector<unsigned int>{tensor->GetRootGraphId()}, &file_paths);
185 }
186 std::vector<std::shared_ptr<TensorData>> result_list_prev;
187 ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
188 std::vector<unsigned int>{tensor->GetDeviceId()},
189 std::vector<unsigned int>{tensor->GetIteration() - 1},
190 std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
191 file_paths, &result_list_prev);
192 tensor_prev = result_list_prev[0];
193 if (!tensor_prev->GetByteSize()) {
194 tensor_prev.reset();
195 } else {
196 previous_tensor_ptr = tensor_prev->GetDataPtr();
197 *prev_num_elements = tensor_prev->GetNumElements();
198 }
199 }
200 return previous_tensor_ptr;
201 }
202 #endif
203
AddWatchPointsToCheck(bool init_dbg_suspend,bool step_end,bool recheck,const std::shared_ptr<TensorData> & tensor,bool * previous_iter_tensor_needed,std::string * const qualified_tensor_name,std::vector<watchpoint_t> * const watchpoints_to_check)204 void DebugServices::AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end, bool recheck,
205 const std::shared_ptr<TensorData> &tensor, bool *previous_iter_tensor_needed,
206 std::string *const qualified_tensor_name,
207 std::vector<watchpoint_t> *const watchpoints_to_check) {
208 if (tensor == nullptr) {
209 MS_LOG(DEBUG) << "tensor is nullptr.";
210 return;
211 }
212 const auto tensor_name = tensor->GetName();
213 const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
214 const auto tensor_device_id = tensor->GetDeviceId();
215 const auto tensor_root_graph_id = tensor->GetRootGraphId();
216 for (auto w_table_item : watchpoint_table_) {
217 auto wp = std::get<1>(w_table_item);
218 // check ONLY init conditions on initial suspended state.
219 // skip other conditions on initial suspended state
220 if (init_dbg_suspend && (wp.condition.type != INIT)) {
221 continue;
222 }
223 // skip init condition if not init suspend
224 if ((wp.condition.type == INIT) && !init_dbg_suspend) {
225 continue;
226 }
227 // check change conditions only on step end.
228 if (wp.change_condition() && !step_end) {
229 continue;
230 }
231 // if recheck, ignore the cache results and reanalyze everything.
232 // if not a recheck, check only unanalyzed tensors
233 if (!recheck) {
234 wp_lock_.lock();
235 bool wp_cache_hit = wp_id_cache_[tensor_name].count(wp.id);
236 wp_lock_.unlock();
237 if (wp_cache_hit) {
238 continue;
239 }
240 }
241 std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot, tensor_device_id, tensor_root_graph_id);
242 if (!found.empty()) {
243 *qualified_tensor_name = found;
244 watchpoints_to_check->push_back(w_table_item.second);
245 #ifdef OFFLINE_DBG_MODE
246 if (wp.change_condition()) {
247 *previous_iter_tensor_needed = true;
248 }
249 #endif
250 }
251 }
252 }
253
AddAnalyzedTensorToCache(const bool recheck,const unsigned int id,const std::string & tensor_name)254 void DebugServices::AddAnalyzedTensorToCache(const bool recheck, const unsigned int id,
255 const std::string &tensor_name) {
256 // add analyzed tensor to cache
257 if (!recheck) {
258 wp_lock_.lock();
259 (void)wp_id_cache_[tensor_name].insert(id);
260 wp_lock_.unlock();
261 }
262 }
263
SetCheckWatchpointsResult(const int chunk_id,partitioned_names * const chunk_names,partitioned_names * const chunk_slots,partitioned_numbers * const chunk_conditions,partitioned_id * const chunk_watchpoint_id,partitioned_parameters * const chunk_parameters,partitioned_error_code * const chunk_error_codes,partitioned_numbers * const chunk_exec_orders,partitioned_names * const chunk_time_stamp,partitioned_id * const chunk_device_id,partitioned_id * const chunk_root_graph_id,std::vector<unsigned int> * const device_id,std::vector<unsigned int> * const root_graph_id,const int exec_order,const std::string time_stamp,const std::string & qualified_tensor_name,const std::string & tensor_slot,const watchpoint_t & wp,const unsigned int device_id_val,const unsigned int root_graph_id_val,const std::vector<parameter_t> & parameter_list,const int32_t error_code)264 void DebugServices::SetCheckWatchpointsResult(
265 const int chunk_id, partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
266 partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
267 partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
268 partitioned_numbers *const chunk_exec_orders, partitioned_names *const chunk_time_stamp,
269 partitioned_id *const chunk_device_id, partitioned_id *const chunk_root_graph_id,
270 std::vector<unsigned int> *const device_id, std::vector<unsigned int> *const root_graph_id, const int exec_order,
271 const std::string time_stamp, const std::string &qualified_tensor_name, const std::string &tensor_slot,
272 const watchpoint_t &wp, const unsigned int device_id_val, const unsigned int root_graph_id_val,
273 const std::vector<parameter_t> ¶meter_list, const int32_t error_code) {
274 (void)(*chunk_exec_orders)[chunk_id].emplace_back(exec_order);
275 (void)(*chunk_names)[chunk_id].emplace_back(qualified_tensor_name);
276 (void)(*chunk_slots)[chunk_id].emplace_back(tensor_slot);
277 (void)(*chunk_conditions)[chunk_id].emplace_back(wp.condition.type);
278 (void)(*chunk_watchpoint_id)[chunk_id].emplace_back(wp.id);
279 if (device_id != nullptr) {
280 (void)(*chunk_device_id)[chunk_id].emplace_back(device_id_val);
281 }
282 if (root_graph_id != nullptr) {
283 (void)(*chunk_root_graph_id)[chunk_id].emplace_back(root_graph_id_val);
284 }
285 (void)(*chunk_parameters)[chunk_id].emplace_back(parameter_list);
286 (void)(*chunk_error_codes)[chunk_id].emplace_back(error_code);
287 (void)(*chunk_time_stamp)[chunk_id].emplace_back(time_stamp);
288 }
289
290 #ifdef OFFLINE_DBG_MODE
ProcessCheckpointsOutofMemory(const bool no_mem_to_read,const std::vector<watchpoint_t> watchpoints_to_check,int chunk_id,partitioned_names * const chunk_names,partitioned_names * const chunk_slots,partitioned_numbers * const chunk_conditions,partitioned_id * const chunk_watchpoint_id,partitioned_parameters * const chunk_parameters,partitioned_error_code * const chunk_error_codes,partitioned_numbers * const chunk_exec_orders,partitioned_names * const chunk_time_stamp,partitioned_id * const chunk_device_id,partitioned_id * const chunk_root_graph_id,std::vector<unsigned int> * const device_id,std::vector<unsigned int> * const root_graph_id,const int exec_order,const std::string time_stamp,const std::string & qualified_tensor_name,const std::string & tensor_slot,const unsigned int device_id_val,const unsigned int root_graph_id_val,const std::vector<parameter_t> & parameter_list)291 void DebugServices::ProcessCheckpointsOutofMemory(
292 const bool no_mem_to_read, const std::vector<watchpoint_t> watchpoints_to_check, int chunk_id,
293 partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
294 partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
295 partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
296 partitioned_numbers *const chunk_exec_orders, partitioned_names *const chunk_time_stamp,
297 partitioned_id *const chunk_device_id, partitioned_id *const chunk_root_graph_id,
298 std::vector<unsigned int> *const device_id, std::vector<unsigned int> *const root_graph_id, const int exec_order,
299 const std::string time_stamp, const std::string &qualified_tensor_name, const std::string &tensor_slot,
300 const unsigned int device_id_val, const unsigned int root_graph_id_val,
301 const std::vector<parameter_t> ¶meter_list) {
302 if (no_mem_to_read) {
303 // bit 3 denotes failed to load tensor because tensor is oversized and no enough memory to fit in
304 int32_t oversize_error_code = 8;
305 for (auto &wp : watchpoints_to_check) {
306 SetCheckWatchpointsResult(chunk_id, chunk_names, chunk_slots, chunk_conditions, chunk_watchpoint_id,
307 chunk_parameters, chunk_error_codes, chunk_exec_orders, chunk_time_stamp,
308 chunk_device_id, chunk_root_graph_id, device_id, root_graph_id, exec_order, time_stamp,
309 qualified_tensor_name, tensor_slot, wp, device_id_val, root_graph_id_val,
310 parameter_list, oversize_error_code);
311 }
312 }
313 }
314 #endif
315
// Worker body for CheckWatchpoints: scans tensors [begin, end) of tensor_list
// and records watchpoint hits into the per-chunk output vectors at index
// chunk_id.  Each async worker owns exactly one chunk index, so workers do
// not contend on these buffers.
void DebugServices::CheckWatchpointsForTensor(
  partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  const std::vector<std::string> &op_overflows, const std::vector<std::string> &async_file_pool,
  partitioned_numbers *const chunk_exec_orders, std::vector<std::shared_ptr<TensorData>> *const tensor_list, int begin,
  int end, int chunk_id, const bool init_dbg_suspend, const bool step_end, const bool recheck,
  partitioned_id *const chunk_device_id, partitioned_id *const chunk_root_graph_id,
  std::vector<uint64_t> *const chunk_tensor_byte_size, partitioned_names *const chunk_time_stamp,
  std::vector<unsigned int> *const device_id, std::vector<unsigned int> *const root_graph_id) {
  // Clamp the half-open range to the actual list size.
  int list_size = tensor_list->size();
  if (end > list_size) {
    end = list_size;
  }
  for (int i = begin; i < end; i++) {
    auto &tensor = (*tensor_list)[i];
    const auto tensor_name = tensor->GetName();
    // Tensor names are "<node>:<slot>"; watchpoints match on the node part.
    const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
    const auto tensor_slot = std::to_string(tensor->GetSlot());
    std::vector<watchpoint_t> watchpoints_to_check;
    std::string qualified_tensor_name;
    bool previous_iter_tensor_needed = false;
    AddWatchPointsToCheck(init_dbg_suspend, step_end, recheck, tensor, &previous_iter_tensor_needed,
                          &qualified_tensor_name, &watchpoints_to_check);
    // no wp set on current tensor
    if (watchpoints_to_check.empty()) {
      continue;
    }
#ifdef OFFLINE_DBG_MODE
    // read data in offline mode
    bool no_mem_to_read = false;
    std::vector<std::shared_ptr<TensorData>> result_list;
    ReadDumpedTensor(std::vector<std::string>{tensor->GetName()}, std::vector<size_t>{tensor->GetSlot()},
                     std::vector<unsigned int>{tensor->GetDeviceId()},
                     std::vector<unsigned int>{tensor->GetIteration()},
                     std::vector<unsigned int>{tensor->GetRootGraphId()}, std::vector<bool>{tensor->GetIsOutput()},
                     async_file_pool, &result_list, &no_mem_to_read);
    tensor = result_list[0];
    if (!tensor->GetByteSize()) {
      // The tensor could not be loaded; if it was for lack of memory, report
      // the failure against every watchpoint that targeted it.
      ProcessCheckpointsOutofMemory(
        no_mem_to_read, watchpoints_to_check, chunk_id, chunk_names, chunk_slots, chunk_conditions, chunk_watchpoint_id,
        chunk_parameters, chunk_error_codes, chunk_exec_orders, chunk_time_stamp, chunk_device_id, chunk_root_graph_id,
        device_id, root_graph_id, tensor->GetExecutionOrder(), tensor->GetTimeStamp(), qualified_tensor_name,
        tensor_slot, tensor->GetDeviceId(), tensor->GetRootGraphId(), std::vector<parameter_t>());
      tensor.reset();
      continue;
    }
#endif
    // no elements to analyze
    if (tensor->GetByteSize() == 0) {
      continue;
    }
    (*chunk_tensor_byte_size)[chunk_id] += tensor->GetByteSize();
    int tensor_dtype = tensor->GetType();
    uint32_t num_elements = tensor->GetNumElements();
    uint32_t prev_num_elements = 0;
    const void *previous_tensor_ptr = nullptr;
#ifdef OFFLINE_DBG_MODE
    // Offline: re-read the previous iteration's dump, but only when some
    // matched watchpoint has a change condition.
    previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed, &prev_num_elements);
#else
    // Online: the previous value comes from the tensor-loader cache.
    std::shared_ptr<TensorData> prev_tensor_data = tensor_loader_->GetPrevTensor(tensor_name);
    if (prev_tensor_data) {
      previous_tensor_ptr = prev_tensor_data->GetDataPtr();
      prev_num_elements = prev_tensor_data->GetNumElements();
    }
#endif

    // Summarizing the tensor values is skipped when the only watchpoint is an
    // overflow check, which is answered by CheckOpOverflow instead.
    std::unique_ptr<ITensorSummary> base_summary_ptr;
    if (!(watchpoints_to_check.size() == 1 && watchpoints_to_check[0].condition.type == IS_OVERFLOW)) {
      base_summary_ptr = GetSummaryPtr(tensor, previous_tensor_ptr, num_elements, prev_num_elements, tensor_dtype);
      if (base_summary_ptr != nullptr) {
        base_summary_ptr->SummarizeTensor(watchpoints_to_check);
      }
    }
    for (auto &wp : watchpoints_to_check) {
      bool is_hit = false;
      int error_code = 0;
      std::vector<parameter_t> parameter_list = {};
      if (wp.condition.type == IS_OVERFLOW) {
        is_hit =
          CheckOpOverflow(tensor_name_no_slot, tensor->GetDeviceId(), tensor->GetRootGraphId(), tensor->GetIteration());
      } else if (base_summary_ptr != nullptr) {
        auto item = base_summary_ptr->IsWatchpointHit(wp);
        is_hit = std::get<ITensorSummary::eHitPos>(item);
        error_code = std::get<ITensorSummary::eErrorCodePos>(item);
        parameter_list = std::get<ITensorSummary::eParamListPos>(item);
      }
      // Mark this (tensor, watchpoint) pair analyzed so a non-recheck pass
      // skips it next time.
      AddAnalyzedTensorToCache(recheck, wp.id, tensor_name);
      if (is_hit || error_code) {
        SetCheckWatchpointsResult(
          chunk_id, chunk_names, chunk_slots, chunk_conditions, chunk_watchpoint_id, chunk_parameters,
          chunk_error_codes, chunk_exec_orders, chunk_time_stamp, chunk_device_id, chunk_root_graph_id, device_id,
          root_graph_id, tensor->GetExecutionOrder(), tensor->GetTimeStamp(), qualified_tensor_name, tensor_slot, wp,
          tensor->GetDeviceId(), tensor->GetRootGraphId(), parameter_list, error_code);
      }
    }

#ifdef OFFLINE_DBG_MODE
    // set the tensor into not-in-use status in tensor_loader.
    std::string key_name_in_cache = tensor_name + ":" + std::to_string(tensor->GetDeviceId()) + ":" +
                                    std::to_string(tensor->GetRootGraphId()) + ":" +
                                    std::to_string(tensor->GetIsOutput()) + ":" + std::to_string(tensor->GetSlot());
    AppendToCacheEvictQueue(key_name_in_cache);
    if (previous_tensor_ptr != nullptr) {
      AppendToCacheEvictQueue(key_name_in_cache + ":prev");
    }
    // in offline mode remove the need for the data
    tensor.reset();
#endif
  }
}
// Check every registered watchpoint against every tensor in tensor_list,
// splitting the work across up to 16 std::async workers.  Each worker writes
// into its own chunk of the partitioned buffers; SortWatchpointsInfo then
// merges all chunks (sorted) into the flat output vectors (name, slot,
// condition, watchpoint_id, parameters, error_codes, device_id,
// root_graph_id).  Returns immediately when there are no watchpoints or no
// tensors.
void DebugServices::CheckWatchpoints(
  std::vector<std::string> *const name, std::vector<std::string> *const slot, std::vector<int> *const condition,
  std::vector<unsigned int> *const watchpoint_id, std::vector<std::vector<parameter_t>> *const parameters,
  std::vector<int32_t> *const error_codes, const std::vector<std::string> &op_overflows,
  const std::vector<std::string> &async_file_pool, std::vector<std::shared_ptr<TensorData>> *const tensor_list,
  const bool init_dbg_suspend, const bool step_end, const bool recheck, std::vector<unsigned int> *const device_id,
  std::vector<unsigned int> *const root_graph_id) {
  std::lock_guard<std::mutex> lg(lock_);
  auto t1 = std::chrono::high_resolution_clock::now();
  if (watchpoint_table_.empty()) {
    return;
  }
  // vector to store execution order of tensors hit
  std::vector<int> exec_order;
  std::vector<std::string> time_stamps;
  int tensor_list_size = tensor_list->size();
  uint64_t tensor_list_byte_size = 0;
  MS_LOG(INFO) << "tensor list size: " << tensor_list_size;
  if (tensor_list_size <= 0) {
    return;
  }
  // default value for number of threads
  const int default_thread_num = 16;
  // Never spawn more workers than there are tensors to check.
  int max_thread_num = default_thread_num;
  if (max_thread_num > tensor_list_size) {
    max_thread_num = tensor_list_size;
  }
  MS_LOG(INFO) << "Number of threads used for checkwatchpoint: " << max_thread_num;
  // Contiguous chunking: the first `remainder` workers get one extra tensor.
  int chunk_size = tensor_list_size / max_thread_num;
  int remainder = tensor_list_size % max_thread_num;
  // One slot per worker in every buffer, so workers never contend.
  partitioned_numbers chunk_exec_orders(max_thread_num);
  partitioned_names chunk_names(max_thread_num);
  partitioned_names chunk_slots(max_thread_num);
  partitioned_numbers chunk_conditions(max_thread_num);
  partitioned_id chunk_watchpoint_id(max_thread_num);
  partitioned_parameters chunk_parameters(max_thread_num);
  partitioned_error_code chunk_error_codes(max_thread_num);
  partitioned_id chunk_device_id(max_thread_num);
  partitioned_id chunk_root_graph_id(max_thread_num);
  std::vector<uint64_t> chunk_tensor_byte_size(max_thread_num, 0);
  partitioned_names chunk_time_stamp(max_thread_num);

  std::vector<std::future<void>> tensor_future_vec;
  int begin = 0;
  int end = begin;
  for (int i = 0; i < max_thread_num; i++) {
    end += chunk_size;
    if (remainder > 0) {
      end++;
      remainder--;
    }
    (void)tensor_future_vec.emplace_back(std::async(
      std::launch::async, &DebugServices::CheckWatchpointsForTensor, this, &chunk_names, &chunk_slots,
      &chunk_conditions, &chunk_watchpoint_id, &chunk_parameters, &chunk_error_codes, op_overflows, async_file_pool,
      &chunk_exec_orders, tensor_list, begin, end, i, init_dbg_suspend, step_end, recheck, &chunk_device_id,
      &chunk_root_graph_id, &chunk_tensor_byte_size, &chunk_time_stamp, device_id, root_graph_id));
    begin = end;
  }

  // Wait for all workers and merge their chunk buffers into the output
  // vectors in sorted order.
  SortWatchpointsInfo(&tensor_future_vec, &exec_order, &time_stamps, &tensor_list_byte_size, name, slot, condition,
                      watchpoint_id, parameters, error_codes, &chunk_names, &chunk_slots, &chunk_conditions,
                      &chunk_watchpoint_id, &chunk_parameters, &chunk_error_codes, &chunk_exec_orders,
                      &chunk_time_stamp, &chunk_tensor_byte_size, &chunk_device_id, &chunk_root_graph_id, device_id,
                      root_graph_id);

  auto t2 = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double, std::milli> ms_double = t2 - t1;
  MS_LOG(INFO) << "tensor_list byte size is " << tensor_list_byte_size / pow(10.0, 6.0) << " MB";
  MS_LOG(INFO) << "CheckWatchpoints Took: " << ms_double.count() / 1000 << "s";
}
497
// Join all worker futures and merge the per-chunk results into the flat
// output vectors.  Each hit record is inserted at the position that keeps the
// merged output ordered -- by execution order in online mode, by dump time
// stamp in offline mode -- and every parallel output vector is inserted at
// the same position so records stay aligned.  Consumed chunk buffers are
// released immediately (swap with empty vectors) to bound peak memory.
void DebugServices::SortWatchpointsInfo(
  std::vector<std::future<void>> *const tensor_future_vec, std::vector<int> *const exec_order,
  std::vector<std::string> *const time_stamps, uint64_t *const tensor_list_byte_size,
  std::vector<std::string> *const name, std::vector<std::string> *const slot, std::vector<int> *const condition,
  std::vector<unsigned int> *const watchpoint_id, std::vector<std::vector<parameter_t>> *const parameters,
  std::vector<int32_t> *const error_codes, partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
  partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
  partitioned_parameters *const chunk_parameters, partitioned_error_code *const chunk_error_codes,
  partitioned_numbers *const chunk_exec_orders, partitioned_names *const chunk_time_stamp,
  std::vector<uint64_t> *const chunk_tensor_byte_size, partitioned_id *const chunk_device_id,
  partitioned_id *const chunk_root_graph_id, std::vector<unsigned int> *const device_id,
  std::vector<unsigned int> *const root_graph_id) {
  for (unsigned int i = 0; i < (*tensor_future_vec).size(); i++) {
    // Block until worker i is done; get() also rethrows any exception the
    // worker raised.
    (*tensor_future_vec)[i].wait();
    (*tensor_future_vec)[i].get();
    for (unsigned int j = 0; j < (*chunk_exec_orders)[i].size(); j++) {
#ifdef ONLINE_DBG_MODE
      // if the execution order is repeated,inserts the new one before the others with same execution order.
      std::vector<int>::iterator iter =
        std::lower_bound(exec_order->begin(), exec_order->end(), (*chunk_exec_orders)[i][j]);
      int position = iter - exec_order->begin();
      (void)exec_order->emplace(iter, (*chunk_exec_orders)[i][j]);
#endif
#ifdef OFFLINE_DBG_MODE
      // Offline mode orders hits by dump time stamp instead.
      std::vector<std::string>::iterator iter =
        std::lower_bound(time_stamps->begin(), time_stamps->end(), (*chunk_time_stamp)[i][j]);
      int position = iter - time_stamps->begin();
      (void)time_stamps->emplace(iter, (*chunk_time_stamp)[i][j]);
#endif
      // Insert every parallel vector at the same position to keep records
      // index-aligned across all outputs.
      (void)name->emplace(name->begin() + position, (*chunk_names)[i][j]);
      (void)slot->emplace(slot->begin() + position, (*chunk_slots)[i][j]);
      (void)condition->emplace(condition->begin() + position, (*chunk_conditions)[i][j]);
      (void)watchpoint_id->emplace(watchpoint_id->begin() + position, (*chunk_watchpoint_id)[i][j]);
      if (device_id != nullptr) {
        (void)device_id->emplace(device_id->begin() + position, (*chunk_device_id)[i][j]);
      }
      if (root_graph_id != nullptr) {
        (void)root_graph_id->emplace(root_graph_id->begin() + position, (*chunk_root_graph_id)[i][j]);
      }
      (void)parameters->emplace(parameters->begin() + position, (*chunk_parameters)[i][j]);
      (void)error_codes->emplace(error_codes->begin() + position, (*chunk_error_codes)[i][j]);
    }
    // free the memory for used vectors
    std::vector<int>().swap((*chunk_exec_orders)[i]);
    std::vector<std::string>().swap((*chunk_time_stamp)[i]);
    std::vector<std::string>().swap((*chunk_names)[i]);
    std::vector<std::string>().swap((*chunk_slots)[i]);
    std::vector<int>().swap((*chunk_conditions)[i]);
    std::vector<unsigned int>().swap((*chunk_watchpoint_id)[i]);
    std::vector<std::vector<parameter_t>>().swap((*chunk_parameters)[i]);
    std::vector<int32_t>().swap((*chunk_error_codes)[i]);
    std::vector<unsigned int>().swap((*chunk_device_id)[i]);
    std::vector<unsigned int>().swap((*chunk_root_graph_id)[i]);
    // Accumulate the total bytes analyzed for CheckWatchpoints' summary log.
    (*tensor_list_byte_size) += (*chunk_tensor_byte_size)[i];
  }
}
554
555 #ifdef OFFLINE_DBG_MODE
ReadTensorFromNpy(const std::string & tensor_name,const std::string & file_name,std::string * const tensor_type,std::size_t * const size,std::vector<int64_t> * const shape,std::vector<char> ** const data_buffer,bool * no_mem_to_read)556 void DebugServices::ReadTensorFromNpy(const std::string &tensor_name, const std::string &file_name,
557 std::string *const tensor_type, std::size_t *const size,
558 std::vector<int64_t> *const shape, std::vector<char> **const data_buffer,
559 bool *no_mem_to_read) {
560 std::ifstream infile;
561 std::string file_path = file_name;
562 MS_LOG(INFO) << "Reading in file: " << file_path;
563 infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
564 if (!infile.is_open()) {
565 MS_LOG(ERROR) << "Failed to open file (In ReadTensorFromNpy) " << file_path << " Errno:" << errno;
566 const int kMaxFilenameLength = 128;
567 char err_info[kMaxFilenameLength];
568 auto ret = strerror_r(errno, err_info, sizeof(err_info));
569 if (ret != nullptr) {
570 MS_LOG(ERROR) << " ErrInfo:" << ret;
571 }
572 return;
573 }
574 const int substr_len = 2;
575 const int header_len_offset = 8;
576 const int header_offset = 9;
577 const int header_len_buffer_size = 2;
578 const int type_offset = 10;
579 // get header length
580 (void)infile.seekg(0, std::ios::beg);
581 auto header_len_buffer = std::make_unique<std::vector<char>>(header_len_offset + header_len_buffer_size);
582 if (!infile.read(header_len_buffer->data(), header_len_offset + header_len_buffer_size)) {
583 MS_LOG(ERROR) << "Failed to parse header length from " << file_path;
584 return;
585 }
586 uint16_t header_len = *reinterpret_cast<uint16_t *>(header_len_buffer->data() + header_len_offset);
587 header_len_buffer.reset();
588 // read in header
589 (void)infile.seekg(0, std::ios::beg);
590 auto header_buffer = std::make_unique<std::vector<char>>(header_len_offset + header_len);
591 if (!infile.read(header_buffer->data(), header_len_offset + header_len)) {
592 MS_LOG(ERROR) << "Failed to read header from " << file_path;
593 return;
594 }
595 std::string header(header_buffer->data() + header_offset, header_len);
596 header_buffer.reset();
597 std::size_t type_i = header.find("descr") + type_offset;
598 if (header.length() < type_i + substr_len) {
599 MS_LOG(ERROR) << "Cannot get tensor_type, header length is " << header.length();
600 return;
601 }
602 *tensor_type = header.substr(type_i, substr_len);
603 std::size_t shape_i_open = header.find("(");
604 std::size_t shape_i_close = header.find(")");
605 std::string shape_str = header.substr(shape_i_open + 1, shape_i_close - shape_i_open - 1);
606 std::string intermediate;
607 std::stringstream check_shape(shape_str);
608 MS_LOG(INFO) << "Shape of " << file_name << " is: [" << shape_str << "]";
609 while (getline(check_shape, intermediate, ',')) {
610 shape->push_back(std::stoi(intermediate));
611 }
612 std::size_t word_size = std::stoul(std::string(1, (*tensor_type)[1]));
613 std::size_t data_len = std::accumulate(shape->begin(), shape->end(), 1, std::multiplies<uint64_t>());
614 std::size_t data_size = data_len * word_size;
615 if (!data_size) {
616 return;
617 }
618 // Check memory available before loading tensor into host.
619 bool has_enough_memory = true;
620 if (tensor_loader_->EnableMemoryControl()) {
621 has_enough_memory = tensor_loader_->CheckMemoryAvailable(tensor_name, data_size);
622 }
623 if (!has_enough_memory) {
624 MS_LOG(ERROR) << "No enough memory available for loading " << tensor_name << " into host memory.";
625 *no_mem_to_read = true;
626 } else {
627 (void)infile.seekg(header_len + type_offset);
628 *data_buffer = new std::vector<char>(data_size);
629 if ((*data_buffer) == nullptr || !infile.read((*data_buffer)->data(), data_size)) {
630 MS_LOG(ERROR) << "Unable to get tensor data from npy";
631 }
632 *size = data_size;
633 }
634 }
635
// For each dump directory, collects the device-format files that do not yet have a
// converted counterpart in *result_list, converts them to host-format npy files via
// the mindspore.offline_debug.convert_async python package, and appends the
// resulting npy paths to *result_list.
void DebugServices::ConvertToHostFormat(const std::map<std::string, std::vector<std::string>> &dir_to_files_map,
                                        std::vector<std::string> *const result_list) {
  std::string file_format = "npy";
  for (auto const &d : dir_to_files_map) {
    std::vector<std::string> files_to_convert_in_dir;
    std::vector<std::string> files_after_convert_in_dir;
    std::string dump_key = d.first;
    for (auto const &file_name : d.second) {
      bool already_converted = false;
      // Remove scope from the file_name for matching files converted by mindinsight tool.
      std::size_t found_first_dot = file_name.find(".");
      std::size_t found_last_underscore = file_name.find_last_of("_");
      std::string file_name_without_scope = file_name;
      if (found_last_underscore != std::string::npos && found_last_underscore > found_first_dot) {
        // Erase everything between the first '.' and the last '_' (the scope part).
        file_name_without_scope =
          file_name_without_scope.erase(found_first_dot + 1, found_last_underscore - found_first_dot);
      }
      // Skip files that already have a converted counterpart recorded.
      for (std::string &file_found : *result_list) {
        if (file_found.find(file_name_without_scope) != std::string::npos) {
          already_converted = true;
          break;
        }
      }
      if (!already_converted) {
        // Keep both the on-disk (device-format) path and the expected
        // post-conversion name so the converted file can be located afterwards.
        (void)files_to_convert_in_dir.emplace_back(dump_key + "/" + file_name);
        (void)files_after_convert_in_dir.emplace_back(dump_key + "/" + file_name_without_scope);
      }
    }
    MS_LOG(INFO) << "Number of files to convert: " << files_to_convert_in_dir.size();
    if (!files_to_convert_in_dir.empty()) {
      // Look for the installation path to the conver_async package. If not found, throw exception and terminate the
      // later task.
      try {
        auto pkg = pybind11::module::import("mindspore.offline_debug.convert_async");
        auto convert_obj = pkg.attr("AsyncDumpConverter")(pybind11::cast(files_to_convert_in_dir), dump_key);
        (void)convert_obj.attr("convert_files")();
      } catch (pybind11::error_already_set &e) {
        MS_LOG(EXCEPTION) << "Failed to convert async dump data: " << e.what();
      }
      // Scan the directory for the files produced above and record them in result_list.
      ProcessConvertToHostFormat(files_after_convert_in_dir, dump_key, result_list, file_format);
    }
  }
}
679
ProcessConvertToHostFormat(const std::vector<std::string> & files_after_convert_in_dir,const std::string & dump_key,std::vector<std::string> * const result_list,const std::string & file_format)680 void DebugServices::ProcessConvertToHostFormat(const std::vector<std::string> &files_after_convert_in_dir,
681 const std::string &dump_key, std::vector<std::string> *const result_list,
682 const std::string &file_format) {
683 std::string real_dump_iter_dir = RealPath(dump_key);
684 DIR *d_handle = opendir(real_dump_iter_dir.c_str());
685 if (d_handle == nullptr) {
686 MS_LOG(ERROR) << "Directory does not exit in ConvertToHostFormat.";
687 return;
688 }
689 struct dirent *dir = nullptr;
690 while ((dir = readdir(d_handle)) != nullptr) {
691 if (dir->d_type == DT_REG) {
692 std::string candidate = dir->d_name;
693 for (const std::string &file_to_find : files_after_convert_in_dir) {
694 std::string file_n = file_to_find;
695 auto last_slash_pos = file_to_find.find_last_of("\\/");
696 if (last_slash_pos != std::string::npos) {
697 file_n = file_to_find.substr(last_slash_pos + 1);
698 }
699 if (candidate.find(file_n) != std::string::npos && candidate.rfind(file_format) != std::string::npos) {
700 // we found a converted file for this op
701 std::string found_file = dump_key + "/" + candidate;
702 if (std::find(result_list->begin(), result_list->end(), found_file) == result_list->end()) {
703 result_list->push_back(found_file);
704 }
705 }
706 }
707 }
708 }
709 (void)closedir(d_handle);
710 }
711
// Returns the last path-style component of a dump node name, i.e. the text after
// the final '/'. An empty input yields "", and a name without any scope separator
// is returned unchanged.
std::string GetNodeNameWithoutScope(const std::string &dump_style_name) {
  if (dump_style_name.empty()) {
    return "";
  }
  const std::size_t last_scope_pos = dump_style_name.find_last_of('/');
  return (last_scope_pos == std::string::npos) ? dump_style_name : dump_style_name.substr(last_scope_pos + 1);
}
724
// Converts the device-format dump files for the requested tensors (parallel vectors
// backend_name / slot / device_id / iteration / root_graph_id) into host-format npy
// files and appends the converted file paths to *result_list. Async dump mode only.
void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, std::vector<size_t> slot,
                                       std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
                                       std::vector<unsigned int> root_graph_id,
                                       std::vector<std::string> *const result_list) {
  std::string file_format = "npy";
  std::map<std::string, std::vector<std::string>> dir_to_files_map;
  for (unsigned int i = 0; i < backend_name.size(); i++) {
    // form prefix of the tensor file to read from graph pb node name
    std::string dump_style_kernel_name = backend_name[i];

    // remove slot from name (backend names carry a ":slot" suffix)
    std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
    dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);

    std::string prefix_dump_file_name = GetNodeNameWithoutScope(dump_style_kernel_name);

    // Dump layout: <dump_dir>/rank_<device>/<net_name>/<graph>/<iteration>.
    std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
                                    std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);

    // search files in dir for the one that meets the filename prefix and read the file into memory
    std::string abspath = RealPath(specific_dump_dir);
    DIR *d = opendir(abspath.c_str());
    if (d == nullptr) {
      MS_LOG(ERROR) << "Directory does not exist in ConvertReadTensors.";
      return;
    }
    ProcessConvertList(prefix_dump_file_name, file_format, specific_dump_dir, &dir_to_files_map, result_list);
    (void)closedir(d);
  }
  ConvertToHostFormat(dir_to_files_map, result_list);
}
756
ConvertWatchPointNodes(const std::vector<std::tuple<std::string,std::string>> & proto_dump,const std::string & specific_dump_dir,std::vector<std::string> * const result_list)757 void DebugServices::ConvertWatchPointNodes(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
758 const std::string &specific_dump_dir,
759 std::vector<std::string> *const result_list) {
760 std::string file_format = "npy";
761 std::map<std::string, std::vector<std::string>> dir_to_files_map;
762 for (const auto &node : proto_dump) {
763 std::string dump_name = std::get<1>(node);
764 dump_name = dump_name.substr(0, dump_name.rfind("."));
765 // search files in dir for the one that meets the filename prefix and read the file into memory
766 std::string abspath = RealPath(specific_dump_dir);
767 DIR *d = opendir(abspath.c_str());
768 if (d == nullptr) {
769 MS_LOG(ERROR) << "Directory " << specific_dump_dir.c_str() << " does not exist in ConvertWatchPointNodes.";
770 return;
771 }
772 ProcessConvertList(dump_name, file_format, specific_dump_dir, &dir_to_files_map, result_list);
773 (void)closedir(d);
774 }
775 ConvertToHostFormat(dir_to_files_map, result_list);
776 }
777
ProcessConvertList(const std::string & prefix_dump_file_name,const std::string & file_format,const std::string & specific_dump_dir,std::map<std::string,std::vector<std::string>> * dir_to_files_map,std::vector<std::string> * const result_list)778 void DebugServices::ProcessConvertList(const std::string &prefix_dump_file_name, const std::string &file_format,
779 const std::string &specific_dump_dir,
780 std::map<std::string, std::vector<std::string>> *dir_to_files_map,
781 std::vector<std::string> *const result_list) {
782 DIR *d = opendir(specific_dump_dir.c_str());
783 struct dirent *dir = nullptr;
784 while ((dir = readdir(d)) != nullptr) {
785 if (dir->d_type != DT_REG) {
786 continue;
787 }
788 std::string file_name = dir->d_name;
789 std::string file_name_w_o_perfix = file_name;
790 auto type_pos = file_name.find('.');
791 if (type_pos == std::string::npos || file_name.find(prefix_dump_file_name, type_pos + 1) == std::string::npos) {
792 continue;
793 }
794 if (file_name.rfind(file_format) == std::string::npos) {
795 // if file matches prefix and is in device format add to candidate files to convert.
796 (*dir_to_files_map)[specific_dump_dir].push_back(file_name);
797 } else {
798 // otherwise, if file matches prefix and already has been converted to host format
799 // add to result of converted files.
800 std::string found_file = specific_dump_dir + "/" + file_name;
801 if (std::find(result_list->begin(), result_list->end(), found_file) == result_list->end()) {
802 result_list->push_back(found_file);
803 }
804 }
805 }
806 (void)closedir(d);
807 }
808
// Creates placeholder TensorData entries (no payload read yet) for every slot of
// every watched node that has a converted npy file in async_file_pool for this
// device / root graph / iteration. Actual bytes are fetched later on demand.
void DebugServices::GetTensorDataInfoAsync(const std::vector<std::tuple<std::string, std::string>> &proto_dump,
                                           const std::string &specific_dump_dir, uint32_t iteration, uint32_t device_id,
                                           uint32_t root_graph_id, const std::vector<std::string> &async_file_pool,
                                           std::vector<std::shared_ptr<TensorData>> *const tensor_list) {
  for (auto &node : proto_dump) {
    std::vector<size_t> slot_list;
    std::string dump_style_name = std::get<1>(node);
    // Get dump_name and output_str from the second element of tuple
    std::size_t found_dot = dump_style_name.rfind(".");
    std::string dump_name = dump_style_name.substr(0, found_dot);
    std::string output_str = dump_style_name.substr(found_dot + 1);
    bool output_flag = (output_str == "output");

    for (const std::string &file_name : async_file_pool) {
      std::size_t found = file_name.find(dump_name);
      std::size_t found_out = file_name.find(output_str);
      std::size_t found_dot_start = file_name.find(".", found_out);
      // NOTE(review): find(".", found_dot_start) returns found_dot_start itself
      // (find includes its start position), so the substr below effectively runs
      // to end-of-string and std::stoul stops at the first non-digit. This works
      // for names like "...output.0.<format>.npy" but is fragile — confirm.
      std::size_t found_dot_end = file_name.find(".", found_dot_start);

      if (file_name.find(specific_dump_dir) != std::string::npos && found != std::string::npos &&
          found_out != std::string::npos) {
        // Parse the slot number that follows the input/output marker.
        slot_list.push_back(std::stoul(file_name.substr(found_dot_start + 1, found_dot_end - found_dot_start - 1)));
      }
    }
    for (auto slot : slot_list) {
      // add a TensorData entry (data will be read when needed)
      std::vector<int64_t> shape;
      std::string orig_name = std::get<0>(node);
      auto tensor_data = std::make_shared<TensorData>();
      tensor_data->SetName(orig_name);
      tensor_data->SetExecutionOrder(0);
      tensor_data->SetSlot(slot);
      tensor_data->SetIteration(iteration);
      tensor_data->SetDeviceId(device_id);
      tensor_data->SetRootGraphId(root_graph_id);
      tensor_data->SetDataPtr(nullptr);  // no payload yet; filled in when read
      tensor_data->SetByteSize(0);
      tensor_data->SetType("");
      tensor_data->SetShape(shape);
      tensor_data->SetIsOutput(output_flag);

      tensor_list->push_back(tensor_data);
    }
  }
}
854
// Builds a TensorData record from the given metadata/buffer, registers it with the
// tensor-loader cache (only when it actually carries data), and appends it to
// *result_list. Note: the record keeps a raw pointer into *buffer (SetDataPtr), so
// the buffer must outlive the record; buffer may be nullptr for placeholder entries.
void DebugServices::AddToTensorData(const std::string &backend_name, const std::string &time_stamp,
                                    const std::size_t slot, const unsigned int iteration, const unsigned int device_id,
                                    const unsigned int root_graph_id, const bool is_output, const std::size_t data_size,
                                    const std::string &type_name, const std::vector<int64_t> &shape,
                                    std::vector<char> *buffer,
                                    std::vector<std::shared_ptr<TensorData>> *const result_list) {
  // call LoadNewTensor to store tensor in internal cache
  auto tensor_data = std::make_shared<TensorData>();
  tensor_data->SetName(backend_name);
  tensor_data->SetExecutionOrder(0);
  tensor_data->SetSlot(slot);
  tensor_data->SetIteration(iteration);
  tensor_data->SetDeviceId(device_id);
  tensor_data->SetRootGraphId(root_graph_id);
  tensor_data->SetIsOutput(is_output);
  if (buffer != nullptr) {
    tensor_data->SetDataPtr(buffer->data());
  } else {
    tensor_data->SetDataPtr(nullptr);
  }
  tensor_data->SetByteSize(data_size);
  tensor_data->SetType(type_name);
  tensor_data->SetShape(shape);
  tensor_data->SetTimeStamp(time_stamp);
  // Placeholder entries (data_size == 0) are not cached.
  if (data_size) {
    (void)tensor_loader_->LoadNewTensor(tensor_data, false);
  }

  // add to result_list
  result_list->push_back(tensor_data);
}
886
SetPrefixToCheck(std::string * const prefix_dump_file_name,std::string * const slot_string_to_check,std::string * const dump_style_kernel_name,size_t slot,bool is_output)887 void DebugServices::SetPrefixToCheck(std::string *const prefix_dump_file_name, std::string *const slot_string_to_check,
888 std::string *const dump_style_kernel_name, size_t slot, bool is_output) {
889 std::string dump_style_name_part = *dump_style_kernel_name;
890 dump_style_name_part = GetNodeNameWithoutScope(dump_style_name_part);
891 std::string slot_str;
892 if (is_output) {
893 slot_str = ".output." + std::to_string(slot);
894 } else {
895 slot_str = ".input." + std::to_string(slot);
896 }
897 dump_style_name_part += slot_str;
898 *prefix_dump_file_name = dump_style_name_part;
899 *slot_string_to_check = slot_str;
900 }
901
// Returns the lexicographically greatest path in file_list, or "" when the list is
// empty. The zero-padded timestamp fields in dump file names make lexicographic
// order track recency — presumably why callers treat this as "newest"; confirm
// against the dump writer's naming scheme.
// PERF: uses std::max_element on a const reference instead of copying the whole
// vector and sorting it (O(n) instead of O(n log n) plus a full copy).
std::string GetNewestFilePath(const std::vector<std::string> &file_list) {
  if (file_list.empty()) {
    return "";
  }
  return *std::max_element(file_list.begin(), file_list.end());
}
910
// Extracts the timestamp field from a dump file path. The basename is expected to
// end with five '.'-separated fields after the timestamp's leading dot, e.g.
// "<...>.<timestamp>.<input|output>.<slot>.<format>.npy"; the timestamp is the
// segment between the fifth- and fourth-from-last dots. Returns "" when the name
// has too few fields.
std::string GetTimeStampStr(std::string file_path) {
  // Basename: text after the last '/' (whole string when there is no slash).
  const size_t slash_pos = file_path.rfind("/");
  const std::string base_name = file_path.substr(slash_pos + 1);
  // Locate the last five '.' positions, walking backwards from the end.
  const int kNumDots = 5;
  size_t dot_pos[kNumDots];
  size_t search_from = std::string::npos;
  for (int k = 0; k < kNumDots; ++k) {
    dot_pos[k] = base_name.rfind(".", search_from);
    search_from = dot_pos[k] - 1;
  }
  const size_t fourth_dot = dot_pos[3];
  const size_t fifth_dot = dot_pos[4];
  if (fourth_dot != std::string::npos && fifth_dot != std::string::npos && fourth_dot > fifth_dot) {
    return base_name.substr(fifth_dot + 1, fourth_dot - fifth_dot - 1);
  }
  return "";
}
926
// Reads the dumped tensors identified by the parallel vectors (backend_name, slot,
// device_id, iteration, root_graph_id, is_output) and appends one TensorData per
// request to *result_list. Dispatches to the sync- or async-mode reader depending
// on is_sync_mode_; *no_mem_to_read is set when a tensor exceeds the host-memory
// budget.
void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std::vector<size_t> slot,
                                     std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
                                     std::vector<unsigned int> root_graph_id, const std::vector<bool> &is_output,
                                     const std::vector<std::string> &async_file_pool,
                                     std::vector<std::shared_ptr<TensorData>> *const result_list,
                                     bool *no_mem_to_read) {
  for (unsigned int i = 0; i < backend_name.size(); i++) {
    // form prefix of the tensor file to read from graph pb node name
    std::string dump_style_kernel_name = backend_name[i];

    // remove slot from name (backend names carry a ":slot" suffix)
    std::size_t found_colon = dump_style_kernel_name.find_last_of(":");
    dump_style_kernel_name = dump_style_kernel_name.substr(0, found_colon);

    std::string slot_string_to_check;
    std::string prefix_dump_file_name;
    SetPrefixToCheck(&prefix_dump_file_name, &slot_string_to_check, &dump_style_kernel_name, slot[i], is_output[i]);
    // Scope-less node name, used for matching async (npy) file names.
    std::string prefix_dump_to_check = GetNodeNameWithoutScope(dump_style_kernel_name);

    // Dump layout: <dump_dir>/rank_<device>/<net_name>/<graph>/<iteration>.
    std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id[i]) + "/" + net_name_ + "/" +
                                    std::to_string(root_graph_id[i]) + "/" + IterationString(iteration[i]);

    // search files in dir for the one that meets the filename prefix and read the file into memory
    if (is_sync_mode_) {
      ReadDumpedTensorSync(prefix_dump_file_name, specific_dump_dir, backend_name[i], slot[i], device_id[i],
                           iteration[i], root_graph_id[i], is_output[i], result_list, no_mem_to_read);
    } else {
      ReadDumpedTensorAsync(specific_dump_dir, prefix_dump_to_check, slot_string_to_check, backend_name[i], slot[i],
                            device_id[i], iteration[i], root_graph_id[i], is_output[i], async_file_pool, result_list,
                            no_mem_to_read);
    }
  }
}
960
// If found, loads the newest of the matched npy files and appends a fully populated
// TensorData to *result_list; otherwise appends an empty placeholder entry so
// callers still receive one record per request.
void DebugServices::ReadFileAndAddToTensor(const bool found, const std::vector<std::string> &matched_paths,
                                           const std::string &backend_name, const unsigned int device_id,
                                           const unsigned int root_graph_id, const bool &is_output, size_t slot,
                                           bool *no_mem_to_read, unsigned int iteration,
                                           std::vector<std::shared_ptr<TensorData>> *result_list) {
  std::string time_stamp = "";
  std::string type_name = "";
  uint64_t data_size = 0;
  std::vector<int64_t> shape;
  std::vector<char> *buffer = nullptr;
  if (found) {
    // Several dumps of the same tensor may exist; take the most recent one.
    std::string result_path = GetNewestFilePath(matched_paths);
    time_stamp = GetTimeStampStr(result_path);
    // Cache key uniquely identifies the tensor: name:device:graph:is_output:slot.
    std::string key_name_in_cache = backend_name + ":" + std::to_string(device_id) + ":" +
                                    std::to_string(root_graph_id) + ":" + std::to_string(is_output) + ":" +
                                    std::to_string(slot);
    ReadTensorFromNpy(key_name_in_cache, result_path, &type_name, &data_size, &shape, &buffer, no_mem_to_read);
    AddToTensorData(backend_name, time_stamp, slot, iteration, device_id, root_graph_id, is_output, data_size,
                    type_name, shape, buffer, result_list);
  } else {
    // Placeholder record: zero size, no buffer.
    AddToTensorData(backend_name, time_stamp, slot, iteration, device_id, root_graph_id, is_output, 0, type_name, shape,
                    buffer, result_list);
    MS_LOG(INFO) << "Target tensor has not been found.";
  }
}
986
// Sync-mode read: scans specific_dump_dir for regular files whose stripped name
// (task_id/stream_id/timestamp removed) starts with prefix_dump_file_name, then
// loads the newest match — or a placeholder when none is found — into *result_list.
void DebugServices::ReadDumpedTensorSync(const std::string &prefix_dump_file_name, const std::string &specific_dump_dir,
                                         const std::string &backend_name, size_t slot, const unsigned int device_id,
                                         unsigned int iteration, unsigned int root_graph_id, const bool &is_output,
                                         std::vector<std::shared_ptr<TensorData>> *result_list, bool *no_mem_to_read) {
  std::string abspath = RealPath(specific_dump_dir);
  DIR *d = opendir(abspath.c_str());
  bool found_file = false;
  std::vector<std::string> matched_paths;
  if (d == nullptr) {
    MS_LOG(INFO) << "Directory " << specific_dump_dir << " does not exist!";
  } else {
    struct dirent *dir = nullptr;
    while ((dir = readdir(d)) != nullptr) {
      if (dir->d_type == DT_REG) {
        std::string file_name = dir->d_name;
        std::string stripped_file_name = GetStrippedFilename(file_name);
        if (stripped_file_name.empty()) {
          continue;
        }
        // rfind(prefix, 0) == 0  <=>  stripped name starts with the prefix.
        std::size_t found = stripped_file_name.rfind(prefix_dump_file_name, 0);
        if (found != 0) {
          continue;
        }
        std::string full_path = specific_dump_dir + "/" + file_name;
        matched_paths.push_back(full_path);
        found_file = true;
      }
    }
    (void)closedir(d);
  }
  // Always emits exactly one TensorData (real or placeholder) for this request.
  ReadFileAndAddToTensor(found_file, matched_paths, backend_name, device_id, root_graph_id, is_output, slot,
                         no_mem_to_read, iteration, result_list);
}
1020
ReadDumpedTensorAsync(const std::string & specific_dump_dir,const std::string & prefix_dump_to_check,const std::string & slot_string_to_check,const std::string & backend_name,size_t slot,unsigned int device_id,unsigned int iteration,unsigned int root_graph_id,const bool & is_output,const std::vector<std::string> & async_file_pool,std::vector<std::shared_ptr<TensorData>> * result_list,bool * no_mem_to_read)1021 void DebugServices::ReadDumpedTensorAsync(const std::string &specific_dump_dir, const std::string &prefix_dump_to_check,
1022 const std::string &slot_string_to_check, const std::string &backend_name,
1023 size_t slot, unsigned int device_id, unsigned int iteration,
1024 unsigned int root_graph_id, const bool &is_output,
1025 const std::vector<std::string> &async_file_pool,
1026 std::vector<std::shared_ptr<TensorData>> *result_list, bool *no_mem_to_read) {
1027 bool found = false;
1028 std::vector<std::string> matched_paths;
1029 // if async mode
1030 for (const std::string &file_path : async_file_pool) {
1031 if (file_path.find(specific_dump_dir) != std::string::npos &&
1032 file_path.find(prefix_dump_to_check) != std::string::npos &&
1033 file_path.find(slot_string_to_check) != std::string::npos) {
1034 matched_paths.push_back(file_path);
1035 found = true;
1036 }
1037 }
1038 ReadFileAndAddToTensor(found, matched_paths, backend_name, device_id, root_graph_id, is_output, slot, no_mem_to_read,
1039 iteration, result_list);
1040 }
1041
// Strips the task_id, stream_id and timestamp fields from a sync-dump file name so
// it can be compared with the prefix built by SetPrefixToCheck. The layout implied
// by the arithmetic below is (dots numbered from the front, node name may itself
// contain dots — TODO confirm against the dump writer):
//   <op_type>.<node_name>.<task_id>.<stream_id>.<timestamp>.<io>.<slot>.<format>
// The result is "<node_name>.<io>.<slot>"; "" is returned when the name has too
// few fields.
std::string DebugServices::GetStrippedFilename(const std::string &file_name) {
  // strip off the task_id, stream_id, and timestamp, then compare
  size_t first_dot = file_name.find(".");
  // "seventh"/"fifth" name the dot's position in the canonical layout above; they
  // are located from the back so dots inside the node name do not shift them.
  size_t seventh_dot = file_name.rfind(".", file_name.rfind(".") - 1);
  size_t fifth_dot = file_name.rfind(".", file_name.rfind(".", seventh_dot - 1) - 1);

  if (fifth_dot == std::string::npos || fifth_dot <= first_dot) {
    return std::string();
  }

  // Look for the second dot's position from the back to avoid issue due to dots in the node name.
  size_t second_dot = fifth_dot;
  const int8_t kSecondDotPosition = 2;
  // Step back three more dots (5 -> 4 -> 3 -> 2) to reach the dot that ends the
  // node name.
  for (int8_t pos = 5; pos > kSecondDotPosition; pos--) {
    second_dot = file_name.rfind(".", second_dot - 1);
  }

  if (second_dot == std::string::npos || second_dot <= first_dot) {
    return std::string();
  }

  // start_string: the node name; end_string: ".<io>.<slot>".
  std::string start_string = file_name.substr(first_dot + 1, second_dot - first_dot - 1);
  std::string end_string = file_name.substr(fifth_dot, seventh_dot - fifth_dot);
  std::string stripped_file_name = start_string + end_string;
  return stripped_file_name;
}
1068
ReadNeededDumpedTensors(unsigned int iteration,std::vector<std::string> * const async_file_pool)1069 std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
1070 unsigned int iteration, std::vector<std::string> *const async_file_pool) {
1071 // get a list of nodes and the devices they are on to monitor
1072 std::vector<std::shared_ptr<TensorData>> tensor_list;
1073 std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> device_and_graph_to_nodes;
1074 for (auto w_table_item : watchpoint_table_) {
1075 auto wp = std::get<1>(w_table_item);
1076 unsigned int index = 0;
1077 for (auto check_node : wp.check_node_list) {
1078 std::vector<uint32_t> devices = std::get<1>(wp.check_node_device_list[index]);
1079 std::vector<uint32_t> graphs = std::get<1>(wp.check_node_graph_list[index]);
1080 for (auto device : devices) {
1081 for (auto graph : graphs) {
1082 std::tuple<uint32_t, uint32_t> key(device, graph);
1083 device_and_graph_to_nodes[key].push_back(check_node);
1084 }
1085 }
1086
1087 index++;
1088 }
1089 }
1090
1091 // scan each device/iteration dir for the watched nodes for each device, and add to tensor_list
1092 // as they are found
1093 for (auto const &device_and_graph_item : device_and_graph_to_nodes) {
1094 std::tuple<uint32_t, uint32_t> device_and_graph = device_and_graph_item.first;
1095 uint32_t device_id = std::get<0>(device_and_graph);
1096 uint32_t root_graph_id = std::get<1>(device_and_graph);
1097 std::vector<std::tuple<std::string, bool>> wp_nodes = device_and_graph_item.second;
1098 std::vector<std::tuple<std::string, std::string>> proto_to_dump;
1099
1100 std::string specific_dump_dir = dump_dir_ + "/rank_" + std::to_string(device_id) + "/" + net_name_ + "/" +
1101 std::to_string(root_graph_id) + "/" + IterationString(iteration);
1102
1103 // convert node names to dump style
1104 for (auto node : wp_nodes) {
1105 std::string orig_name = std::get<0>(node);
1106 // Remove the scope from the fully qualified name to compare for both sync and async case.
1107 std::string dump_style_name = GetNodeNameWithoutScope(orig_name);
1108
1109 bool node_is_out = std::get<1>(node);
1110 if (node_is_out) {
1111 dump_style_name += ".output";
1112 } else {
1113 dump_style_name += ".input";
1114 }
1115 if (std::find(proto_to_dump.begin(), proto_to_dump.end(),
1116 std::tuple<std::string, std::string>(orig_name, dump_style_name)) == proto_to_dump.end()) {
1117 proto_to_dump.push_back(std::tuple<std::string, std::string>(orig_name, dump_style_name));
1118 }
1119 }
1120
1121 if (is_sync_mode_) {
1122 std::string abspath = RealPath(specific_dump_dir);
1123 ProcessTensorDataSync(proto_to_dump, abspath, specific_dump_dir, iteration, device_id, root_graph_id,
1124 &tensor_list);
1125 } else {
1126 // convert all files in proto_to_dump to npy and add to pool of async file names
1127 ConvertWatchPointNodes(proto_to_dump, specific_dump_dir, async_file_pool);
1128 GetTensorDataInfoAsync(proto_to_dump, specific_dump_dir, iteration, device_id, root_graph_id, *async_file_pool,
1129 &tensor_list);
1130 }
1131 }
1132
1133 return tensor_list;
1134 }
1135
// Sync-mode scan: walks the dump directory once and, for every file whose stripped
// name matches a watched node in proto_to_dump, appends a placeholder TensorData
// (no payload) carrying the node's name, slot and input/output flag to
// *tensor_list.
void DebugServices::ProcessTensorDataSync(const std::vector<std::tuple<std::string, std::string>> &proto_to_dump,
                                          const std::string &abspath, const std::string &specific_dump_dir,
                                          unsigned int iteration, unsigned int device_id, unsigned int root_graph_id,
                                          std::vector<std::shared_ptr<TensorData>> *const tensor_list) {
  DIR *d = opendir(abspath.c_str());
  if (d == nullptr) {
    MS_LOG(ERROR) << "Directory " << specific_dump_dir.c_str() << " does not exist in ReadNeededDumpedTensors.";
  } else {
    struct dirent *dir = nullptr;
    while ((dir = readdir(d)) != nullptr) {
      if (dir->d_type == DT_REG) {
        std::string file_name = dir->d_name;
        for (auto &node : proto_to_dump) {
          std::string dump_name = std::get<1>(node);  // "<node_name>.output" or "<node_name>.input"

          std::string stripped_file_name = GetStrippedFilename(file_name);
          // The length check also keeps the substr/stoul below in range.
          if (stripped_file_name.empty() || stripped_file_name.length() <= dump_name.length()) {
            continue;
          }
          std::size_t found = stripped_file_name.rfind(dump_name, 0);
          if (found == 0) {
            // Stripped name is "<dump_name>.<slot>"; parse the slot number.
            size_t slot = std::stoul(stripped_file_name.substr(dump_name.length() + 1));
            std::vector<int64_t> shape;
            std::string orig_name = std::get<0>(node);
            std::string output_str = dump_name.substr(dump_name.rfind(".") + 1);
            bool output_flag = (output_str == "output");

            // Placeholder entry: data is read later on demand.
            AddToTensorData(orig_name, "", slot, iteration, device_id, root_graph_id, output_flag, 0, "", shape,
                            nullptr, tensor_list);
            break;
          }
        }
      }
    }
    (void)closedir(d);
  }
}
1173
IterationString(unsigned int iteration)1174 std::string DebugServices::IterationString(unsigned int iteration) {
1175 std::string iteration_string;
1176 bool init_dbg_suspend = (iteration == UINT_MAX);
1177 if (init_dbg_suspend) {
1178 iteration_string = "init";
1179 } else {
1180 iteration_string = std::to_string(iteration);
1181 }
1182 return iteration_string;
1183 }
1184 #endif
1185
ReadNodesTensors(const std::vector<std::string> & name,std::vector<std::string> * const ret_name,std::vector<const char * > * const data_ptr,std::vector<ssize_t> * const data_size,std::vector<unsigned int> * const dtype,std::vector<std::vector<int64_t>> * const shape)1186 void DebugServices::ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *const ret_name,
1187 std::vector<const char *> *const data_ptr, std::vector<ssize_t> *const data_size,
1188 std::vector<unsigned int> *const dtype,
1189 std::vector<std::vector<int64_t>> *const shape) {
1190 std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
1191 tensor_loader_->SearchTensors(name, &result_list);
1192
1193 for (auto result : result_list) {
1194 if (std::get<1>(result) == nullptr) {
1195 continue;
1196 }
1197 (void)ret_name->emplace_back(std::get<0>(result));
1198 (void)data_ptr->emplace_back(reinterpret_cast<const char *>(std::get<1>(result)->GetDataPtr()));
1199 (void)data_size->emplace_back(std::get<1>(result)->GetByteSize());
1200 (void)dtype->emplace_back(std::get<1>(result)->GetType());
1201 (void)shape->emplace_back(std::get<1>(result)->GetShape());
1202 }
1203 }
1204
SearchNodesTensors(const std::vector<std::string> & name,std::vector<std::tuple<std::string,std::shared_ptr<TensorData>>> * result_list)1205 void DebugServices::SearchNodesTensors(const std::vector<std::string> &name,
1206 std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> *result_list) {
1207 if (result_list == nullptr) {
1208 MS_LOG(DEBUG) << "result_list is nullptr.";
1209 return;
1210 }
1211 tensor_loader_->SearchTensors(name, result_list);
1212 }
1213
1214 #ifdef ONLINE_DBG_MODE
IsWatchPoint(const std::string & kernel_name,const CNodePtr & kernel) const1215 bool DebugServices::IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel) const {
1216 bool ret = false;
1217 for (auto w_table_item : watchpoint_table_) {
1218 auto check_node_list = std::get<1>(w_table_item).check_node_list;
1219 for (auto check_node : check_node_list) {
1220 std::string w_name = std::get<0>(check_node);
1221 bool w_type = std::get<1>(check_node);
1222 if ((w_type == true &&
1223 ((kernel_name.find(w_name) != string::npos && kernel_name.rfind(w_name, 0) == 0) || w_name == "*")) ||
1224 (w_type == false && (kernel_name == w_name || IsWatchPointNodeInput(w_name, kernel)))) {
1225 ret = true;
1226 return ret;
1227 }
1228 }
1229 }
1230 return ret;
1231 }
1232
IsWatchPointNodeInput(const std::string & w_name,const CNodePtr & kernel) const1233 bool DebugServices::IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const {
1234 if (kernel != nullptr && w_name.length() > 0) {
1235 auto input_size = AnfAlgo::GetInputTensorNum(kernel);
1236 for (size_t j = 0; j < input_size; ++j) {
1237 auto input_kernel = kernel->input(j + 1);
1238 std::string input_kernel_name = GetKernelNodeName(input_kernel);
1239 auto found = w_name.find_last_of('/');
1240 if (found != std::string::npos && w_name.size() > found && w_name.substr(found + 1) == input_kernel_name)
1241 return true;
1242 }
1243 return false;
1244 } else {
1245 return false;
1246 }
1247 }
1248 #endif
1249
GetTensor() const1250 std::vector<std::shared_ptr<TensorData>> DebugServices::GetTensor() const { return tensor_loader_->GetTensor(); }
1251
EmptyCurrentTensor()1252 void DebugServices::EmptyCurrentTensor() { tensor_loader_->EmptyCurrentTensor(); }
1253
1254 #ifdef ONLINE_DBG_MODE
DumpTensorToFile(const std::string & tensor_name,bool trans_flag,const std::string & filepath,const std::string & host_fmt,const std::vector<int64_t> & host_shape,TypeId host_type,TypeId device_type,const std::string & addr_format,size_t slot) const1255 bool DebugServices::DumpTensorToFile(const std::string &tensor_name, bool trans_flag, const std::string &filepath,
1256 const std::string &host_fmt, const std::vector<int64_t> &host_shape,
1257 TypeId host_type, TypeId device_type, const std::string &addr_format,
1258 size_t slot) const {
1259 return tensor_loader_->DumpTensorToFile(tensor_name, trans_flag, filepath, host_fmt, host_shape, host_type,
1260 device_type, addr_format, slot);
1261 }
1262 #endif
1263
LoadNewTensor(const std::shared_ptr<TensorData> & tensor,bool keep_prev)1264 bool DebugServices::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev) {
1265 return tensor_loader_->LoadNewTensor(tensor, keep_prev);
1266 }
1267
void DebugServices::ResetLoadedTensors() {
  // Resets per-run state: clears the watchpoint-id cache, drops non-parameter
  // tensors from the current map, and invalidates the cached overflow scan
  // results. Parameters are round-tripped through the prev map (moved out,
  // then swapped back) so they survive the reset. Order of calls matters.
  wp_id_cache_.clear();
  MS_LOG(INFO) << "Resetting loaded tensors";
  tensor_loader_->MoveParametersCurrentToPrev();
  tensor_loader_->EmptyCurrentTensor();
  // will move parameters from previous to current map
  tensor_loader_->SwapCurrentPrev();
  overflow_ops_.clear();
}
1277
1278 #ifdef ONLINE_DBG_MODE
GetNodeTensor(const CNodePtr & kernel)1279 std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensor(const CNodePtr &kernel) {
1280 MS_EXCEPTION_IF_NULL(kernel);
1281 std::vector<std::shared_ptr<TensorData>> result;
1282 auto output_size = AnfAlgo::GetOutputTensorNum(kernel);
1283 auto kernel_name = GetKernelNodeName(kernel);
1284 for (size_t j = 0; j < output_size; ++j) {
1285 auto tensor_name_with_slot = kernel_name + ":" + std::to_string(j);
1286 auto tensor = tensor_loader_->GetTensor(tensor_name_with_slot);
1287 if (tensor != nullptr) {
1288 result.push_back(tensor);
1289 }
1290 }
1291 return result;
1292 }
1293 #endif
1294
CheckOpOverflow(std::string node_name_to_find,unsigned int device_id,unsigned int root_graph_id,unsigned int iteration)1295 bool DebugServices::CheckOpOverflow(std::string node_name_to_find, unsigned int device_id, unsigned int root_graph_id,
1296 unsigned int iteration) {
1297 std::replace(node_name_to_find.begin(), node_name_to_find.end(), '/', '_');
1298 std::vector<std::string> op_names;
1299 std::string overflow_bin_path;
1300
1301 #ifdef ONLINE_DBG_MODE
1302 if (DumpJsonParser::GetInstance().path().empty()) {
1303 // Dump config is not set.
1304 return false;
1305 }
1306 auto debugger = Debugger::GetInstance();
1307 overflow_bin_path = DumpJsonParser::GetInstance().GetOpOverflowBinPath(debugger->GetGraphPtr()->root_graph_id());
1308 auto realpath = FileUtils::GetRealPath(overflow_bin_path.c_str());
1309 if (!realpath.has_value()) {
1310 MS_LOG(INFO) << "Get real path failed for overflow_bin_path.";
1311 return false;
1312 }
1313 overflow_bin_path = realpath.value() + '/';
1314 #else
1315 overflow_bin_path = dump_dir_ + "/rank_" + std::to_string(device_id) + "/" + net_name_ + "/" +
1316 std::to_string(root_graph_id) + "/" + IterationString(iteration) + "/";
1317 overflow_bin_path = RealPath(overflow_bin_path);
1318 #endif
1319
1320 overflow_wp_lock_.lock();
1321
1322 MS_LOG(INFO) << "Searching for overflow in node " << node_name_to_find;
1323 auto found_overflows = overflow_ops_.find(overflow_bin_path);
1324 if (found_overflows != overflow_ops_.end()) {
1325 MS_LOG(INFO) << "Found already computed overflows for " << overflow_bin_path;
1326 op_names = overflow_ops_[overflow_bin_path];
1327 } else {
1328 std::map<std::pair<uint64_t, uint64_t>, std::string> task_stream_to_opname;
1329 std::vector<std::pair<uint64_t, uint64_t>> task_stream_hit;
1330 const std::string overflow_file_prefix = "Opdebug.Node_OpDebug.";
1331
1332 MS_LOG(INFO) << "Processing bin file path " << overflow_bin_path;
1333
1334 std::string abspath = RealPath(overflow_bin_path);
1335 DIR *d = opendir(abspath.c_str());
1336 if (d == nullptr) {
1337 MS_LOG(ERROR) << "OverFlow bin directory does not exist!";
1338 } else {
1339 struct dirent *dir = nullptr;
1340 while ((dir = readdir(d)) != nullptr) {
1341 if (dir->d_type == DT_REG) {
1342 // form fully qualified filename
1343 std::string file_path = overflow_bin_path;
1344 std::string file_name = dir->d_name;
1345 (void)file_path.append(file_name);
1346 // attempt to read the file
1347 std::ifstream infile;
1348 infile.open(file_path.c_str(), std::ios::ate | std::ios::binary | std::ios::in);
1349 if (!infile.is_open()) {
1350 MS_LOG(ERROR) << "Failed to open overflow bin file " << file_name << " Errno:" << errno;
1351 continue;
1352 }
1353
1354 std::string node_name;
1355 uint64_t task_id = 0;
1356 uint64_t stream_id = 0;
1357 // detect overflow bin file
1358 if (file_name.rfind(overflow_file_prefix, 0) == 0) {
1359 // start of op overflow data in bin file
1360 const uint32_t offset = 321;
1361 (void)infile.seekg(offset, std::ios::beg);
1362 std::vector<char> buffer;
1363 // size of op overflow info section
1364 const size_t buf_size = 256;
1365 buffer.resize(buf_size);
1366 (void)infile.read(buffer.data(), buf_size);
1367 if (infile.gcount() != buf_size) {
1368 MS_LOG(ERROR) << "The file: " << file_path << "may be damaged!";
1369 continue;
1370 }
1371 const uint8_t stream_id_offset = 16;
1372 const uint8_t task_id_offset = 24;
1373 // The stream_id and task_id in the dump file are 8 byte fields for extensibility purpose, but only hold 4
1374 // byte values currently.
1375 stream_id = BytestoUInt64(std::vector<char>(buffer.begin() + stream_id_offset, buffer.end()));
1376 task_id = BytestoUInt64(std::vector<char>(buffer.begin() + task_id_offset, buffer.end()));
1377 MS_LOG(INFO) << "Overflow bin file " << file_name << ", task_id " << task_id << ", stream_id " << stream_id
1378 << ".";
1379 task_stream_hit.push_back(std::make_pair(task_id, stream_id));
1380 } else {
1381 // regular bin file
1382 bool success_parse = GetAttrsFromAsyncFilename(file_name, &node_name, &task_id, &stream_id);
1383 if (success_parse) {
1384 task_stream_to_opname[std::make_pair(task_id, stream_id)] = node_name;
1385 }
1386 }
1387 infile.close();
1388 }
1389 }
1390 (void)closedir(d);
1391 }
1392
1393 // find the op_names with an overflow hit
1394 for (auto &task_stream : task_stream_hit) {
1395 auto op_name = task_stream_to_opname[task_stream];
1396 if (!op_name.empty()) {
1397 MS_LOG(INFO) << "Operation overflow detected in " << op_name;
1398 op_names.push_back(op_name);
1399 }
1400 }
1401
1402 overflow_ops_[overflow_bin_path] = op_names;
1403 }
1404
1405 overflow_wp_lock_.unlock();
1406
1407 // determine if overflow wp has been triggered for node_name_to_find
1408 if (find(op_names.begin(), op_names.end(), node_name_to_find) != op_names.end()) {
1409 MS_LOG(INFO) << "Operation overflow watchpoint triggered for " << node_name_to_find;
1410 return true;
1411 }
1412
1413 return false;
1414 }
1415
GetAttrsFromAsyncFilename(const std::string & file_name,std::string * const node_name,uint64_t * task_id,uint64_t * stream_id)1416 bool DebugServices::GetAttrsFromAsyncFilename(const std::string &file_name, std::string *const node_name,
1417 uint64_t *task_id, uint64_t *stream_id) {
1418 // get the node_name, task_id, and stream_id from async dump filename
1419 // node_type.node_name.task_id.stram_id.timestamp
1420 // WARNING: node_name may have dots in it
1421 size_t fourth_dot = file_name.rfind(".");
1422 size_t third_dot = file_name.rfind(".", fourth_dot - 1);
1423 size_t second_dot = file_name.rfind(".", third_dot - 1);
1424 size_t first_dot = file_name.find(".");
1425
1426 // check if dots were found
1427 if (first_dot == std::string::npos || second_dot == std::string::npos || third_dot == std::string::npos ||
1428 fourth_dot == std::string::npos) {
1429 return false;
1430 }
1431
1432 // check if its not an async bin file
1433 if (file_name.substr(fourth_dot) == ".npy") {
1434 return false;
1435 }
1436
1437 // get node_name
1438 if (first_dot < second_dot) {
1439 *node_name = file_name.substr(first_dot + 1, second_dot - first_dot - 1);
1440 } else {
1441 MS_LOG(ERROR) << "Async filename parse error to get node_name.";
1442 return false;
1443 }
1444
1445 // get task id
1446 if (second_dot < third_dot) {
1447 std::string extracted_task_id = file_name.substr(second_dot + 1, third_dot - second_dot - 1);
1448 try {
1449 *task_id = std::stoull(extracted_task_id);
1450 } catch (std::invalid_argument &e) {
1451 MS_LOG(ERROR) << "stoull failed on extracted_task_id to get task_id, invalid argument.";
1452 return false;
1453 } catch (std::out_of_range &e) {
1454 MS_LOG(ERROR) << "stoull failed on extracted_task_id to get task_id, out of range.";
1455 return false;
1456 }
1457 } else {
1458 MS_LOG(ERROR) << "Async filename parse error to get task_id.";
1459 return false;
1460 }
1461
1462 // get stream id
1463 if (third_dot < fourth_dot) {
1464 std::string extracted_stream_id = file_name.substr(third_dot + 1, fourth_dot - third_dot - 1);
1465 try {
1466 *stream_id = std::stoull(extracted_stream_id);
1467 } catch (std::invalid_argument &e) {
1468 MS_LOG(ERROR) << "stoull failed on extracted_stream_id to get stream_id, invalid argument.";
1469 return false;
1470 } catch (std::out_of_range &e) {
1471 MS_LOG(ERROR) << "stoull failed on extracted_stream_id to get stream_id, out of range.";
1472 return false;
1473 }
1474 } else {
1475 MS_LOG(ERROR) << "Async filename parse error to get stream_id.";
1476 return false;
1477 }
1478
1479 return true;
1480 }
1481
RealPath(const std::string & input_path)1482 std::string DebugServices::RealPath(const std::string &input_path) {
1483 if (input_path.length() >= PATH_MAX) {
1484 MS_LOG(EXCEPTION) << "The length of path: " << input_path << " exceeds limit: " << PATH_MAX;
1485 }
1486
1487 size_t path_split_pos = input_path.find_last_of('/');
1488
1489 // get real path
1490 char real_path[PATH_MAX] = {0};
1491
1492 // input_path is dir + file_name
1493 if (path_split_pos != std::string::npos) {
1494 std::string prefix_path = input_path.substr(0, path_split_pos);
1495 std::string file_name = input_path.substr(path_split_pos);
1496
1497 if (file_name.length() > NAME_MAX) {
1498 MS_LOG(EXCEPTION) << "The length of file name : " << file_name.length() << " exceeds limit: " << NAME_MAX;
1499 }
1500 if (realpath(prefix_path.c_str(), real_path) == nullptr) {
1501 MS_LOG(ERROR) << "The dir " << prefix_path << " does not exist.";
1502 return "";
1503 }
1504
1505 return std::string(real_path) + file_name;
1506 }
1507
1508 // input_path is only file_name
1509 if (input_path.length() > NAME_MAX) {
1510 MS_LOG(EXCEPTION) << "The length of file name : " << input_path.length() << " exceeds limit: " << NAME_MAX;
1511 }
1512 if (realpath(input_path.c_str(), real_path) == nullptr) {
1513 MS_LOG(INFO) << "The file " << input_path << " does not exist, it will be created.";
1514 }
1515
1516 return std::string(real_path);
1517 }
1518
BytestoUInt64(const std::vector<char> & buffer)1519 uint64_t DebugServices::BytestoUInt64(const std::vector<char> &buffer) {
1520 return le64toh(*reinterpret_cast<const uint64_t *>(buffer.data()));
1521 }
1522
TensorExistsInCurrent(const std::string & tensor_name)1523 bool DebugServices::TensorExistsInCurrent(const std::string &tensor_name) {
1524 return tensor_loader_->TensorExistsInCurrent(tensor_name);
1525 }
MoveTensorCurrentToPrev(const std::string & tensor_name)1526 void DebugServices::MoveTensorCurrentToPrev(const std::string &tensor_name) {
1527 tensor_loader_->MoveTensorCurrentToPrev(tensor_name);
1528 }
1529
AppendToCacheEvictQueue(const std::string & tensor_name)1530 void DebugServices::AppendToCacheEvictQueue(const std::string &tensor_name) {
1531 if (tensor_loader_->EnableMemoryControl()) {
1532 tensor_loader_->AppendToCacheEvictQueue(tensor_name);
1533 }
1534 }
1535
SetNetName(std::string net_name)1536 void DebugServices::SetNetName(std::string net_name) { this->net_name_ = net_name; }
1537
GetNetName()1538 std::string DebugServices::GetNetName() { return net_name_; }
1539
SetDumpDir(std::string dump_dir)1540 void DebugServices::SetDumpDir(std::string dump_dir) { this->dump_dir_ = dump_dir; }
1541
GetDumpDir()1542 std::string DebugServices::GetDumpDir() { return dump_dir_; }
1543
SetSyncMode(bool is_sync_mode)1544 void DebugServices::SetSyncMode(bool is_sync_mode) { this->is_sync_mode_ = is_sync_mode; }
1545
GetSyncMode()1546 bool DebugServices::GetSyncMode() { return is_sync_mode_; }
1547
SetMemLimit(uint64_t max_mem_size)1548 void DebugServices::SetMemLimit(uint64_t max_mem_size) { tensor_loader_->SetMemTotal(max_mem_size); }
1549
1550 #ifdef ONLINE_DBG_MODE
1551 } // namespace mindspore
1552 #endif
1553