/**
 * Copyright 2020-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "include/backend/debug/debugger/debugger.h"
#include <dirent.h>
#include <tuple>
#include <vector>
#include <algorithm>
#include <iostream>
#include <map>
#include <regex>
#include "include/backend/debug/data_dump/dump_json_parser.h"
#include "backend/common/session/session_basic.h"
#include "include/backend/anf_runtime_algorithm.h"
#include "include/common/utils/anfalgo.h"
#include "runtime/device/kernel_runtime.h"
#include "include/backend/debug/data_dump/e2e_dump.h"
#include "include/common/utils/config_manager.h"
#include "include/common/debug/env_config_parser.h"
#include "include/common/utils/comm_manager.h"
#include "runtime/hardware/device_context_manager.h"
#include "include/common/debug/anf_ir_dump.h"
#include "include/common/debug/anf_dump_utils.h"
#include "runtime/graph_scheduler/device_tensor_store.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/proto_exporter.h"
#endif
#include "include/backend/debug/debugger/proto_exporter.h"
#include "debug/debugger/debugger_utils.h"
#include "debug/debugger/grpc_client.h"
#include "debug/debug_services.h"
#include "runtime/device/ms_device_shape_transfer.h"

using debugger::Chunk;
using debugger::EventReply;
using debugger::GraphProto;
using debugger::ModelProto;
using debugger::Statistics;
using debugger::TensorProto;
using debugger::WatchCondition;
using debugger::WatchCondition_Condition_inf;
using debugger::WatchCondition_Condition_nan;
using debugger::WatchCondition_Parameter;
using debugger::WatchNode;
using debugger::WatchpointHit;
using mindspore::runtime::DeviceTensorStore;

namespace mindspore {
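// g_chunk_size caps each gRPC message chunk at 3 MB; heartbeat_period_second is
// the interval, in seconds, between heartbeats sent to the debugger server.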
static constexpr auto g_chunk_size = 1024 * 1024 * 3;
static constexpr int32_t heartbeat_period_second = 30;

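// Illustrative usage (assumed; not part of this file): the Debugger singleton is
// typically fetched and initialized once per device before graph execution, e.g.
//   auto debugger = Debugger::GetInstance();
//   debugger->Init(device_id, device_target);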
std::shared_ptr<Debugger> Debugger::GetInstance() {
  std::lock_guard<std::mutex> i_lock(instance_lock_);
  if (debugger_ == nullptr) {
    debugger_ = std::shared_ptr<Debugger>(new (std::nothrow) Debugger());
  }
  return debugger_;
}

Debugger::Debugger()
    : grpc_client_(nullptr),
      debug_services_(nullptr),
      heartbeat_thread_(nullptr),
      device_id_(0),
      device_target_(""),
      num_step_(0),
      debugger_enabled_(false),
      suspended_at_last_kernel_(false),
      run_level_(""),
      node_name_(""),
      cur_name_(""),
      training_done_(false),
      send_metadata_done_(false),
      received_new_graph_(false),
      is_dataset_graph_(false),
      partial_memory_(false),
      initial_suspend_(true),
      enable_heartbeat_(false),
      not_dataset_graph_sum_(0),
      ascend_kernel_by_kernel_(false),
      enable_debugger_called_(false),
      version_("") {
  CheckDebuggerEnabledParam();
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  MS_LOG(INFO) << "Debugger got device_target: " << device_target;
  if (!CheckDebuggerEnabled()) {
    return;
  } else if (device_target == kCPUDevice) {
    MS_LOG(WARNING) << "Not enabling debugger. Debugger does not support CPU.";
  } else {
    // configure partial memory reuse
    partial_memory_ = CheckDebuggerPartialMemoryEnabled();

    // switch memory reuse on or off
    EnvConfigParser::GetInstance().SetSysMemreuse(partial_memory_);
    // print some message about memory reuse to user
    if (partial_memory_) {
      MS_LOG(WARNING)
        << "Partial Memory Reuse is enabled. Note: 1. Please only set watchpoints before running the first "
           "step. 2. Tensor values are only available for nodes that are watched by any watchpoint.";
    } else {
      MS_LOG(WARNING)
        << "Memory Reuse is disabled. Set environment variable MS_DEBUGGER_PARTIAL_MEM=1 to reduce memory "
           "usage for large models.";
    }
  }
}

void Debugger::Init(const uint32_t device_id, const std::string device_target) {
  // access lock for public method
  std::lock_guard<std::mutex> a_lock(access_lock_);
  // save device_id
  MS_LOG(INFO) << "Debugger got device_id: " << device_id;
  device_id_ = device_id;
  MS_LOG(INFO) << "Debugger got device_target: " << device_target;
  device_target_ = device_target;
  version_ = MSVERSION;
}

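// A TypeId is debugger-supported iff it is a numeric type strictly between
// kNumberTypeBegin and kNumberTypeEnd, excluding kNumberTypeComplex64.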
bool IsTypeDebuggerSupported(TypeId type) {
  if (type < TypeId::kNumberTypeEnd && type > TypeId::kNumberTypeBegin && type != kNumberTypeComplex64) {
    return true;
  }
  MS_LOG(INFO) << "Debugger does not support type: " << TypeIdLabel(type);
  return false;
}

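/*
 * Illustrative environment setup read by EnableDebugger() below (shell example;
 * host/port values assumed):
 *   export ENABLE_MS_DEBUGGER=1
 *   export MS_DEBUGGER_HOST=127.0.0.1  # optional, defaults to localhost
 *   export MS_DEBUGGER_PORT=50051      # optional, defaults to 50051
 */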
void Debugger::EnableDebugger() {
  // reset some of the class members
  num_step_ = 0;
  debugger_enabled_ = false;
  enable_heartbeat_ = false;
  partial_memory_ = false;
  grpc_client_ = nullptr;
  debug_services_ = nullptr;
  heartbeat_thread_ = nullptr;
  enable_debugger_called_ = true;

  // see if dump using debugger backend is enabled
  bool dump_enabled = CheckDebuggerDumpEnabled();
  MS_LOG(INFO) << "dump using debugger backend = " << dump_enabled;

  // check if debugger enabled
  debugger_enabled_ = CheckDebuggerEnabled();
  MS_LOG(INFO) << "debugger_enabled_ = " << debugger_enabled_;

  if (!debugger_enabled_ && !dump_enabled) {
    MS_LOG(INFO) << "Not enabling debugger. Set environment variable ENABLE_MS_DEBUGGER=1 to enable debugger.";
    return;
  }

  if (debugger_enabled_) {
    // configure grpc host
    std::string env_host_str = common::GetEnv("MS_DEBUGGER_HOST");
    std::string host;
    if (!env_host_str.empty()) {
      if (CheckIp(env_host_str)) {
        MS_LOG(INFO) << "Getenv MS_DEBUGGER_HOST: " << env_host_str;
        host = env_host_str;
      } else {
        debugger_enabled_ = false;
        MS_EXCEPTION(ValueError) << "Environment variable MS_DEBUGGER_HOST isn't a valid IP address. "
                                    "Please set environment variable MS_DEBUGGER_HOST=x.x.x.x to a valid IP";
      }
    } else {
      MS_LOG(INFO) << "Environment variable MS_DEBUGGER_HOST doesn't exist. Using default debugger host: localhost";
      host = "localhost";
    }
    // configure grpc port
    std::string env_port_str = common::GetEnv("MS_DEBUGGER_PORT");
    std::string port;
    if (!env_port_str.empty()) {
      if (CheckPort(env_port_str)) {
        MS_LOG(INFO) << "Getenv MS_DEBUGGER_PORT: " << env_port_str;
        port = env_port_str;
      } else {
        debugger_enabled_ = false;
        MS_EXCEPTION(ValueError) << "Environment variable MS_DEBUGGER_PORT is not valid. Please set a custom port "
                                    "in the range 1 to 65535.";
      }
    } else {
      port = "50051";
      if (!CheckPort(port)) {
        MS_EXCEPTION(ValueError) << "Default MS_DEBUGGER_PORT is not valid. Please set a custom port in the range "
                                    "1 to 65535.";
      }
      MS_LOG(INFO) << "Environment variable MS_DEBUGGER_PORT doesn't exist. Using default debugger port: 50051";
    }
    // initialize grpc client
    grpc_client_ = std::make_unique<GrpcClient>(host, port);
    // initialize sending heartbeat
    heartbeat_thread_ = std::make_unique<std::thread>([this]() { SendHeartbeat(heartbeat_period_second); });
  }
  debug_services_ = std::make_unique<DebugServices>();
}

void Debugger::CheckDatasetSinkMode(const KernelGraphPtr &graph_ptr) {
  bool sink_mode =
    ConfigManager::GetInstance().dataset_mode() == DatasetMode::DS_SINK_MODE || graph_ptr->IsDatasetGraph();
  if (CheckDebuggerDumpEnabled() && sink_mode && device_target_ == kGPUDevice) {
    MS_EXCEPTION(NotSupportError)
      << "e2e_dump is not supported on GPU with dataset_sink_mode=True. Please set dataset_sink_mode=False";
  }

  if (CheckDebuggerEnabled() && sink_mode) {
    MS_EXCEPTION(NotSupportError)
      << "Debugger is not supported with dataset_sink_mode=True. Please set dataset_sink_mode=False";
  }
}

bool Debugger::CheckDebuggerDumpEnabled() const {
  // see if dump is enabled
  auto &dump_json_parser = DumpJsonParser::GetInstance();
  if (device_target_ == kGPUDevice) {
    return dump_json_parser.e2e_dump_enabled();
  } else if (device_target_ == kAscendDevice) {
    return dump_json_parser.async_dump_enabled() || dump_json_parser.e2e_dump_enabled();
  }
  return false;
}

bool Debugger::CheckDebuggerEnabled() const {
  // get env variables to configure debugger
  std::string env_enable_str = common::GetEnv("ENABLE_MS_DEBUGGER");
  if (!env_enable_str.empty()) {
    (void)std::transform(env_enable_str.begin(), env_enable_str.end(), env_enable_str.begin(), ::tolower);
    if ((env_enable_str == "1" || env_enable_str == "true") && device_target_ != kCPUDevice) {
      return true;
    }
  }
  return false;
}

void Debugger::CheckDebuggerEnabledParam() const {
  // check the value of env variable ENABLE_MS_DEBUGGER
  std::string env_enable_str = common::GetEnv("ENABLE_MS_DEBUGGER");
  if (!env_enable_str.empty()) {
    (void)std::transform(env_enable_str.begin(), env_enable_str.end(), env_enable_str.begin(), ::tolower);
    if (env_enable_str != "0" && env_enable_str != "1" && env_enable_str != "false" && env_enable_str != "true") {
      MS_LOG(WARNING) << "Env variable ENABLE_MS_DEBUGGER should be True/False/1/0 (case insensitive), but got: "
                      << env_enable_str;
    }
  }
}

bool Debugger::CheckDebuggerPartialMemoryEnabled() const {
  std::string env_partial_mem_str = common::GetEnv("MS_DEBUGGER_PARTIAL_MEM");
  if (!env_partial_mem_str.empty()) {
    MS_LOG(INFO) << "Getenv MS_DEBUGGER_PARTIAL_MEM: " << env_partial_mem_str;
    if (env_partial_mem_str == "1") {
      return true;
    }
  }
  return false;
}

/*
 * Feature group: Dump, Online debugger.
 * Target device group: Ascend, GPU.
 * Runtime category: Old runtime, MindRT.
 * Description: Returns true if online debugger or dump is enabled.
 */
bool Debugger::DebuggerBackendEnabled() const { return CheckDebuggerDumpEnabled() || CheckDebuggerEnabled(); }

void Debugger::Reset() {
  // access lock for public method
  std::lock_guard<std::mutex> a_lock(access_lock_);
  // reset components
  if (heartbeat_thread_ && heartbeat_thread_->joinable()) {
    SetEnableHeartbeat(false);
    heartbeat_thread_->join();
    MS_LOG(INFO) << "Join Heartbeat thread.";
  }
  heartbeat_thread_ = nullptr;
  device_id_ = 0;
  device_target_ = "";
  num_step_ = 0;
  debugger_enabled_ = false;
  is_dataset_graph_ = false;
  partial_memory_ = false;
  graph_ptr_ = nullptr;
  grpc_client_ = nullptr;
  debug_services_ = nullptr;
  graph_proto_list_.clear();
  graph_ptr_list_.clear();
  graph_ptr_step_vec_.clear();
  executed_graph_ptr_set_.clear();
  parameters_mindRT_.clear();
  visited_root_graph_ids_.clear();
  MS_LOG(INFO) << "Release Debugger resource.";
}

/*
 * Feature group: Dump, Online debugger.
 * Target device group: Ascend, GPU.
 * Runtime category: MindRT.
 * Description: Sets root_graph_id for all the graphs in the compiled graph list. Sets cur_root_graph_id_ and
 * prev_root_graph_id_ and calls PreExecute function for all the graphs.
 */
void Debugger::PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs,
                                       const std::vector<AnfNodePtr> &origin_parameters_order) {
  // MindRTBackend for GPU and Ascend
  if (device_target_ == kCPUDevice) {
    return;
  }
  // Store graphs that are run in one step.
  graph_ptr_step_vec_ = graphs;
  parameters_mindRT_ = origin_parameters_order;
  prev_root_graph_id_ = cur_root_graph_id_;
  // set first run graph as the root graph
  cur_root_graph_id_ = graph_ptr_step_vec_[0]->graph_id();
  MS_LOG(DEBUG) << "Current root graph id: " << cur_root_graph_id_ << " prev_root_graph_id_: " << prev_root_graph_id_
                << " for step: " << num_step_ << ".";
  MS_LOG(DEBUG) << "Set root graph for all the subgraphs:";
  for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) {
    const auto &graph = graphs[graph_index];
    // set root graph id for GPU mindrt runtime.
    MS_LOG(INFO) << "Set root graph for graph: " << graph->graph_id() << " to: " << cur_root_graph_id_ << ".";
    graph->set_root_graph_id(cur_root_graph_id_);
    if (debugger_) {
      debugger_->PreExecute(graph);
    }
  }
}

/*
 * Feature group: Dump.
 * Target device group: Ascend.
 * Runtime category: Old runtime, MindRT.
 * Description: When async dump is enabled and dataset_sink_mode is true, graph_iter_num_map_ stores the number of
 * iterations per epoch for each running graph.
 */
void Debugger::UpdateGraphIterMap(uint32_t graph_id, int32_t iter_num) {
  if (graph_iter_num_map_.find(graph_id) == graph_iter_num_map_.end()) {
    graph_iter_num_map_[graph_id] = iter_num;
  }
}

/*
 * Feature group: Dump, Online debugger.
 * Target device group: Ascend.
 * Runtime category: Old runtime.
 * Description: For Ascend old runtime, this function sets the current and previous root graph id.
 */
void Debugger::SetCurrentAndPrevRootGraph(uint32_t root_graph_id) {
  // for GPU and Ascend MindRT, root graphs are set in PreExecuteGraphDebugger.
  if (device_target_ != kAscendDevice || MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
    return;
  }
  prev_root_graph_id_ = cur_root_graph_id_;
  cur_root_graph_id_ = root_graph_id;
  MS_LOG(DEBUG) << "Current root graph id: " << cur_root_graph_id_ << " prev_root_graph_id_: " << prev_root_graph_id_
                << " for step: " << num_step_ << ".";
}

/*
 * Feature group: Dump, Online debugger.
 * Target device group: GPU.
 * Runtime category: Old runtime.
 * Description: In the case of GPU old runtime and when we have multiple subgraphs, we use the first run graph id to
 * update the step number.
 */
void Debugger::StoreRunGraphIdList(uint32_t graph_id) {
  // collect rungraph_ids to update step number in multigraph case for GPU old runtime
  if (rungraph_id_list_.empty()) {
    rungraph_id_list_.push_back(graph_id);
  } else {
    if (std::find(rungraph_id_list_.begin(), rungraph_id_list_.end(), graph_id) == rungraph_id_list_.end()) {
      rungraph_id_list_.push_back(graph_id);
    }
  }
}

/*
 * Feature group: Dump, Online debugger.
 * Target device group: Ascend, GPU.
 * Runtime category: Old runtime, MindRT.
 * Description: Sets previous and current root_graph_id for Ascend old runtime, sends graphs to online debugger when
 * debugger_enabled_ is true.
 */
void Debugger::PreExecute(const KernelGraphPtr &graph_ptr) {
  MS_EXCEPTION_IF_NULL(graph_ptr);
  // access lock for public method
  std::lock_guard<std::mutex> a_lock(access_lock_);
  if (!MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
    // Checking dataset_sink_mode for mindRT is done in debug_actor
    CheckDatasetSinkMode(graph_ptr);
  }
  auto graph_id = graph_ptr->graph_id();
  MS_LOG(DEBUG) << "PreExecute for graph: " << graph_id << " in step: " << num_step_ << ".";
  StoreRunGraphIdList(graph_id);
  SetCurrentAndPrevRootGraph(graph_ptr->root_graph_id());
  // multiple graphs
  if (graph_proto_list_.size() > 1) {
    // more than one graph is not a dataset graph
    if (not_dataset_graph_sum_ > 0) {
      SendMultiGraphsAndClear(graph_ptr);
    }
  } else if (graph_proto_list_.size() == 1) {
    // single graph, and not the initial step
    if (device_target_ == kGPUDevice && !MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT) &&
        num_step_ != 0) {
      if (debugger_enabled_ && !(run_level_ == "node" && suspended_at_last_kernel_)) {
        CommandLoop();
      }
      debug_services_->ResetLoadedTensors();
    }
    // In single graph case, reset graph_ptr_ to be nullptr when debugger receives a new graph
    if (received_new_graph_) {
      graph_ptr_ = nullptr;
      CheckGraphPtr(graph_ptr);
    }
  } else if (debugger_enabled_ && graph_id == rungraph_id_list_.front() && device_target_ == kGPUDevice &&
             !MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
    // Multiple graphs, and not the initial step:
    // stop only when we receive the first sub run graph for each step for old runtime.
    // If we have stopped for the last kernel before, no need to stop again.
    if (Common::GetDebugTerminate()) {
      return;
    }
    if (!(run_level_ == "node" && suspended_at_last_kernel_)) {
      CommandLoop();
    }
    debug_services_->ResetLoadedTensors();
  }
  // resets for the new graph
  suspended_at_last_kernel_ = false;
}

/*
 * Feature group: Online debugger.
 * Target device group: Ascend, GPU.
 * Runtime category: Old runtime, MindRT.
 * Description: Sends all the subgraphs to online debugger when debugger_enabled_ is true.
 */
void Debugger::SendMultiGraphsAndClear(const KernelGraphPtr &graph_ptr) {
  // only try to enable debugger if they are not all dataset graphs
  if (!enable_debugger_called_) {
    EnableDebugger();
  }
  if (debugger_enabled_) {
    // only send compiled graphs once at the initial step.
    auto dbg_graph_ptr = graph_ptr_;
    // use current graph ptr to load parameters
    graph_ptr_ = graph_ptr;
    LoadParametersAndConst();
    // revert graph ptr to original value
    graph_ptr_ = dbg_graph_ptr;

    SendMultiGraphsAndSuspend(graph_proto_list_);

    graph_proto_list_.clear();
    received_new_graph_ = false;
  }
}

/*
 * Feature group: Dump.
 * Target device group: Ascend, GPU.
 * Runtime category: MindRT.
 * Description: Returns the rank_id for GPU and Ascend kernel-by-kernel MindRT.
 */
uint32_t Debugger::GetRankID() {
  uint32_t rank_id = GetRankId();
  return rank_id;
}

/*
 * Feature group: Dump.
 * Target device group: Ascend, GPU.
 * Runtime category: MindRT.
 * Description: When dump is enabled, this function: 1) Dumps parameters for the current root_graph_id to the
 * root_graph's directory. 2) Dumps constant data once for each graph. 3) Dumps graph run history for each graph.
 */
void Debugger::DumpParamsAndConstAndHistory() {
  if (!CheckDebuggerDumpEnabled()) {
    return;
  }
  LoadParametersAllGraphs();
  E2eDump::DumpParametersData(GetRankID(), debugger_.get());
  // Whether constant data was already dumped for the current root graph.
  bool cur_root_graph_checked = std::find(visited_root_graph_ids_.begin(), visited_root_graph_ids_.end(),
                                          cur_root_graph_id_) != visited_root_graph_ids_.end();
  for (auto graph : graph_ptr_step_vec_) {
    if (!cur_root_graph_checked) {
      LoadConstsForGraph(graph);
      // Dump constant data for GPU.
      E2eDump::DumpConstantData(graph.get(), GetRankID(), debugger_.get());
      // Dump constant data for Ascend.
      DumpConstantDataAscend(graph);
    }
  }
  for (auto kernel_graph = executed_graph_ptr_set_.cbegin(); kernel_graph != executed_graph_ptr_set_.cend();
       ++kernel_graph) {
    auto debugger = Debugger::GetInstance();
    MS_EXCEPTION_IF_NULL(debugger);
    // Dump graph run history for each graph.
    if (debugger->GetAscendKernelByKernelFlag() && (*kernel_graph)->graph_id() != (*kernel_graph)->root_graph_id()) {
      MS_LOG(INFO) << "Current graph with graph_id = " << (*kernel_graph)->graph_id() << " is not a root graph.";
    } else {
      E2eDump::DumpRunIter(*kernel_graph, GetRankID());
    }
  }
  if (!cur_root_graph_checked) {
    visited_root_graph_ids_.push_back(cur_root_graph_id_);
  }
}

void Debugger::DumpConstantDataAscend(const KernelGraphPtr &graph) {
  if (device_target_ != kAscendDevice) {
    return;
  }
  auto &json_parser = DumpJsonParser::GetInstance();
  if (json_parser.e2e_dump_enabled() || json_parser.async_dump_enabled()) {
    // Dump constant data for Ascend mindRT; for old runtime, constant data is dumped in session_basic.
    uint32_t rank_id = GetRankID();
    std::string cst_file_dir = GenerateDumpPath(graph->root_graph_id(), rank_id, true);
    DumpConstantInfo(graph, cst_file_dir);
  }
}

/*
 * Feature group: Dump.
 * Target device group: Ascend, GPU.
 * Runtime category: MindRT.
 * Description: Dumps a single node for given graph_id.
 */
void Debugger::DumpSingleNode(const CNodePtr &node, uint32_t graph_id) const {
  if (debugger_ && debugger_->DebuggerBackendEnabled()) {
    uint32_t rank_id = GetRankID();
    (void)E2eDump::DumpSingleNodeData(node, graph_id, rank_id, debugger_.get());
  }
}

/*
 * Feature group: Dump.
 * Target device group: GPU.
 * Runtime category: MindRT.
 * Description: This function is used for the new GPU runtime using MindRTBackend; on the Ascend platform, graphs
 * are saved in session_basic.
 */
void Debugger::DumpInGraphCompiler(const KernelGraphPtr &kernel_graph) {
  if (device_target_ == kAscendDevice) {
    return;
  }
  auto &json_parser = DumpJsonParser::GetInstance();
  if (json_parser.e2e_dump_enabled()) {
    uint32_t rank_id = GetRankID();
    kernel_graph->set_root_graph_id(kernel_graph->graph_id());
    std::string final_graph = "trace_code_graph_" + std::to_string(kernel_graph->graph_id());
    std::string root_dir = json_parser.path() + "/rank_" + std::to_string(rank_id);
    std::string target_dir = root_dir + "/graphs";
    std::string ir_file_path = target_dir + "/" + "ms_output_" + final_graph + ".ir";
    DumpIRProtoWithSrcInfo(kernel_graph, final_graph, target_dir, kDebugWholeStack);
    DumpIR("trace_code_graph", kernel_graph, true, kWholeStack, ir_file_path);
    DumpGraphExeOrder("ms_execution_order_graph_" + std::to_string(kernel_graph->graph_id()) + ".csv", root_dir,
                      kernel_graph->execution_order());
  }
}

/*
 * Feature group: Dump, Online debugger.
 * Target device group: GPU and CPU.
 * Runtime category: MindRT.
 * Description: Load and dump parameters and constant data, call postExecute and update dump iter.
 */
void Debugger::PostExecuteGraphDebugger() {
  if (device_target_ == kAscendDevice) {
    MS_LOG(DEBUG) << "On Ascend, parameters and constant data are not dumped here.";
    return;
  }
  // On CPU, update the dump iteration; parameters and constants are not dumped here.
  if (device_target_ == kCPUDevice) {
    DumpJsonParser::GetInstance().UpdateDumpIter();
    return;
  }
  DumpParamsAndConstAndHistory();
  // debug used for dump
  if (CheckDebuggerDumpEnabled() && !debugger_enabled()) {
    ClearCurrentData();
  }
  if (debugger_) {
    debugger_->PostExecute();
  }
  E2eDump::UpdateIterMindRTDump();
  executed_graph_ptr_set_.clear();
}

/*
 * Feature group: Online debugger.
 * Target device group: Ascend, GPU.
 * Runtime category: Old runtime, MindRT.
 * Description: Send hit watchpoints, update the step number and reset loaded tensors.
 */
void Debugger::PostExecute() {
  // access lock for public method
  std::lock_guard<std::mutex> a_lock(access_lock_);
  if (Common::GetDebugTerminate()) {
    return;
  }
  if (debugger_ && debugger_->DebuggerBackendEnabled()) {
    // analyze tensor data and send the watchpoints that have been hit
    if (debugger_enabled_ && !is_dataset_graph_) {
      SendWatchpoints(CheckWatchpoints());
      // no need to suspend at each graph for GPU old runtime, suspension happens in preExecute
      if (device_target_ == kAscendDevice) {
        CommandLoop();
      } else if (device_target_ == kGPUDevice && MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
        if (!(run_level_ == "node" && suspended_at_last_kernel_)) {
          CommandLoop();
        }
      }
      if (device_target_ != kGPUDevice) {
        num_step_++;
      }
    }
    // Only keep parameters in the current map.
    // GPU ResetLoadedTensors for old runtime happens in preExecute.
    if ((device_target_ == kGPUDevice && MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) ||
        device_target_ == kAscendDevice) {
      if (debug_services_ != nullptr) {
        debug_services_->ResetLoadedTensors();
      } else {
        MS_LOG(DEBUG) << "debug_services_ is nullptr";
      }
    }
  }
}

bool Debugger::ReadNodeDataRequired(const CNodePtr &kernel) const {
  if (debugger_enabled_ && !is_dataset_graph_) {
    auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_, kernel);
    // if the node has a watchpoint on it, or is a next_to or continue_to node, read the kernel tensor data
    if (is_watchpoint || (run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_))) {
      return true;
    }
  }
  return false;
}

/*
 * Feature group: Online debugger.
 * Target device group: GPU.
 * Runtime category: Old runtime, MindRT.
 * Description: Check and send watchpoint hit for a single node, suspend if a watchpoint is hit or we are continuing
 * in node level.
 */
void Debugger::PostExecuteNode(const CNodePtr &kernel, bool last_kernel) {
  // access lock for public method
  std::lock_guard<std::mutex> a_lock(access_lock_);
  if (Common::GetDebugTerminate()) {
    return;
  }
  if (debugger_enabled_ && !is_dataset_graph_) {
    auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_, kernel);

    // if the kernel is a watchpoint and gets hit, suspend
    bool hit_empty_flag = true;
    if (is_watchpoint) {
      auto hits = CheckWatchpoints(cur_name_, kernel);
      if (!hits.empty()) {
        SendWatchpoints(hits);
        CommandLoop();

        hit_empty_flag = false;
      }
    }
    if (hit_empty_flag && run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_)) {
      // if the kernel is not a watchpoint but is a next_to or continue_to node, suspend;
      // set a bool to be checked in preExecute to avoid double stopping at the last kernel in the last graph
      if (last_kernel) {
        suspended_at_last_kernel_ = true;
      }
      CommandLoop();
    }
    return;
  }
}

/*
 * Feature group: Dump, Online debugger.
 * Target device group: Ascend, GPU.
 * Runtime category: Old runtime, MindRT.
 * Description: Get graph proto and add it to graph proto list and add loaded graph pointers to a list.
 */
void Debugger::LoadGraphs(const KernelGraphPtr &graph_ptr) {
  MS_EXCEPTION_IF_NULL(graph_ptr);
  if (graph_ptr_ != graph_ptr) {
    MS_LOG(INFO) << "LoadGraphs Debugger got new graph: " << graph_ptr->graph_id();
    received_new_graph_ = true;
    // save new graph_ptr
    graph_ptr_ = graph_ptr;
    CheckDatasetGraph();
    if (!is_dataset_graph_) {
      // get proto for new graph_ptr
      auto graph_proto = GetGraphProto(graph_ptr);
      // add new graph proto to graph_proto_list_
      graph_proto_list_.push_back(graph_proto);
      graph_ptr_list_.push_back(graph_ptr);
      not_dataset_graph_sum_++;
    }
    // reset is_dataset_graph to be false
    is_dataset_graph_ = false;
  }
}

// In single graph cases, check single graph ptr
void Debugger::CheckGraphPtr(const KernelGraphPtr &graph_ptr) {
  MS_EXCEPTION_IF_NULL(graph_ptr);
  if (graph_ptr_ != graph_ptr) {
    MS_LOG(INFO) << "CheckGraphPtr Debugger got new graph: " << graph_ptr->graph_id();
    // save new graph_ptr
    graph_ptr_ = graph_ptr;
    if (!is_dataset_graph_) {
      // only try to enable debugger if it is not a dataset graph
      if (!enable_debugger_called_) {
        EnableDebugger();
      }
      if (debugger_enabled_) {
        LoadParametersAndConst();
        // get graph proto and send to MindInsight
        auto graph_proto = graph_proto_list_.front();
        SendGraphAndSuspend(graph_proto);
        graph_proto_list_.clear();
        received_new_graph_ = false;
      }
    }
  }
}

void Debugger::CheckDatasetGraph() {
  // print parameter node names
  MS_EXCEPTION_IF_NULL(graph_ptr_);
  const auto &params = graph_ptr_->inputs();
  for (const auto &param : params) {
    MS_LOG(INFO) << "param: " << GetKernelNodeName(param);
  }
  // check if there is a GetNext or InitDataSetQueue node
  const auto &nodes = graph_ptr_->execution_order();
  for (const auto &node : nodes) {
    auto node_name = common::AnfAlgo::GetCNodeName(node);
    MS_LOG(INFO) << "node: " << GetKernelNodeName(node);
    if (node_name == "GetNext" || node_name == "InitDataSetQueue") {
      MS_LOG(INFO) << "Not enabling debugger for graph " << graph_ptr_->graph_id() << ": found dataset graph node "
                   << node_name;
      is_dataset_graph_ = true;
      return;
    }
  }
  is_dataset_graph_ = false;
}

GraphProto Debugger::GetGraphProto(const KernelGraphPtr &graph_ptr) const {
  // convert kernel graph to debugger modelproto
  ModelProto model = GetDebuggerFuncGraphProto(graph_ptr);
  return model.graph();
}

/*
 * Feature group: Online debugger.
 * Target device group: Ascend, GPU.
 * Runtime category: Old runtime, MindRT.
 * Description: Send debugger backend heartbeat to online debugger every few seconds.
 */
void Debugger::SendHeartbeat(int32_t period) {
  int num_heartbeat_fail = 0;
  const int max_num_heartbeat_fail = 5;
  const int retry_milliseconds = 500;

  Heartbeat heartbeat;
  heartbeat.set_message("Debugger is alive");
  heartbeat.set_period(heartbeat_period_second);

  SetEnableHeartbeat(CheckDebuggerEnabled());
  while (enable_heartbeat_) {
    MS_EXCEPTION_IF_NULL(grpc_client_);
    EventReply reply = grpc_client_->SendHeartbeat(heartbeat);
    if (reply.status() != EventReply::OK) {
      MS_LOG(ERROR) << "Error: SendHeartbeat failed";
      num_heartbeat_fail++;
      if (num_heartbeat_fail >= max_num_heartbeat_fail) {
        MS_LOG(ERROR) << "Maximum number of failures for SendHeartbeat reached: exiting training session.";
        SetEnableHeartbeat(false);
        break;
      } else {
        MS_LOG(ERROR) << "Number of consecutive SendHeartbeat fail:" << num_heartbeat_fail;
        std::this_thread::sleep_for(std::chrono::milliseconds(retry_milliseconds));
      }
    } else {
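      // Sleep in short slices rather than one long period so the thread can exit
      // promptly once enable_heartbeat_ is cleared (e.g. by Reset() or Exit()).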
      int recheck_period_ms = 200;
      for (int i = 0; i < (period * 1000 / recheck_period_ms); i++) {
        if (enable_heartbeat_) {
          std::this_thread::sleep_for(std::chrono::milliseconds(recheck_period_ms));
        } else {
          break;
        }
      }
    }
  }
}

void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) {
  if (!CheckSendMetadata()) {
    return;
  }
  // send graph to MindInsight server
  MS_EXCEPTION_IF_NULL(grpc_client_);
  EventReply reply = grpc_client_->SendGraph(graph_proto);
  if (reply.status() != EventReply::OK) {
    MS_LOG(ERROR) << "Error: SendGraph failed";
  }
  // enter command loop, wait and process commands
  CommandLoop();
}

bool Debugger::SendMetadata(bool version_check) {
  // prepare metadata
  MS_EXCEPTION_IF_NULL(graph_ptr_);
  std::string device_name = std::to_string(device_id_) + ":" + std::to_string(graph_ptr_->graph_id());
  Metadata metadata;
  metadata.set_device_name(device_name);
  metadata.set_cur_step(num_step_);
  metadata.set_backend(device_target_);
  metadata.set_cur_node(cur_name_);
  metadata.set_training_done(training_done_);
  metadata.set_ms_version(version_);
  MS_LOG(INFO) << "Is training done? " << training_done_;
  // set graph number to not_dataset_graph_sum_
  metadata.set_graph_num(not_dataset_graph_sum_);

  MS_EXCEPTION_IF_NULL(grpc_client_);
  EventReply reply_metadata = grpc_client_->SendMetadata(metadata);

  bool ret = false;
  if (reply_metadata.status() == EventReply::OK) {
    if (version_check) {
      // get the type of the command in the metadata reply; it should be version matched
      DebuggerCommand cmd = GetCommand(reply_metadata);
      if (cmd != DebuggerCommand::kVersionMatchedCMD) {
        MS_LOG(ERROR) << "MindInsight version is too old, MindSpore version is " << version_;
        Exit();
      } else {
        if (GetMiVersionMatched(reply_metadata)) {
          MS_LOG(INFO) << "MindSpore version is " << version_ << ", which matches the MindInsight version.";
          ret = true;
        } else {
          MS_LOG(ERROR) << "MindSpore version " << version_ << " did not match MindInsight version.";
          CommandLoop();
        }
      }
    } else {
      // version check is done before so we can just return true here
      ret = true;
    }
  } else {
    MS_LOG(ERROR) << "Error: SendMetadata failed";
  }

  return ret;
}

void Debugger::SendMultiGraphsAndSuspend(const std::list<GraphProto> &graph_proto_list) {
  if (!CheckSendMetadata()) {
    return;
  }
  MS_EXCEPTION_IF_NULL(grpc_client_);
  // send multiple graphs to the MindInsight server
  // split graph into chunks if one graph is larger than chunk size
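  // (each chunk carries at most g_chunk_size bytes; finished=true marks a graph's final chunk)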
  std::list<Chunk> chunked_graph_proto_list;
  Chunk chunk;
  for (auto graph : graph_proto_list) {
    std::string str = graph.SerializeAsString();
    auto graph_size = graph.ByteSize();
    if (graph_size > g_chunk_size) {
      auto sub_graph_str = grpc_client_->ChunkString(str, graph_size);

      for (unsigned int i = 0; i < sub_graph_str.size(); i++) {
        chunk.set_buffer(sub_graph_str[i]);
        if (i < sub_graph_str.size() - 1) {
          chunk.set_finished(false);
        } else {
          chunk.set_finished(true);
        }
        chunked_graph_proto_list.push_back(chunk);
      }
    } else {
      chunk.set_buffer(str);
      chunk.set_finished(true);
      chunked_graph_proto_list.push_back(chunk);
    }
  }
  EventReply reply = grpc_client_->SendMultiGraphs(chunked_graph_proto_list);
  if (reply.status() != EventReply::OK) {
    MS_LOG(ERROR) << "Error: SendMultiGraphs failed";
  }
  // enter command loop, wait and process commands
  CommandLoop();
}

bool Debugger::CheckSendMetadata() {
  if (!send_metadata_done_) {
    if (!SendMetadata(true)) {
      return false;
    }
    send_metadata_done_ = true;
  }
  return true;
}

void Debugger::CommandLoop() {
  // prepare metadata
  MS_EXCEPTION_IF_NULL(graph_ptr_);
  std::string device_name = std::to_string(device_id_) + ":" + std::to_string(cur_root_graph_id_);
  Metadata metadata;

  metadata.set_device_name(device_name);
  metadata.set_cur_step(num_step_);
  metadata.set_backend(device_target_);
  metadata.set_cur_node(cur_name_);
  metadata.set_training_done(training_done_);

  // loop exit flag
  bool run = false;
  int num_wait_fail = 0;
  const int max_num_wait_fail = 5;

  while (!run) {
    // wait for command
    MS_EXCEPTION_IF_NULL(grpc_client_);
    EventReply reply = grpc_client_->WaitForCommand(metadata);
    if (reply.status() != EventReply::OK) {
      MS_LOG(ERROR) << "Error: WaitForCommand failed";
      num_wait_fail++;
      if (num_wait_fail > max_num_wait_fail) {
        MS_LOG(ERROR) << "Maximum number of WaitForCommand retries reached: exiting training session.";
        MS_LOG(ERROR) << "Failed to connect to MindInsight debugger server. Please check the config "
                         "of debugger host and port.";
        Exit();
        run = true;
      } else {
        MS_LOG(ERROR) << "Number of consecutive WaitForCommand fail:" << num_wait_fail << "; Retry after "
                      << num_wait_fail << "s";
        std::this_thread::sleep_for(std::chrono::seconds(num_wait_fail));
      }
      continue;
    }

    // get type of the command in reply
    DebuggerCommand cmd = GetCommand(reply);
    if (cmd == DebuggerCommand::kUnknownCMD) {
      MS_LOG(DEBUG) << "Debug: debugger received unknown command";
      continue;
    }

    MS_LOG(INFO) << "received command: ";
    switch (cmd) {
      case DebuggerCommand::kUnknownCMD:
        MS_LOG(INFO) << "UnknownCMD";
        break;
      case DebuggerCommand::kExitCMD:
        MS_LOG(INFO) << "ExitCMD";
        Exit(true);
        // Used for debugger termination
        run = true;
        break;
      case DebuggerCommand::kRunCMD:
        ProcessRunCMD(reply);
        if (GetRunLevel(reply) != "recheck") {
          // exit loop
          run = true;
        }
        break;
      case DebuggerCommand::kSetCMD:
        ProcessKSetCMD(reply);
        break;
      case DebuggerCommand::kViewCMD:
        ProcessKViewCMD(reply);
        break;
      case DebuggerCommand::kVersionMatchedCMD:
        MS_LOG(ERROR) << "Received unexpected Version Matched CMD from MindInsight.";
        Exit();
        break;
      default:
        MS_LOG(ERROR) << "Received unknown CMD from MindInsight";
        Exit();
        break;
    }
  }
}

void Debugger::ProcessRunCMD(const EventReply &reply) {
  MS_LOG(INFO) << "RunCMD";
  if (GetRunLevel(reply) == "recheck") {
    MS_LOG(INFO) << "rechecking all watchpoints";
    SendWatchpoints(CheckWatchpoints("", nullptr, true));
  } else {
    // no longer the initial suspension.
    initial_suspend_ = false;
    // print run cmd content
    // get run_level and node_name
    run_level_ = GetRunLevel(reply);
    node_name_ = GetNodeName(reply);

    MS_LOG(INFO) << "run_level: " << run_level_;
    MS_LOG(INFO) << "node_name_: " << node_name_;
  }
}

void Debugger::ProcessKSetCMD(const EventReply &reply) {
  MS_LOG(INFO) << "SetCMD";
  MS_LOG(INFO) << "id: " << GetWatchpointID(reply);
  MS_LOG(INFO) << "delete: " << GetWatchpointDelete(reply);
  if (GetWatchpointDelete(reply)) {
    MS_LOG(INFO) << "Deleting watchpoint";
    RemoveWatchpoint(GetWatchpointID(reply));
  } else {
    MS_LOG(INFO) << "Setting watchpoint";
    MS_LOG(INFO) << "condition: " << GetWatchcondition(reply).condition();
    ProtoVector<WatchNode> received_nodes = GetWatchnodes(reply);
    for (const auto &node : received_nodes) {
      MS_LOG(INFO) << "node name: " << node.node_name();
      MS_LOG(INFO) << "node type: " << node.node_type();
    }
    ProtoVector<WatchCondition_Parameter> parameters = GetParameters(reply);
    for (const auto &parameter : parameters) {
      MS_LOG(INFO) << "parameter name: " << parameter.name();
      MS_LOG(INFO) << "parameter is disabled: " << parameter.disabled();
      MS_LOG(INFO) << "parameter value: " << parameter.value();
    }
    SetWatchpoint(GetWatchnodes(reply), GetWatchcondition(reply), GetWatchpointID(reply), GetParameters(reply));
  }
}

void Debugger::ProcessKViewCMD(const EventReply &reply) {
  MS_LOG(INFO) << "ViewCMD";
  // print view cmd content
  ProtoVector<TensorProto> received_tensors = GetTensors(reply);
  for (auto received_tensor : received_tensors) {
    MS_LOG(INFO) << "tensor node name: " << received_tensor.node_name();
    MS_LOG(INFO) << "tensor slot: " << received_tensor.slot();
    MS_LOG(INFO) << "tensor finished: " << std::boolalpha << received_tensor.finished() << std::noboolalpha;
    MS_LOG(INFO) << "tensor iter: " << received_tensor.iter();
    MS_LOG(INFO) << "tensor truncate: " << std::boolalpha << received_tensor.truncate() << std::noboolalpha;
  }

  switch (reply.view_cmd().level()) {
    case debugger::ViewCMD_Level::ViewCMD_Level_base:
      MS_LOG(INFO) << "Tensor base request.";
      ViewBaseLevel(reply);
      break;

    case debugger::ViewCMD_Level::ViewCMD_Level_statistics:
      MS_LOG(INFO) << "Tensor statistics request.";
      ViewStatLevel(reply);
      break;

    case debugger::ViewCMD_Level::ViewCMD_Level_value:
      MS_LOG(INFO) << "Tensor value request.";
      ViewValueLevel(reply);
      break;
    default:
      MS_LOG(DEBUG) << "Debug: Unknown tensor info level";
      break;
  }
}

void Debugger::ViewValueLevel(const EventReply &reply) {
  MS_LOG(INFO) << "Sending tensors";
  std::list<TensorProto> tensors = LoadTensors(GetTensors(reply));
  // print view cmd reply
  for (auto tensor = tensors.cbegin(); tensor != tensors.cend(); ++tensor) {
    MS_LOG(INFO) << "tensor node name: " << tensor->node_name();
    MS_LOG(INFO) << "tensor slot: " << tensor->slot();
    MS_LOG(INFO) << "tensor finished: " << std::boolalpha << tensor->finished() << std::noboolalpha;
    MS_LOG(INFO) << "tensor iter: " << tensor->iter();
    MS_LOG(INFO) << "tensor truncate: " << std::boolalpha << tensor->truncate() << std::noboolalpha;
    MS_LOG(INFO) << "tensor dims: ";
    for (auto dim = tensor->dims().cbegin(); dim != tensor->dims().cend(); dim++) {
      MS_LOG(INFO) << *dim << ",";
    }
    MS_LOG(INFO) << "tensor dtype: " << tensor->data_type();
  }
  MS_EXCEPTION_IF_NULL(grpc_client_);
  EventReply send_tensors_reply = grpc_client_->SendTensors(tensors);
  if (send_tensors_reply.status() != debugger::EventReply::OK) {
    MS_LOG(ERROR) << "Error: SendTensors failed";
  }
}

void Debugger::ViewStatLevel(const EventReply &reply) {
  std::list<TensorSummary> tensor_stats_list = LoadTensorsStat(GetTensors(reply));
  EventReply send_tensors_stat_reply = grpc_client_->SendTensorStats(tensor_stats_list);
  if (send_tensors_stat_reply.status() != debugger::EventReply::OK) {
    MS_LOG(ERROR) << "Error: SendTensorsStats failed.";
  }
}

void Debugger::ViewBaseLevel(const EventReply &reply) {
  std::list<TensorBase> tensor_base_list = LoadTensorsBase(GetTensors(reply));
  EventReply send_tensor_base_reply = grpc_client_->SendTensorBase(tensor_base_list);
  if (send_tensor_base_reply.status() != debugger::EventReply::OK) {
    MS_LOG(ERROR) << "Error: SendTensorsBase failed.";
  }
}

void AddTensorProtoInfo(TensorProto *tensor_item, const TensorProto &tensor) {
  tensor_item->set_node_name(tensor.node_name());
  tensor_item->set_slot(tensor.slot());
  tensor_item->set_iter(tensor.iter());
  tensor_item->set_truncate(tensor.truncate());
  tensor_item->clear_tensor_content();
  tensor_item->clear_data_type();
  tensor_item->clear_dims();
}

void AddTensorStatInfo(const DebugServices::TensorStat &tensor_stat,
                       std::list<TensorSummary> *const tensor_summary_list) {
  if (tensor_summary_list == nullptr) {
    MS_LOG(DEBUG) << "tensor_summary_list is nullptr.";
    return;
  }
  TensorSummary tensor_summary_item;
  TensorBase *tensor_base = tensor_summary_item.mutable_tensor_base();
  tensor_base->set_data_type(tensor_stat.dtype);
  tensor_base->set_data_size(static_cast<int64_t>(tensor_stat.data_size));
  for (auto elem : tensor_stat.shape) {
    tensor_base->add_shape(elem);
  }

  Statistics *tensor_statistics = tensor_summary_item.mutable_statistics();
  tensor_statistics->set_is_bool(tensor_stat.is_bool);
  tensor_statistics->set_max_value(static_cast<float>(tensor_stat.max_value));
  tensor_statistics->set_min_value(static_cast<float>(tensor_stat.min_value));
  tensor_statistics->set_avg_value(static_cast<float>(tensor_stat.avg_value));
  tensor_statistics->set_count(tensor_stat.count);
  tensor_statistics->set_neg_zero_count(tensor_stat.neg_zero_count);
  tensor_statistics->set_pos_zero_count(tensor_stat.pos_zero_count);
  tensor_statistics->set_nan_count(tensor_stat.nan_count);
  tensor_statistics->set_neg_inf_count(tensor_stat.neg_inf_count);
  tensor_statistics->set_pos_inf_count(tensor_stat.pos_inf_count);
  tensor_statistics->set_zero_count(tensor_stat.zero_count);

  tensor_summary_list->push_back(tensor_summary_item);
}

void Debugger::SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCondition &condition, const int32_t id,
                             const ProtoVector<WatchCondition_Parameter> &parameters) {
  std::vector<std::tuple<std::string, bool>> check_node_list;
  std::vector<DebugServices::parameter_t> parameter_list;

  std::transform(nodes.begin(), nodes.end(), std::back_inserter(check_node_list),
                 [](const WatchNode &node) -> std::tuple<std::string, bool> {
                   return make_tuple(node.node_name(), node.node_type() == "scope");
                 });

  std::transform(
    parameters.begin(), parameters.end(), std::back_inserter(parameter_list),
    [](const WatchCondition_Parameter &parameter) -> DebugServices::parameter_t {
      return DebugServices::parameter_t{parameter.name(), parameter.disabled(), parameter.value(), parameter.hit()};
    });
  debug_services_->AddWatchpoint(id, static_cast<int>(condition.condition()), condition.value(), check_node_list,
                                 parameter_list);
}

void Debugger::RemoveWatchpoint(const int32_t id) { debug_services_->RemoveWatchpoint(id); }

std::list<TensorProto> Debugger::LoadTensors(const ProtoVector<TensorProto> &tensors) const {
  std::vector<std::string> name;
  std::vector<std::string> ret_name;
  std::vector<const char *> data_ptr;
  std::vector<ssize_t> data_size;
  std::vector<unsigned int> dtype;
  std::vector<std::vector<int64_t>> shape;

  std::transform(tensors.begin(), tensors.end(), std::back_inserter(name), GetTensorFullName);

  // ret_name will contain tensor names that are found in TensorLoader
  // items in ret_name will be in the same order with tensors if found
  debug_services_->ReadNodesTensors(name, &ret_name, &data_ptr, &data_size, &dtype, &shape);
  std::list<TensorProto> tensor_list;
  size_t result_index = 0;

  for (auto tensor : tensors) {
    ssize_t size_iter = 0;
    if (result_index >= ret_name.size() || ret_name[result_index] != GetTensorFullName(tensor)) {
      // the requested tensor was not found; return an empty tensor marked finished
      TensorProto tensor_item;
      tensor_item.set_finished(true);
      AddTensorProtoInfo(&tensor_item, tensor);
      tensor_list.push_back(tensor_item);
      continue;
    }
    ssize_t tensor_size = data_size[result_index];
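    // Stream the tensor content back in g_chunk_size pieces; only the final
    // piece is marked finished=true so the client can reassemble the buffer.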
    while (size_iter < tensor_size) {
      ssize_t chunk_size = g_chunk_size;
      TensorProto tensor_item;
      tensor_item.set_finished(false);
      if (tensor_size - size_iter <= g_chunk_size) {
        chunk_size = tensor_size - size_iter;
        tensor_item.set_finished(true);
      }
      AddTensorProtoInfo(&tensor_item, tensor);
      // copy the next chunk of tensor content into this proto
      tensor_item.set_tensor_content(data_ptr[result_index] + size_iter, chunk_size);
      tensor_item.set_data_type(static_cast<debugger::DataType>(dtype[result_index]));
      for (auto &elem : shape[result_index]) {
        tensor_item.add_dims(elem);
      }
      // add tensor to result list and increment result_index to check next item in ret_name
      tensor_list.push_back(tensor_item);
      if (size_iter > INT_MAX - g_chunk_size) {
        MS_EXCEPTION(ValueError) << size_iter << " + " << g_chunk_size << " would lead to integer overflow!";
      }
      size_iter += g_chunk_size;
    }
    result_index++;
  }
  return tensor_list;
}

std::list<TensorBase> Debugger::LoadTensorsBase(const ProtoVector<TensorProto> &tensors) const {
  std::list<TensorBase> tensor_base_list;
  std::vector<std::string> name;
  std::transform(tensors.begin(), tensors.end(), std::back_inserter(name), GetTensorFullName);
  std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
  debug_services_->SearchNodesTensors(name, &result_list);
  for (auto result : result_list) {
    auto tensor = std::get<1>(result);
    if (!tensor || ((cur_root_graph_id_ != tensor->GetRootGraphId()) &&
                    MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT))) {
      // tensor was not found, or its graph was not executed in the current step; create an empty tensor base.
      TensorBase tensor_base_item;
      tensor_base_item.set_data_size(0);
      tensor_base_item.set_data_type(0);
      tensor_base_item.add_shape(0);
      tensor_base_list.push_back(tensor_base_item);
      continue;
    }
    // tensor was found; create the tensor base object.
    TensorBase tensor_base_item;
    tensor_base_item.set_data_size(static_cast<int64_t>(tensor->GetByteSize()));
    tensor_base_item.set_data_type(static_cast<int32_t>(tensor->GetType()));
    for (auto elem : tensor->GetShape()) {
      tensor_base_item.add_shape(elem);
    }
    tensor_base_list.push_back(tensor_base_item);
  }
  return tensor_base_list;
}

std::list<TensorSummary> Debugger::LoadTensorsStat(const ProtoVector<TensorProto> &tensors) const {
  std::list<TensorSummary> tensor_summary_list;
  std::vector<std::string> name;
  std::transform(tensors.begin(), tensors.end(), std::back_inserter(name), GetTensorFullName);
  std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
  debug_services_->SearchNodesTensors(name, &result_list);
  for (auto result : result_list) {
    auto tensor = std::get<1>(result);
    if (!tensor || ((cur_root_graph_id_ != tensor->GetRootGraphId()) &&
                    MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT))) {
      // tensor was not found, or its graph was not executed in the current step; create an empty tensor summary.
      DebugServices::TensorStat tensor_stat;
      AddTensorStatInfo(tensor_stat, &tensor_summary_list);
      continue;
    }
    // tensor was found; create the tensor summary object.
    DebugServices::TensorStat tensor_stat = DebugServices::GetTensorStatistics(tensor);
    AddTensorStatInfo(tensor_stat, &tensor_summary_list);
  }
  return tensor_summary_list;
}

std::shared_ptr<TensorData> Debugger::GetTensor(const std::string &tensor_name) const {
  return debug_services_->GetTensor(tensor_name);
}

void Debugger::Exit(bool exit_success) {
  // debugger will notify main thread to exit because main thread can only exit at step boundary.
  MS_LOG(INFO) << "Exit Debugger";
  SetEnableHeartbeat(false);
  Common::DebugTerminate(true, exit_success);
}

std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode, const CNodePtr &kernel,
                                                    bool recheck) {
  std::vector<std::string> name;
  std::vector<std::string> slot;
  std::vector<int> condition;
  std::vector<unsigned int> watchpoint_id;
  std::vector<std::vector<DebugServices::parameter_t>> parameters;
  std::vector<int32_t> error_codes;
  std::vector<std::shared_ptr<TensorData>> tensor_list;
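  // The vectors above are filled in lockstep by DebugServices::CheckWatchpoints;
  // index i of each vector describes the i-th watchpoint hit.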
  if (watchnode.empty()) {
    tensor_list = debug_services_->GetTensor();
  } else {
    tensor_list = debug_services_->GetNodeTensor(kernel);
  }
  DebugServices::ProcessedNPYFiles processed_npy_files;
  MS_LOG(INFO) << "CheckWatchpoints call for step " << num_step_;
  debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, &parameters, &error_codes,
                                    &processed_npy_files, &tensor_list, initial_suspend_, watchnode.empty(), recheck);
  std::list<WatchpointHit> hits;
  for (unsigned int i = 0; i < name.size(); i++) {
    WatchpointHit hit;
    std::vector<DebugServices::parameter_t> &parameter = parameters[i];
    hit.set_id(watchpoint_id[i]);
    hit.set_error_code(error_codes[i]);
    // here TensorProto acts as a tensor indicator, not sending tensor content
    TensorProto *tensor_item = hit.mutable_tensor();
    tensor_item->set_node_name(name[i]);
    tensor_item->set_slot(slot[i]);
    tensor_item->set_finished(true);

    WatchCondition *condition_item = hit.mutable_watch_condition();
    condition_item->set_condition(debugger::WatchCondition_Condition(condition[i]));
    for (const auto &p : parameter) {
      auto x = condition_item->mutable_params()->Add();
      x->set_name(p.name);
      x->set_disabled(p.disabled);
      x->set_value(p.value);
      x->set_hit(p.hit);
      x->set_actual_value(p.actual_value);
    }
    hits.push_back(hit);
  }
  return hits;
}

void Debugger::SendWatchpoints(const std::list<WatchpointHit> &points) {
  // send info about watchpoint
  if (!points.empty()) {
    MS_EXCEPTION_IF_NULL(grpc_client_);
    EventReply reply = grpc_client_->SendWatchpointHits(points);
    if (reply.status() != EventReply::OK) {
      MS_LOG(ERROR) << "Error: SendWatchpointHits failed";
    }
  }
}

bool Debugger::DumpTensorToFile(const std::string &filepath, const std::string &tensor_name, size_t slot) const {
  if (debug_services_ == nullptr) {
    MS_LOG(INFO) << "The debug_services_ is nullptr.";
    return false;
  }
  return debug_services_.get()->DumpTensorToFile(filepath, tensor_name, slot);
}

bool Debugger::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev) {
  if (debug_services_ == nullptr) {
    debug_services_ = std::make_unique<DebugServices>();
  }
  return debug_services_.get()->LoadNewTensor(tensor, keep_prev);
}

bool Debugger::debugger_enabled() const { return debugger_enabled_; }

bool Debugger::partial_memory() const { return partial_memory_; }

void Debugger::SetEnableHeartbeat(bool enabled) { enable_heartbeat_ = enabled; }

void Debugger::SetCurNode(const std::string &cur_name) {
  // access lock for public method
  std::lock_guard<std::mutex> a_lock(access_lock_);
  cur_name_ = cur_name;
}

std::string Debugger::run_level() const { return run_level_; }

void Debugger::SetTrainingDone(bool training_done) { training_done_ = training_done; }

bool Debugger::CheckPort(const std::string &port) const {
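  // Manual digit-by-digit parse: rejects non-digits and leading zeros, and
  // range-checks against [1, 65535] without std::stoi exceptions. E.g. "50051"
  // is accepted; "0", "070", "65536" and "12a" are rejected.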
1388 int num = 0;
1389 const int min_port_num = 1;
1390 const int max_port_num = 65535;
1391 const int decimal = 10;
1392 if (port[0] == '0' && port[1] != '\0') {
1393 return false;
1394 }
1395 size_t i = 0;
1396 while (port[i] != '\0') {
1397 if (port[i] < '0' || port[i] > '9') {
1398 return false;
1399 }
1400 num = num * decimal + (port[i] - '0');
1401 if (num > max_port_num) {
1402 return false;
1403 }
1404 i++;
1405 }
1406 if (num < min_port_num) {
1407 return false;
1408 }
1409 return true;
1410 }
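
// For reference, the accept/reject behavior implied by the logic above
// (hand-derived examples, not taken from a test suite):
//   CheckPort("1")     -> true   (min_port_num)
//   CheckPort("65535") -> true   (max_port_num)
//   CheckPort("65536") -> false  (exceeds max_port_num)
//   CheckPort("0")     -> false  (below min_port_num)
//   CheckPort("08080") -> false  (leading zero rejected)
//   CheckPort("80a")   -> false  (non-digit character)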

bool Debugger::CheckIp(const std::string &host) const {
  std::regex reg_ip(
    "(25[0-4]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[1-9])"
    "[.](25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])"
    "[.](25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])"
    "[.](25[0-4]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[1-9])");
  std::smatch smat;
  std::string host_str = host;
  return std::regex_match(host_str, smat, reg_ip);
}
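
// Note on the regex above: the middle two octets accept 0-255, while the first
// and last octets are restricted to 1-254, so addresses such as "0.x.x.x" or
// "x.x.x.255" are rejected. Hand-checked examples (not from a test suite):
//   CheckIp("127.0.0.1")       -> true
//   CheckIp("0.0.0.0")         -> false  (first octet out of range)
//   CheckIp("255.255.255.255") -> false  (first and last octets out of range)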

uint32_t Debugger::GetFirstRunGraphId() const { return rungraph_id_list_.front(); }

/*
 * Feature group: Dump.
 * Target device group: Ascend, GPU.
 * Runtime category: Old runtime, MindRT.
 * Description: Load a single parameter or value node.
 */
void Debugger::LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output_index, uint32_t root_graph_id) {
  MS_EXCEPTION_IF_NULL(anf_node);
  if (!anf_node->isa<Parameter>() && !anf_node->isa<ValueNode>()) {
    return;
  }
  // When MindRT is used, only ValueNodes and ParameterWeights can be loaded from device to host
  if (MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
    if (!anf_node->isa<ValueNode>() &&
        !(anf_node->isa<Parameter>() && common::AnfAlgo::IsParameterWeight(anf_node->cast<ParameterPtr>()))) {
      return;
    }
  }
  // for parameters and value nodes, set the execution order to 0
  int exec_order = 0;
  std::string node_name = GetKernelNodeName(anf_node);
  GetFileKernelName(NOT_NULL(&node_name));
  // check whether the output address exists; if not, return
  if (!AnfAlgo::OutputAddrExist(anf_node, output_index)) {
    return;
  }
  auto addr = AnfAlgo::GetOutputAddr(anf_node, output_index);
  MS_EXCEPTION_IF_NULL(addr);
  auto type = common::AnfAlgo::GetOutputInferDataType(anf_node, output_index);
  if (!IsTypeDebuggerSupported(type)) {
    return;
  }
  auto format = kOpFormat_DEFAULT;
  string tensor_name = node_name + ':' + "0";
  ShapeVector int_shapes = trans::GetRuntimePaddingShape(anf_node, output_index);
  bool keep_prev;
  if (anf_node->isa<Parameter>()) {
    keep_prev = true;
    debug_services_->MoveTensorCurrentToPrev(tensor_name);
  } else {
    keep_prev = false;
  }
  bool ret =
      addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, keep_prev, root_graph_id, false, true);
  if (!ret) {
    MS_LOG(ERROR) << "LoadMemToHost failed, tensor_name: " << tensor_name << ", host_format: " << format << ".";
  }
}
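
// Note (inferred from the code above, not an external spec): the host-side key
// is always node_name + ":0", since parameters and value nodes are read through
// a single output slot here. keep_prev is true only for Parameters, so the
// previous step's value is retained via MoveTensorCurrentToPrev for conditions
// that compare across iterations; ValueNodes are constants, so no previous copy
// is kept.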

void Debugger::LoadSingleParameterMindRT(const AnfNodePtr &node) {
  MS_EXCEPTION_IF_NULL(node);
  auto root_graph_id = cur_root_graph_id_;
  // This function is only for loading parameters under MindRT.
  std::string node_name = GetKernelNodeName(node);
  GetFileKernelName(NOT_NULL(&node_name));
  TypeId type;
  TypeId device_type;
  ShapeVector int_shapes;
  auto device_addr = GetParameterInfo(node, NOT_NULL(&int_shapes), NOT_NULL(&type), NOT_NULL(&device_type));
  if (device_addr == nullptr || device_addr->GetPtr() == nullptr) {
    MS_LOG(DEBUG) << "Skip node: " << node_name << ". Parameter data is not available for MindRT.";
    return;
  }
  if (!IsTypeDebuggerSupported(type)) {
    return;
  }
  auto format = kOpFormat_DEFAULT;
  string tensor_name = node_name + ':' + "0";
  if (debug_services_ != nullptr) {
    debug_services_->MoveTensorCurrentToPrev(tensor_name);
  }
  // keep_prev is true for parameters; force update so the latest value replaces the cached one.
  bool ret = device_addr->LoadMemToHost(tensor_name, 0, format, int_shapes, type, 0, true, root_graph_id, true, true);
  if (!ret) {
    MS_LOG(ERROR) << "LoadMemToHost failed, tensor_name: " << tensor_name << ", host_format: " << format << ".";
  }
}

/*
 * Feature group: Dump, Online debugger.
 * Target device group: Ascend, GPU.
 * Runtime category: Old runtime, MindRT.
 * Description: Load all the parameters and value nodes for the last loaded graph.
 */
void Debugger::LoadParametersAndConst() {
  if (!(debugger_enabled_ || CheckDebuggerDumpEnabled())) {
    return;
  }
  MS_EXCEPTION_IF_NULL(graph_ptr_);
  // load parameters
  MS_LOG(INFO) << "Start to load Parameters for graph " << graph_ptr_->graph_id() << ".";
  auto root_graph_id = graph_ptr_->root_graph_id();
  const auto &parameters = graph_ptr_->inputs();
  for (auto &item : parameters) {
    LoadSingleAnfnode(item, kParameterOutputIndex, root_graph_id);
  }
  // load value nodes: get all constant values from the graph
  MS_LOG(INFO) << "Start to load value nodes for graph " << graph_ptr_->graph_id() << ".";
  const auto value_nodes = graph_ptr_->graph_value_nodes();
  for (auto &item : value_nodes) {
    LoadSingleAnfnode(item, kValueNodeOutputIndex, root_graph_id);
  }
}

/*
 * Feature group: Dump, Online debugger.
 * Target device group: Ascend, GPU.
 * Runtime category: Old runtime, MindRT.
 * Description: Load all the parameters and value nodes for the given graph.
 */
void Debugger::LoadParametersAndConst(const KernelGraphPtr &graph) {
  if (!(debugger_enabled_ || CheckDebuggerDumpEnabled())) {
    return;
  }
  MS_EXCEPTION_IF_NULL(graph);
  // load parameters
  MS_LOG(INFO) << "Start to load Parameters for graph " << graph->graph_id() << ".";
  auto root_graph_id = graph->root_graph_id();
  const auto &parameters = graph->inputs();
  for (auto &item : parameters) {
    LoadSingleAnfnode(item, kParameterOutputIndex, root_graph_id);
  }
  // load value nodes: get all constant values from the graph
  MS_LOG(INFO) << "Start to load value nodes for graph " << graph->graph_id() << ".";
  const auto value_nodes = graph->graph_value_nodes();
  for (auto &item : value_nodes) {
    LoadSingleAnfnode(item, kValueNodeOutputIndex, root_graph_id);
  }
}

/*
 * Feature group: Dump.
 * Target device group: GPU.
 * Runtime category: MindRT.
 * Description: Loads parameter data from device to host into tensor_list_map_ for GPU dump.
 *              Ascend dump does not use tensor_list_map_, so this is not needed there.
 */
void Debugger::LoadParametersAllGraphs() {
  if (!(device_target_ == kGPUDevice && CheckDebuggerDumpEnabled())) {
    return;
  }
  for (auto &node : parameters_mindRT_) {
    LoadSingleParameterMindRT(node);
  }
}

/*
 * Feature group: Dump.
 * Target device group: GPU.
 * Runtime category: MindRT.
 * Description: Loads constant data from device to host into tensor_list_map_ for GPU dump.
 *              Ascend dump does not use tensor_list_map_, so this is not needed there.
 */
void Debugger::LoadConstsForGraph(const KernelGraphPtr &graph) {
  if (!(device_target_ == kGPUDevice && CheckDebuggerDumpEnabled())) {
    return;
  }
  MS_EXCEPTION_IF_NULL(graph);
  // load value nodes: get all constant values from the graph
  MS_LOG(INFO) << "Start to load value nodes for graph " << graph->graph_id() << ".";
  auto root_graph_id = graph->root_graph_id();
  const auto value_nodes = graph->graph_value_nodes();
  for (auto &item : value_nodes) {
    LoadSingleAnfnode(item, kValueNodeOutputIndex, root_graph_id);
  }
}

/*
 * Feature group: Online debugger.
 * Target device group: Ascend.
 * Runtime category: Old runtime, MindRT.
 * Description: Load all the kernel outputs for the last loaded graph.
 */
void Debugger::LoadGraphOutputs() {
  if (!(debugger_enabled() && device_target_ == kAscendDevice)) {
    return;
  }
  MS_EXCEPTION_IF_NULL(graph_ptr_);
  const auto &apply_kernels = graph_ptr_->execution_order();
  auto root_graph_id = graph_ptr_->root_graph_id();
  // for kernels, execution order starts from 1
  int exec_order = 1;
  for (const auto &node : apply_kernels) {
    MS_EXCEPTION_IF_NULL(node);
    std::string kernel_name = GetKernelNodeName(node);
    auto output_size = AnfAlgo::GetOutputTensorNum(node);
    if (partial_memory_ && !debug_services_->IsWatchPoint(kernel_name, node)) {
      continue;
    }
    for (size_t j = 0; j < output_size; ++j) {
      if (!AnfAlgo::OutputAddrExist(node, j)) {
        MS_LOG(INFO) << "Cannot find output addr for slot " << j << " for " << kernel_name;
        continue;
      }
      auto addr = AnfAlgo::GetOutputAddr(node, j);
      MS_EXCEPTION_IF_NULL(addr);
      auto type = common::AnfAlgo::GetOutputInferDataType(node, j);
      if (!IsTypeDebuggerSupported(type)) {
        continue;
      }
      auto format = kOpFormat_DEFAULT;
      string tensor_name = kernel_name + ':' + std::to_string(j);
      ShapeVector int_shapes = trans::GetRuntimePaddingShape(node, j);
      auto ret =
          addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false, root_graph_id, false, true);
      if (!ret) {
        MS_LOG(ERROR) << "LoadMemToHost failed, tensor_name: " << tensor_name << ", host_format: " << format << ".";
      }
    }
    ++exec_order;
  }
}
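
// Sketch of the resulting tensor keys (inferred from the loop above): output j
// of a kernel is stored as "<kernel_name>:<j>" with a 1-based execution order.
// For a hypothetical two-output kernel "Default/Conv2D-op1" executed first,
// this yields "Default/Conv2D-op1:0" and "Default/Conv2D-op1:1", both with
// exec_order 1.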

/*
 * Feature group: Online debugger.
 * Target device group: GPU.
 * Runtime category: Old runtime.
 * Description: Update step number if we are processing the first graph (to support multigraph).
 */
void Debugger::UpdateStepNum(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(debugger_);
  if (device_target_ == kGPUDevice && (debugger_enabled_ || device::KernelRuntime::DumpDataEnabledIteration()) &&
      (graph->graph_id() == debugger_->GetFirstRunGraphId())) {
    // access lock for public method
    std::lock_guard<std::mutex> a_lock(access_lock_);
    ++num_step_;
  }
}

/*
 * Feature group: Online debugger.
 * Target device group: GPU.
 * Runtime category: MindRT.
 * Description: Update step number when DebugActor::DebugOnStepEnd is called at the end of each step.
 */
void Debugger::UpdateStepNumGPU() {
  auto &dump_json_parser = DumpJsonParser::GetInstance();
  if (device_target_ == kGPUDevice && (debugger_enabled_ || dump_json_parser.DumpEnabledForIter())) {
    // access lock for public method
    std::lock_guard<std::mutex> a_lock(access_lock_);
    ++num_step_;
    MS_LOG(DEBUG) << "Update step for GPU, current step: " << num_step_;
  }
}

void Debugger::ClearCurrentData() {
  if (device::KernelRuntime::DumpDataEnabledIteration()) {
    if (debug_services_) {
      debug_services_->EmptyCurrentTensor();
    } else {
      MS_LOG(WARNING) << "debug_services_ is nullptr";
    }
  }
}

bool Debugger::TensorExistsInCurrent(const std::string &tensor_name) {
  if (debug_services_ != nullptr) {
    return debug_services_->TensorExistsInCurrent(tensor_name);
  }
  return false;
}
}  // namespace mindspore