/**
 * Copyright 2020-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "include/backend/debug/debugger/debugger.h"
#include <dirent.h>
#include <tuple>
#include <vector>
#include <algorithm>
#include <iostream>
#include <map>
#include <regex>
#include "include/backend/debug/data_dump/dump_json_parser.h"
#include "backend/common/session/session_basic.h"
#include "include/backend/anf_runtime_algorithm.h"
#include "include/common/utils/anfalgo.h"
#include "runtime/device/kernel_runtime.h"
#include "include/backend/debug/data_dump/e2e_dump.h"
#include "include/common/utils/config_manager.h"
#include "include/common/debug/env_config_parser.h"
#include "include/common/utils/comm_manager.h"
#include "runtime/hardware/device_context_manager.h"
#include "include/common/debug/anf_ir_dump.h"
#include "include/common/debug/anf_dump_utils.h"
#include "runtime/graph_scheduler/device_tensor_store.h"
#ifdef ENABLE_DEBUGGER
#include "debug/debugger/proto_exporter.h"
#endif
#include "include/backend/debug/debugger/proto_exporter.h"
#include "debug/debugger/debugger_utils.h"
#include "debug/debugger/grpc_client.h"
#include "debug/debug_services.h"
#include "runtime/device/ms_device_shape_transfer.h"

using debugger::Chunk;
using debugger::EventReply;
using debugger::GraphProto;
using debugger::ModelProto;
using debugger::Statistics;
using debugger::TensorProto;
using debugger::WatchCondition;
using debugger::WatchCondition_Condition_inf;
using debugger::WatchCondition_Condition_nan;
using debugger::WatchCondition_Parameter;
using debugger::WatchNode;
using debugger::WatchpointHit;
using mindspore::runtime::DeviceTensorStore;

namespace mindspore {

static constexpr auto g_chunk_size = 1024 * 1024 * 3;
static constexpr int32_t heartbeat_period_second = 30;

std::shared_ptr<Debugger> Debugger::GetInstance() {
  std::lock_guard<std::mutex> i_lock(instance_lock_);
  if (debugger_ == nullptr) {
    debugger_ = std::shared_ptr<Debugger>(new (std::nothrow) Debugger());
  }
  return debugger_;
}

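// Typical call sequence (illustrative sketch only; the session/runtime layers normally
// drive this, and device_id / device_target stand in for the caller's actual values):
//   auto debugger = Debugger::GetInstance();
//   debugger->Init(device_id, device_target);
//   debugger->EnableDebugger();
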
Debugger::Debugger()
    : grpc_client_(nullptr),
      debug_services_(nullptr),
      heartbeat_thread_(nullptr),
      device_id_(0),
      device_target_(""),
      num_step_(0),
      debugger_enabled_(false),
      suspended_at_last_kernel_(false),
      run_level_(""),
      node_name_(""),
      cur_name_(""),
      training_done_(false),
      send_metadata_done_(false),
      received_new_graph_(false),
      is_dataset_graph_(false),
      partial_memory_(false),
      initial_suspend_(true),
      enable_heartbeat_(false),
      not_dataset_graph_sum_(0),
      ascend_kernel_by_kernel_(false),
      enable_debugger_called_(false),
      version_("") {
  CheckDebuggerEnabledParam();
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  std::string device_target = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  MS_LOG(INFO) << "Debugger got device_target: " << device_target;
  if (!CheckDebuggerEnabled()) {
    return;
  } else if (device_target == kCPUDevice) {
    MS_LOG(WARNING) << "Not enabling debugger. Debugger does not support CPU.";
  } else {
    // configure partial memory reuse
    partial_memory_ = CheckDebuggerPartialMemoryEnabled();

    // switch memory reuse on or off
    EnvConfigParser::GetInstance().SetSysMemreuse(partial_memory_);
    // print some message about memory reuse to user
    if (partial_memory_) {
      MS_LOG(WARNING)
        << "Partial Memory Reuse is enabled. Note: 1. Please only set watchpoints before running the first "
           "step. 2. Tensor values are only available for nodes that are watched by any watchpoint.";
    } else {
      MS_LOG(WARNING)
        << "Memory Reuse is disabled. Set environment variable MS_DEBUGGER_PARTIAL_MEM=1 to reduce memory "
           "usage for large models.";
    }
  }
}

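// Saves the device id and device target for this session and records the MindSpore version.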
void Debugger::Init(const uint32_t device_id, const std::string device_target) {
  // access lock for public method
  std::lock_guard<std::mutex> a_lock(access_lock_);
  // save device_id
  MS_LOG(INFO) << "Debugger got device_id: " << device_id;
  device_id_ = device_id;
  MS_LOG(INFO) << "Debugger got device_target: " << device_target;
  device_target_ = device_target;
  version_ = MSVERSION;
}

bool IsTypeDebuggerSupported(TypeId type) {
  if (type < TypeId::kNumberTypeEnd && type > TypeId::kNumberTypeBegin && type != kNumberTypeComplex64) {
    return true;
  }
  MS_LOG(INFO) << "Debugger does not support type: " << TypeIdLabel(type);
  return false;
}

void Debugger::EnableDebugger() {
  // reset some of the class members
  num_step_ = 0;
  debugger_enabled_ = false;
  enable_heartbeat_ = false;
  partial_memory_ = false;
  grpc_client_ = nullptr;
  debug_services_ = nullptr;
  heartbeat_thread_ = nullptr;
  enable_debugger_called_ = true;

  // see if dump using debugger backend is enabled
  bool dump_enabled = CheckDebuggerDumpEnabled();
  MS_LOG(INFO) << "dump using debugger backend = " << dump_enabled;

  // check if debugger enabled
  debugger_enabled_ = CheckDebuggerEnabled();
  MS_LOG(INFO) << "debugger_enabled_ = " << debugger_enabled_;

  if (!debugger_enabled_ && !dump_enabled) {
    MS_LOG(INFO) << "Not enabling debugger. Set environment variable ENABLE_MS_DEBUGGER=1 to enable debugger.";
    return;
  }

  if (debugger_enabled_) {
    // configure grpc host
    std::string env_host_str = common::GetEnv("MS_DEBUGGER_HOST");
    std::string host;
    if (!env_host_str.empty()) {
      if (CheckIp(env_host_str)) {
        MS_LOG(INFO) << "Getenv MS_DEBUGGER_HOST: " << env_host_str;
        host = env_host_str;
      } else {
        debugger_enabled_ = false;
        MS_EXCEPTION(ValueError) << "Environment variable MS_DEBUGGER_HOST isn't a valid IP address. "
                                    "Please set environment variable MS_DEBUGGER_HOST=x.x.x.x to a valid IP";
      }
    } else {
      MS_LOG(INFO) << "Environment variable MS_DEBUGGER_HOST doesn't exist. Using default debugger host: localhost";
      host = "localhost";
    }
    // configure grpc port
    std::string env_port_str = common::GetEnv("MS_DEBUGGER_PORT");
    std::string port;
    if (!env_port_str.empty()) {
      if (CheckPort(env_port_str)) {
        MS_LOG(INFO) << "Getenv MS_DEBUGGER_PORT: " << env_port_str;
        port = env_port_str;
      } else {
        debugger_enabled_ = false;
        MS_EXCEPTION(ValueError) << "Environment variable MS_DEBUGGER_PORT is not valid. Please set a custom "
                                    "port in the range 1 to 65535";
      }
    } else {
      port = "50051";
      if (!CheckPort(port)) {
        MS_EXCEPTION(ValueError) << "Default MS_DEBUGGER_PORT is not valid. Please set a custom port in the "
                                    "range 1 to 65535";
      }
      MS_LOG(INFO) << "Environment variable MS_DEBUGGER_PORT doesn't exist. Using default debugger port: 50051";
    }
    // initialize grpc client
    grpc_client_ = std::make_unique<GrpcClient>(host, port);
    // initialize sending heartbeat
    heartbeat_thread_ = std::make_unique<std::thread>([this]() { SendHeartbeat(heartbeat_period_second); });
  }
  debug_services_ = std::make_unique<DebugServices>();
}

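// The online debugger is configured entirely through environment variables. An
// illustrative shell setup (the host/port values shown are examples; only the port
// default of 50051 and host default of localhost come from the code above):
//   export ENABLE_MS_DEBUGGER=1        # turn the online debugger on
//   export MS_DEBUGGER_HOST=127.0.0.1  # MindInsight server IP; defaults to localhost
//   export MS_DEBUGGER_PORT=50051      # port in [1, 65535]; defaults to 50051
//   export MS_DEBUGGER_PARTIAL_MEM=1   # optional: enable partial memory reuse
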
void Debugger::CheckDatasetSinkMode(const KernelGraphPtr &graph_ptr) {
  bool sink_mode =
    ConfigManager::GetInstance().dataset_mode() == DatasetMode::DS_SINK_MODE || graph_ptr->IsDatasetGraph();
  if (CheckDebuggerDumpEnabled() && sink_mode && device_target_ == kGPUDevice) {
    MS_EXCEPTION(NotSupportError)
      << "e2e_dump is not supported on GPU with dataset_sink_mode=True. Please set dataset_sink_mode=False";
  }

  if (CheckDebuggerEnabled() && sink_mode) {
    MS_EXCEPTION(NotSupportError)
      << "Debugger is not supported with dataset_sink_mode=True. Please set dataset_sink_mode=False";
  }
}

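// Returns true if dump through the debugger backend is enabled for the current target:
// e2e dump on GPU; e2e or async dump on Ascend. Dump is never enabled on CPU here.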
bool Debugger::CheckDebuggerDumpEnabled() const {
  // see if dump is enabled
  auto &dump_json_parser = DumpJsonParser::GetInstance();
  if (device_target_ == kGPUDevice) {
    return dump_json_parser.e2e_dump_enabled();
  } else if (device_target_ == kAscendDevice) {
    return dump_json_parser.async_dump_enabled() || dump_json_parser.e2e_dump_enabled();
  }
  return false;
}

bool Debugger::CheckDebuggerEnabled() const {
  // get env variables to configure debugger
  std::string env_enable_str = common::GetEnv("ENABLE_MS_DEBUGGER");
  if (!env_enable_str.empty()) {
    (void)std::transform(env_enable_str.begin(), env_enable_str.end(), env_enable_str.begin(), ::tolower);
    if ((env_enable_str == "1" || env_enable_str == "true") && device_target_ != kCPUDevice) {
      return true;
    }
  }
  return false;
}

void Debugger::CheckDebuggerEnabledParam() const {
  // check the value of env variable ENABLE_MS_DEBUGGER
  std::string env_enable_str = common::GetEnv("ENABLE_MS_DEBUGGER");
  if (!env_enable_str.empty()) {
    (void)std::transform(env_enable_str.begin(), env_enable_str.end(), env_enable_str.begin(), ::tolower);
    if (env_enable_str != "0" && env_enable_str != "1" && env_enable_str != "false" && env_enable_str != "true") {
      MS_LOG(WARNING) << "Env variable ENABLE_MS_DEBUGGER should be True/False/1/0 (case insensitive), but got: "
                      << env_enable_str;
    }
  }
}

bool Debugger::CheckDebuggerPartialMemoryEnabled() const {
  std::string env_partial_mem_str = common::GetEnv("MS_DEBUGGER_PARTIAL_MEM");
  if (!env_partial_mem_str.empty()) {
    MS_LOG(INFO) << "Getenv MS_DEBUGGER_PARTIAL_MEM: " << env_partial_mem_str;
    if (env_partial_mem_str == "1") {
      return true;
    }
  }
  return false;
}

/*
 * Feature group: Dump, Online debugger.
 * Target device group: Ascend, GPU.
 * Runtime category: Old runtime, MindRT.
 * Description: Returns true if online debugger or dump is enabled.
 */
bool Debugger::DebuggerBackendEnabled() const { return CheckDebuggerDumpEnabled() || CheckDebuggerEnabled(); }

void Debugger::Reset() {
  // access lock for public method
  std::lock_guard<std::mutex> a_lock(access_lock_);
  // reset components
  if (heartbeat_thread_ && heartbeat_thread_->joinable()) {
    SetEnableHeartbeat(false);
    heartbeat_thread_->join();
    MS_LOG(INFO) << "Join Heartbeat thread.";
  }
  heartbeat_thread_ = nullptr;
  device_id_ = 0;
  device_target_ = "";
  num_step_ = 0;
  debugger_enabled_ = false;
  is_dataset_graph_ = false;
  partial_memory_ = false;
  graph_ptr_ = nullptr;
  grpc_client_ = nullptr;
  debug_services_ = nullptr;
  graph_proto_list_.clear();
  graph_ptr_list_.clear();
  graph_ptr_step_vec_.clear();
  executed_graph_ptr_set_.clear();
  parameters_mindRT_.clear();
  visited_root_graph_ids_.clear();
  MS_LOG(INFO) << "Release Debugger resource.";
}

/*
 * Feature group: Dump, Online debugger.
 * Target device group: Ascend, GPU.
 * Runtime category: MindRT.
 * Description: Sets root_graph_id for all the graphs in the compiled graph list. Sets cur_root_graph_id_ and
 * prev_root_graph_id_ and calls PreExecute function for all the graphs.
 */
void Debugger::PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs,
                                       const std::vector<AnfNodePtr> &origin_parameters_order) {
  // MindRTBackend for GPU and Ascend
  if (device_target_ == kCPUDevice) {
    return;
  }
  // Store graphs that are run in one step.
  graph_ptr_step_vec_ = graphs;
  parameters_mindRT_ = origin_parameters_order;
  prev_root_graph_id_ = cur_root_graph_id_;
  // set first run graph as the root graph
  cur_root_graph_id_ = graph_ptr_step_vec_[0]->graph_id();
  MS_LOG(DEBUG) << "Current root graph id: " << cur_root_graph_id_ << " prev_root_graph_id_: " << prev_root_graph_id_
                << " for step: " << num_step_ << ".";
  MS_LOG(DEBUG) << "Set root graph for all the subgraphs:";
  for (size_t graph_index = 0; graph_index < graphs.size(); ++graph_index) {
    const auto &graph = graphs[graph_index];
    // set root graph id for GPU mindrt runtime.
    MS_LOG(INFO) << "Set root graph for graph: " << graph->graph_id() << " to: " << cur_root_graph_id_ << ".";
    graph->set_root_graph_id(cur_root_graph_id_);
    if (debugger_) {
      debugger_->PreExecute(graph);
    }
  }
}

/*
 * Feature group: Dump.
 * Target device group: Ascend.
 * Runtime category: Old runtime, MindRT.
 * Description: When async dump is enabled and dataset_sink_mode is true, graph_iter_num_map_ stores the number of
 * iterations per epoch for each running graph.
 */
void Debugger::UpdateGraphIterMap(uint32_t graph_id, int32_t iter_num) {
  if (graph_iter_num_map_.find(graph_id) == graph_iter_num_map_.end()) {
    graph_iter_num_map_[graph_id] = iter_num;
  }
}

/*
 * Feature group: Dump, Online debugger.
 * Target device group: Ascend.
 * Runtime category: Old runtime.
 * Description: For Ascend old runtime, this function sets the current and previous root graph id.
 */
void Debugger::SetCurrentAndPrevRootGraph(uint32_t root_graph_id) {
  // for GPU and Ascend MindRT, root graphs are set in PreExecuteGraphDebugger.
  if (device_target_ != kAscendDevice || MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
    return;
  }
  prev_root_graph_id_ = cur_root_graph_id_;
  cur_root_graph_id_ = root_graph_id;
  MS_LOG(DEBUG) << "Current root graph id: " << cur_root_graph_id_ << " prev_root_graph_id_: " << prev_root_graph_id_
                << " for step: " << num_step_ << ".";
}

/*
 * Feature group: Dump, Online debugger.
 * Target device group: GPU.
 * Runtime category: Old runtime.
 * Description: In the case of GPU old runtime and when we have multiple subgraphs, we use the first run graph id to
 * update the step number.
 */
void Debugger::StoreRunGraphIdList(uint32_t graph_id) {
  // collect rungraph_ids to update the step number in the multigraph case for GPU old runtime;
  // only append graph_id if it has not been recorded yet (the duplicate check must run on the
  // non-empty list as well, otherwise every run would append a duplicate entry)
  if (std::find(rungraph_id_list_.begin(), rungraph_id_list_.end(), graph_id) == rungraph_id_list_.end()) {
    rungraph_id_list_.push_back(graph_id);
  }
}

/*
 * Feature group: Dump, Online debugger.
 * Target device group: Ascend, GPU.
 * Runtime category: Old runtime, MindRT.
 * Description: Sets previous and current root_graph_id for Ascend old runtime, sends graphs to online debugger when
 * debugger_enabled_ is true.
 */
void Debugger::PreExecute(const KernelGraphPtr &graph_ptr) {
  MS_EXCEPTION_IF_NULL(graph_ptr);
  // access lock for public method
  std::lock_guard<std::mutex> a_lock(access_lock_);
  if (!MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
    // Checking dataset_sink_mode for mindRT is done in debug_actor
    CheckDatasetSinkMode(graph_ptr);
  }
  auto graph_id = graph_ptr->graph_id();
  MS_LOG(DEBUG) << "PreExecute for graph: " << graph_id << " in step: " << num_step_ << ".";
  StoreRunGraphIdList(graph_id);
  SetCurrentAndPrevRootGraph(graph_ptr->root_graph_id());
  // multiple graphs
  if (graph_proto_list_.size() > 1) {
    // more than one graph is not a dataset graph
    if (not_dataset_graph_sum_ > 0) {
      SendMultiGraphsAndClear(graph_ptr);
    }
  } else if (graph_proto_list_.size() == 1) {
    // single graph, and not the initial step
    if (device_target_ == kGPUDevice && !MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT) &&
        num_step_ != 0) {
      if (debugger_enabled_ && !(run_level_ == "node" && suspended_at_last_kernel_)) {
        CommandLoop();
      }
      debug_services_->ResetLoadedTensors();
    }
    // In the single graph case, reset graph_ptr_ to nullptr when the debugger receives a new graph
    if (received_new_graph_) {
      graph_ptr_ = nullptr;
      CheckGraphPtr(graph_ptr);
    }
  } else if (debugger_enabled_ && graph_id == rungraph_id_list_.front() && device_target_ == kGPUDevice &&
             !MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
    // Multiple graphs, and not the initial step:
    // stop only when receiving the first sub run graph for each step for old runtime;
    // if we have stopped for the last kernel before, no need to stop again
    if (Common::GetDebugTerminate()) {
      return;
    }
    if (!(run_level_ == "node" && suspended_at_last_kernel_)) {
      CommandLoop();
    }
    debug_services_->ResetLoadedTensors();
  }
  // resets for the new graph
  suspended_at_last_kernel_ = false;
}

/*
 * Feature group: Online debugger.
 * Target device group: Ascend, GPU.
 * Runtime category: Old runtime, MindRT.
 * Description: Sends all the subgraphs to online debugger when debugger_enabled_ is true.
 */
void Debugger::SendMultiGraphsAndClear(const KernelGraphPtr &graph_ptr) {
  // only try to enable debugger if they are not all dataset graphs
  if (!enable_debugger_called_) {
    EnableDebugger();
  }
  if (debugger_enabled_) {
    // only send compiled graphs once at the initial step.
    auto dbg_graph_ptr = graph_ptr_;
    // use current graph ptr to load parameters
    graph_ptr_ = graph_ptr;
    LoadParametersAndConst();
    // revert graph ptr to original value
    graph_ptr_ = dbg_graph_ptr;

    SendMultiGraphsAndSuspend(graph_proto_list_);

    graph_proto_list_.clear();
    received_new_graph_ = false;
  }
}

/*
 * Feature group: Dump.
 * Target device group: Ascend, GPU.
 * Runtime category: MindRT.
 * Description: Returns the rank_id for GPU and Ascend kernel-by-kernel mindRT.
 */
uint32_t Debugger::GetRankID() {
  uint32_t rank_id = GetRankId();
  return rank_id;
}

/*
 * Feature group: Dump.
 * Target device group: Ascend, GPU.
 * Runtime category: MindRT.
 * Description: When dump is enabled, this function: 1) Dumps parameters for the current root_graph_id to the
 * root_graph's directory. 2) Dumps constant data once for each graph. 3) Dumps graph run history for each graph.
 */
void Debugger::DumpParamsAndConstAndHistory() {
  if (!CheckDebuggerDumpEnabled()) {
    return;
  }
  LoadParametersAllGraphs();
  E2eDump::DumpParametersData(GetRankID(), debugger_.get());
  // Whether constant data was already dumped for the current root graph.
  bool cur_root_graph_checked = std::find(visited_root_graph_ids_.begin(), visited_root_graph_ids_.end(),
                                          cur_root_graph_id_) != visited_root_graph_ids_.end();
  for (auto graph : graph_ptr_step_vec_) {
    if (!cur_root_graph_checked) {
      LoadConstsForGraph(graph);
      // Dump constant data for GPU.
      E2eDump::DumpConstantData(graph.get(), GetRankID(), debugger_.get());
      // Dump constant data for Ascend.
      DumpConstantDataAscend(graph);
    }
  }
  for (auto kernel_graph = executed_graph_ptr_set_.cbegin(); kernel_graph != executed_graph_ptr_set_.cend();
       ++kernel_graph) {
    auto debugger = Debugger::GetInstance();
    MS_EXCEPTION_IF_NULL(debugger);
    // Dump graph run history for each graph.
    if (debugger->GetAscendKernelByKernelFlag() && (*kernel_graph)->graph_id() != (*kernel_graph)->root_graph_id()) {
      MS_LOG(INFO) << "Current graph with graph_id = " << (*kernel_graph)->graph_id() << " is not a root graph.";
    } else {
      E2eDump::DumpRunIter(*kernel_graph, GetRankID());
    }
  }
  if (!cur_root_graph_checked) {
    visited_root_graph_ids_.push_back(cur_root_graph_id_);
  }
}

void Debugger::DumpConstantDataAscend(const KernelGraphPtr &graph) {
  if (device_target_ != kAscendDevice) {
    return;
  }
  auto &json_parser = DumpJsonParser::GetInstance();
  if (json_parser.e2e_dump_enabled() || json_parser.async_dump_enabled()) {
    // Dump constant data for Ascend mindRT; for old runtime, constant data is dumped in session_basic.
    uint32_t rank_id = GetRankID();
    std::string cst_file_dir = GenerateDumpPath(graph->root_graph_id(), rank_id, true);
    DumpConstantInfo(graph, cst_file_dir);
  }
}

/*
 * Feature group: Dump.
 * Target device group: Ascend, GPU.
 * Runtime category: MindRT.
 * Description: Dumps a single node for given graph_id.
 */
void Debugger::DumpSingleNode(const CNodePtr &node, uint32_t graph_id) const {
  if (debugger_ && debugger_->DebuggerBackendEnabled()) {
    uint32_t rank_id = GetRankID();
    (void)E2eDump::DumpSingleNodeData(node, graph_id, rank_id, debugger_.get());
  }
}

/*
 * Feature group: Dump.
 * Target device group: GPU.
 * Runtime category: MindRT.
 * Description: This function is used for the new GPU runtime using MindRTBackend; on the Ascend platform, graphs
 * are saved in session_basic.
 */
void Debugger::DumpInGraphCompiler(const KernelGraphPtr &kernel_graph) {
  if (device_target_ == kAscendDevice) {
    return;
  }
  auto &json_parser = DumpJsonParser::GetInstance();
  if (json_parser.e2e_dump_enabled()) {
    uint32_t rank_id = GetRankID();
    kernel_graph->set_root_graph_id(kernel_graph->graph_id());
    std::string final_graph = "trace_code_graph_" + std::to_string(kernel_graph->graph_id());
    std::string root_dir = json_parser.path() + "/rank_" + std::to_string(rank_id);
    std::string target_dir = root_dir + "/graphs";
    std::string ir_file_path = target_dir + "/" + "ms_output_" + final_graph + ".ir";
    DumpIRProtoWithSrcInfo(kernel_graph, final_graph, target_dir, kDebugWholeStack);
    DumpIR("trace_code_graph", kernel_graph, true, kWholeStack, ir_file_path);
    DumpGraphExeOrder("ms_execution_order_graph_" + std::to_string(kernel_graph->graph_id()) + ".csv", root_dir,
                      kernel_graph->execution_order());
  }
}

/*
 * Feature group: Dump, Online debugger.
 * Target device group: GPU and CPU.
 * Runtime category: MindRT.
 * Description: Load and dump parameters and constant data, call postExecute and update dump iter.
 */
void Debugger::PostExecuteGraphDebugger() {
  if (device_target_ == kAscendDevice) {
    MS_LOG(DEBUG) << "On Ascend, parameters and constant data are not dumped here.";
    return;
  }
  // On CPU, update the dump iteration; parameters and consts are not dumped here
  if (device_target_ == kCPUDevice) {
    DumpJsonParser::GetInstance().UpdateDumpIter();
    return;
  }
  DumpParamsAndConstAndHistory();
  // debug used for dump
  if (CheckDebuggerDumpEnabled() && !debugger_enabled()) {
    ClearCurrentData();
  }
  if (debugger_) {
    debugger_->PostExecute();
  }
  E2eDump::UpdateIterMindRTDump();
  executed_graph_ptr_set_.clear();
}

/*
 * Feature group: Online debugger.
 * Target device group: Ascend, GPU.
 * Runtime category: Old runtime, MindRT.
 * Description: Send hit watchpoints, update the step number and reset loaded tensors.
 */
void Debugger::PostExecute() {
  // access lock for public method
  std::lock_guard<std::mutex> a_lock(access_lock_);
  if (Common::GetDebugTerminate()) {
    return;
  }
  if (debugger_ && debugger_->DebuggerBackendEnabled()) {
    // analyze tensor data and send the watchpoints that have been hit
    if (debugger_enabled_ && !is_dataset_graph_) {
      SendWatchpoints(CheckWatchpoints());
      // no need to suspend at each graph for GPU old runtime, suspension happens in preExecute
      if (device_target_ == kAscendDevice) {
        CommandLoop();
      } else if (device_target_ == kGPUDevice && MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
        if (!(run_level_ == "node" && suspended_at_last_kernel_)) {
          CommandLoop();
        }
      }
      if (device_target_ != kGPUDevice) {
        num_step_++;
      }
    }
    // Only keep parameters in the current map
    // GPU ResetLoadedTensors for old runtime happens in preExecute
    if ((device_target_ == kGPUDevice && MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) ||
        device_target_ == kAscendDevice) {
      if (debug_services_ != nullptr) {
        debug_services_->ResetLoadedTensors();
      } else {
        MS_LOG(DEBUG) << "debug_services_ is nullptr";
      }
    }
  }
}

bool Debugger::ReadNodeDataRequired(const CNodePtr &kernel) const {
  if (debugger_enabled_ && !is_dataset_graph_) {
    auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_, kernel);
    // if the node has a watchpoint on it, or is a next_to or continue_to node, read the kernel tensor data
    if (is_watchpoint || (run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_))) {
      return true;
    }
  }
  return false;
}

/*
 * Feature group: Online debugger.
 * Target device group: GPU.
 * Runtime category: Old runtime, MindRT.
 * Description: Check and send watchpoint hit for a single node, suspend if a watchpoint is hit or we are continuing
 * in node level.
 */
void Debugger::PostExecuteNode(const CNodePtr &kernel, bool last_kernel) {
  // access lock for public method
  std::lock_guard<std::mutex> a_lock(access_lock_);
  if (Common::GetDebugTerminate()) {
    return;
  }
  if (debugger_enabled_ && !is_dataset_graph_) {
    auto is_watchpoint = debug_services_->IsWatchPoint(cur_name_, kernel);

    // if the kernel is a watchpoint and gets hit, suspend.
    bool hit_empty_flag = true;
    if (is_watchpoint) {
      auto hits = CheckWatchpoints(cur_name_, kernel);
      if (!hits.empty()) {
        SendWatchpoints(hits);
        CommandLoop();

        hit_empty_flag = false;
      }
    }
    if (hit_empty_flag && run_level_ == "node" && (node_name_ == "" || node_name_ == cur_name_)) {
      // if the kernel is not a watchpoint but is a next_to or continue_to node, suspend;
      // sets a bool to be checked in preExecute to avoid double stopping at the last kernel in the last graph
      if (last_kernel) {
        suspended_at_last_kernel_ = true;
      }
      CommandLoop();
    }
    return;
  }
}

/*
 * Feature group: Dump, Online debugger.
 * Target device group: Ascend, GPU.
 * Runtime category: Old runtime, MindRT.
 * Description: Get graph proto and add it to graph proto list and add loaded graph pointers to a list.
 */
void Debugger::LoadGraphs(const KernelGraphPtr &graph_ptr) {
  MS_EXCEPTION_IF_NULL(graph_ptr);
  if (graph_ptr_ != graph_ptr) {
    MS_LOG(INFO) << "LoadGraphs Debugger got new graph: " << graph_ptr->graph_id();
    received_new_graph_ = true;
    // save new graph_ptr
    graph_ptr_ = graph_ptr;
    CheckDatasetGraph();
    if (!is_dataset_graph_) {
      // get proto for new graph_ptr
      auto graph_proto = GetGraphProto(graph_ptr);
      // add new graph proto to graph_proto_list_
      graph_proto_list_.push_back(graph_proto);
      graph_ptr_list_.push_back(graph_ptr);
      not_dataset_graph_sum_++;
    }
    // reset is_dataset_graph to be false
    is_dataset_graph_ = false;
  }
}

// In single graph cases, check single graph ptr
void Debugger::CheckGraphPtr(const KernelGraphPtr &graph_ptr) {
  MS_EXCEPTION_IF_NULL(graph_ptr);
  if (graph_ptr_ != graph_ptr) {
    MS_LOG(INFO) << "CheckGraphPtr Debugger got new graph: " << graph_ptr->graph_id();
    // save new graph_ptr
    graph_ptr_ = graph_ptr;
    if (!is_dataset_graph_) {
      // only try to enable debugger if it is not a dataset graph
      if (!enable_debugger_called_) {
        EnableDebugger();
      }
      if (debugger_enabled_) {
        LoadParametersAndConst();
        // get graph proto and send to MindInsight
        auto graph_proto = graph_proto_list_.front();
        SendGraphAndSuspend(graph_proto);
        graph_proto_list_.clear();
        received_new_graph_ = false;
      }
    }
  }
}

void Debugger::CheckDatasetGraph() {
  // print parameter node names
  MS_EXCEPTION_IF_NULL(graph_ptr_);
  const auto &params = graph_ptr_->inputs();
  for (const auto &param : params) {
    MS_LOG(INFO) << "param: " << GetKernelNodeName(param);
  }
  // check if there is a GetNext or InitDataSetQueue node
  const auto &nodes = graph_ptr_->execution_order();
  for (const auto &node : nodes) {
    auto node_name = common::AnfAlgo::GetCNodeName(node);
    MS_LOG(INFO) << "node: " << GetKernelNodeName(node);
    if (node_name == "GetNext" || node_name == "InitDataSetQueue") {
      MS_LOG(INFO) << "Not enabling debugger for graph " << graph_ptr_->graph_id() << ": found dataset graph node "
                   << node_name;
      is_dataset_graph_ = true;
      return;
    }
  }
  is_dataset_graph_ = false;
}

GraphProto Debugger::GetGraphProto(const KernelGraphPtr &graph_ptr) const {
  // convert kernel graph to debugger modelproto
  ModelProto model = GetDebuggerFuncGraphProto(graph_ptr);
  return model.graph();
}

/*
 * Feature group: Online debugger.
 * Target device group: Ascend, GPU.
 * Runtime category: Old runtime, MindRT.
 * Description: Send debugger backend heartbeat to online debugger every few seconds.
 */
void Debugger::SendHeartbeat(int32_t period) {
  int num_heartbeat_fail = 0;
  const int max_num_heartbeat_fail = 5;
  const int retry_milliseconds = 500;

  Heartbeat heartbeat;
  heartbeat.set_message("Debugger is alive");
  heartbeat.set_period(heartbeat_period_second);

  SetEnableHeartbeat(CheckDebuggerEnabled());
  while (enable_heartbeat_) {
    MS_EXCEPTION_IF_NULL(grpc_client_);
    EventReply reply = grpc_client_->SendHeartbeat(heartbeat);
    if (reply.status() != EventReply::OK) {
      MS_LOG(ERROR) << "Error: SendHeartbeat failed";
      num_heartbeat_fail++;
      if (num_heartbeat_fail >= max_num_heartbeat_fail) {
        MS_LOG(ERROR) << "Maximum number of failures for SendHeartbeat reached: exiting training session.";
        SetEnableHeartbeat(false);
        break;
      } else {
        MS_LOG(ERROR) << "Number of consecutive SendHeartbeat failures: " << num_heartbeat_fail;
        std::this_thread::sleep_for(std::chrono::milliseconds(retry_milliseconds));
      }
    } else {
      int recheck_period_ms = 200;
      for (int i = 0; i < (period * 1000 / recheck_period_ms); i++) {
        if (enable_heartbeat_) {
          std::this_thread::sleep_for(std::chrono::milliseconds(recheck_period_ms));
        } else {
          break;
        }
      }
    }
  }
}

void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) {
  if (!CheckSendMetadata()) {
    return;
  }
  // send graph to MindInsight server
  MS_EXCEPTION_IF_NULL(grpc_client_);
  EventReply reply = grpc_client_->SendGraph(graph_proto);
  if (reply.status() != EventReply::OK) {
    MS_LOG(ERROR) << "Error: SendGraph failed";
  }
  // enter command loop, wait and process commands
  CommandLoop();
}

bool Debugger::SendMetadata(bool version_check) {
  // prepare metadata
  MS_EXCEPTION_IF_NULL(graph_ptr_);
  std::string device_name = std::to_string(device_id_) + ":" + std::to_string(graph_ptr_->graph_id());
  Metadata metadata;
  metadata.set_device_name(device_name);
  metadata.set_cur_step(num_step_);
  metadata.set_backend(device_target_);
  metadata.set_cur_node(cur_name_);
  metadata.set_training_done(training_done_);
  metadata.set_ms_version(version_);
  MS_LOG(INFO) << "Is training done? " << training_done_;
  // set graph number to not_dataset_graph_sum_
  metadata.set_graph_num(not_dataset_graph_sum_);

  MS_EXCEPTION_IF_NULL(grpc_client_);
  EventReply reply_metadata = grpc_client_->SendMetadata(metadata);

  bool ret = false;
  if (reply_metadata.status() == EventReply::OK) {
    if (version_check) {
      // get the type of the command in the metadata reply; it should be version matched
      DebuggerCommand cmd = GetCommand(reply_metadata);
      if (cmd != DebuggerCommand::kVersionMatchedCMD) {
        MS_LOG(ERROR) << "MindInsight version is too old, MindSpore version is " << version_;
        Exit();
      } else {
        if (GetMiVersionMatched(reply_metadata)) {
          MS_LOG(INFO) << "MindSpore version is " << version_ << " and matches the MindInsight version.";
          ret = true;
        } else {
          MS_LOG(ERROR) << "MindSpore version " << version_ << " did not match the MindInsight version.";
          CommandLoop();
        }
      }
    } else {
      // version check was done before, so we can just return true here
      ret = true;
    }
  } else {
    MS_LOG(ERROR) << "Error: SendMetadata failed";
  }

  return ret;
}

void Debugger::SendMultiGraphsAndSuspend(const std::list<GraphProto> &graph_proto_list) {
  if (!CheckSendMetadata()) {
    return;
  }
  MS_EXCEPTION_IF_NULL(grpc_client_);
  // send multiple graphs to the MindInsight server;
  // split a graph into chunks if it is larger than the chunk size
  std::list<Chunk> chunked_graph_proto_list;
  Chunk chunk;
  for (auto graph : graph_proto_list) {
    std::string str = graph.SerializeAsString();
    auto graph_size = graph.ByteSize();
    if (graph_size > g_chunk_size) {
      auto sub_graph_str = grpc_client_->ChunkString(str, graph_size);

      for (unsigned int i = 0; i < sub_graph_str.size(); i++) {
        chunk.set_buffer(sub_graph_str[i]);
        if (i < sub_graph_str.size() - 1) {
          chunk.set_finished(false);
        } else {
          chunk.set_finished(true);
        }
        chunked_graph_proto_list.push_back(chunk);
      }
    } else {
      chunk.set_buffer(str);
      chunk.set_finished(true);
      chunked_graph_proto_list.push_back(chunk);
    }
  }
  EventReply reply = grpc_client_->SendMultiGraphs(chunked_graph_proto_list);
  if (reply.status() != EventReply::OK) {
    MS_LOG(ERROR) << "Error: SendMultiGraphs failed";
  }
  // enter command loop, wait and process commands
  CommandLoop();
}

bool Debugger::CheckSendMetadata() {
  if (!send_metadata_done_) {
    if (!SendMetadata(true)) {
      return false;
    }
    send_metadata_done_ = true;
  }
  return true;
}

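// Blocks on WaitForCommand and dispatches replies from MindInsight (run / set / view /
// exit / version-matched) until a command that resumes execution arrives or the session
// is terminated after repeated connection failures.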
void Debugger::CommandLoop() {
  // prepare metadata
  MS_EXCEPTION_IF_NULL(graph_ptr_);
  std::string device_name = std::to_string(device_id_) + ":" + std::to_string(cur_root_graph_id_);
  Metadata metadata;

  metadata.set_device_name(device_name);
  metadata.set_cur_step(num_step_);
  metadata.set_backend(device_target_);
  metadata.set_cur_node(cur_name_);
  metadata.set_training_done(training_done_);

  // loop exit flag
  bool run = false;
  int num_wait_fail = 0;
  const int max_num_wait_fail = 5;

  while (!run) {
    // wait for command
    MS_EXCEPTION_IF_NULL(grpc_client_);
    EventReply reply = grpc_client_->WaitForCommand(metadata);
    if (reply.status() != EventReply::OK) {
      MS_LOG(ERROR) << "Error: WaitForCommand failed";
      num_wait_fail++;
      if (num_wait_fail > max_num_wait_fail) {
        MS_LOG(ERROR) << "Maximum number of WaitForCommand retries reached: exiting training session.";
        MS_LOG(ERROR) << "Failed to connect to MindInsight debugger server. Please check the config "
                         "of debugger host and port.";
        Exit();
        run = true;
      } else {
        MS_LOG(ERROR) << "Number of consecutive WaitForCommand failures: " << num_wait_fail << "; Retry after "
                      << num_wait_fail << "s";
        std::this_thread::sleep_for(std::chrono::seconds(num_wait_fail));
      }
      continue;
    }

    // get the type of the command in the reply
    DebuggerCommand cmd = GetCommand(reply);
    if (cmd == DebuggerCommand::kUnknownCMD) {
      MS_LOG(DEBUG) << "Debug: debugger received unknown command";
      continue;
    }

    MS_LOG(INFO) << "received command: ";
    switch (cmd) {
      case DebuggerCommand::kUnknownCMD:
        MS_LOG(INFO) << "UnknownCMD";
        break;
      case DebuggerCommand::kExitCMD:
        MS_LOG(INFO) << "ExitCMD";
        Exit(true);
        // Used for debugger termination
        run = true;
        break;
      case DebuggerCommand::kRunCMD:
        ProcessRunCMD(reply);
        if (GetRunLevel(reply) != "recheck") {
          // exit loop
          run = true;
        }
        break;
      case DebuggerCommand::kSetCMD:
        ProcessKSetCMD(reply);
        break;
      case DebuggerCommand::kViewCMD:
        ProcessKViewCMD(reply);
        break;
      case DebuggerCommand::kVersionMatchedCMD:
        MS_LOG(ERROR) << "Received unexpected Version Matched CMD from MindInsight.";
        Exit();
        break;
      default:
        MS_LOG(ERROR) << "Received unknown CMD from MindInsight";
        Exit();
        break;
    }
  }
}

void Debugger::ProcessRunCMD(const EventReply &reply) {
  MS_LOG(INFO) << "RunCMD";
  if (GetRunLevel(reply) == "recheck") {
    MS_LOG(INFO) << "rechecking all watchpoints";
    SendWatchpoints(CheckWatchpoints("", nullptr, true));
  } else {
    // no longer the initial suspension.
    initial_suspend_ = false;
    // print run cmd content
    // get run_level and node_name
    run_level_ = GetRunLevel(reply);
    node_name_ = GetNodeName(reply);

    MS_LOG(INFO) << "run_level: " << run_level_;
    MS_LOG(INFO) << "node_name_: " << node_name_;
  }
}

void Debugger::ProcessKSetCMD(const EventReply &reply) {
  MS_LOG(INFO) << "SetCMD";
  MS_LOG(INFO) << "id: " << GetWatchpointID(reply);
  MS_LOG(INFO) << "delete: " << GetWatchpointDelete(reply);
  if (GetWatchpointDelete(reply)) {
    MS_LOG(INFO) << "Deleting watchpoint";
    RemoveWatchpoint(GetWatchpointID(reply));
  } else {
    MS_LOG(INFO) << "Setting watchpoint";
    MS_LOG(INFO) << "condition: " << GetWatchcondition(reply).condition();
    ProtoVector<WatchNode> received_nodes = GetWatchnodes(reply);
    for (const auto &node : received_nodes) {
      MS_LOG(INFO) << "node name: " << node.node_name();
      MS_LOG(INFO) << "node type: " << node.node_type();
    }
    ProtoVector<WatchCondition_Parameter> parameters = GetParameters(reply);
    for (const auto &parameter : parameters) {
      MS_LOG(INFO) << "parameter name: " << parameter.name();
      MS_LOG(INFO) << "parameter is disabled: " << parameter.disabled();
      MS_LOG(INFO) << "parameter value: " << parameter.value();
    }
    SetWatchpoint(GetWatchnodes(reply), GetWatchcondition(reply), GetWatchpointID(reply), GetParameters(reply));
  }
}

void Debugger::ProcessKViewCMD(const EventReply &reply) {
  MS_LOG(INFO) << "ViewCMD";
  // print view cmd content
  ProtoVector<TensorProto> received_tensors = GetTensors(reply);
  for (auto received_tensor : received_tensors) {
    MS_LOG(INFO) << "tensor node name: " << received_tensor.node_name();
    MS_LOG(INFO) << "tensor slot: " << received_tensor.slot();
    MS_LOG(INFO) << "tensor finished: " << std::boolalpha << received_tensor.finished() << std::noboolalpha;
    MS_LOG(INFO) << "tensor iter: " << received_tensor.iter();
    MS_LOG(INFO) << "tensor truncate: " << std::boolalpha << received_tensor.truncate() << std::noboolalpha;
  }

  switch (reply.view_cmd().level()) {
    case debugger::ViewCMD_Level::ViewCMD_Level_base:
      MS_LOG(INFO) << "Tensor base request.";
      ViewBaseLevel(reply);
      break;

    case debugger::ViewCMD_Level::ViewCMD_Level_statistics:
      MS_LOG(INFO) << "Tensor statistics request.";
      ViewStatLevel(reply);
      break;

    case debugger::ViewCMD_Level::ViewCMD_Level_value:
      MS_LOG(INFO) << "Tensor value request.";
      ViewValueLevel(reply);
      break;
    default:
      MS_LOG(DEBUG) << "Debug: Unknown tensor info level";
      break;
  }
}

void Debugger::ViewValueLevel(const EventReply &reply) {
  MS_LOG(INFO) << "Sending tensors";
  std::list<TensorProto> tensors = LoadTensors(GetTensors(reply));
  // print view cmd reply
  for (auto tensor = tensors.cbegin(); tensor != tensors.cend(); ++tensor) {
    MS_LOG(INFO) << "tensor node name: " << tensor->node_name();
    MS_LOG(INFO) << "tensor slot: " << tensor->slot();
    MS_LOG(INFO) << "tensor finished: " << std::boolalpha << tensor->finished() << std::noboolalpha;
    MS_LOG(INFO) << "tensor iter: " << tensor->iter();
    MS_LOG(INFO) << "tensor truncate: " << std::boolalpha << tensor->truncate() << std::noboolalpha;
    MS_LOG(INFO) << "tensor dims: ";
    for (auto dim = tensor->dims().cbegin(); dim != tensor->dims().cend(); dim++) {
      MS_LOG(INFO) << *dim << ",";
    }
    MS_LOG(INFO) << "tensor dtype: " << tensor->data_type();
  }
  MS_EXCEPTION_IF_NULL(grpc_client_);
  EventReply send_tensors_reply = grpc_client_->SendTensors(tensors);
  if (send_tensors_reply.status() != debugger::EventReply::OK) {
    MS_LOG(ERROR) << "Error: SendTensors failed";
  }
}

void Debugger::ViewStatLevel(const EventReply &reply) {
  std::list<TensorSummary> tensor_stats_list = LoadTensorsStat(GetTensors(reply));
  EventReply send_tensors_stat_reply = grpc_client_->SendTensorStats(tensor_stats_list);
  if (send_tensors_stat_reply.status() != debugger::EventReply::OK) {
    MS_LOG(ERROR) << "Error: SendTensorStats failed.";
  }
}

void Debugger::ViewBaseLevel(const EventReply &reply) {
  std::list<TensorBase> tensor_base_list = LoadTensorsBase(GetTensors(reply));
  EventReply send_tensor_base_reply = grpc_client_->SendTensorBase(tensor_base_list);
  if (send_tensor_base_reply.status() != debugger::EventReply::OK) {
    MS_LOG(ERROR) << "Error: SendTensorBase failed.";
  }
}

void AddTensorProtoInfo(TensorProto *tensor_item, const TensorProto &tensor) {
  tensor_item->set_node_name(tensor.node_name());
  tensor_item->set_slot(tensor.slot());
  tensor_item->set_iter(tensor.iter());
  tensor_item->set_truncate(tensor.truncate());
  tensor_item->clear_tensor_content();
  tensor_item->clear_data_type();
  tensor_item->clear_dims();
}

void AddTensorStatInfo(const DebugServices::TensorStat &tensor_stat,
                       std::list<TensorSummary> *const tensor_summary_list) {
  if (tensor_summary_list == nullptr) {
    MS_LOG(DEBUG) << "tensor_summary_list is nullptr.";
    return;
  }
  TensorSummary tensor_summary_item;
  TensorBase *tensor_base = tensor_summary_item.mutable_tensor_base();
  tensor_base->set_data_type(tensor_stat.dtype);
  tensor_base->set_data_size(static_cast<int64_t>(tensor_stat.data_size));
  for (auto elem : tensor_stat.shape) {
    tensor_base->add_shape(elem);
  }

  Statistics *tensor_statistics = tensor_summary_item.mutable_statistics();
  tensor_statistics->set_is_bool(tensor_stat.is_bool);
  tensor_statistics->set_max_value(static_cast<float>(tensor_stat.max_value));
  tensor_statistics->set_min_value(static_cast<float>(tensor_stat.min_value));
  tensor_statistics->set_avg_value(static_cast<float>(tensor_stat.avg_value));
  tensor_statistics->set_count(tensor_stat.count);
  tensor_statistics->set_neg_zero_count(tensor_stat.neg_zero_count);
  tensor_statistics->set_pos_zero_count(tensor_stat.pos_zero_count);
  tensor_statistics->set_nan_count(tensor_stat.nan_count);
  tensor_statistics->set_neg_inf_count(tensor_stat.neg_inf_count);
  tensor_statistics->set_pos_inf_count(tensor_stat.pos_inf_count);
  tensor_statistics->set_zero_count(tensor_stat.zero_count);

  tensor_summary_list->push_back(tensor_summary_item);
}

void Debugger::SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCondition &condition, const int32_t id,
                             const ProtoVector<WatchCondition_Parameter> &parameters) {
  std::vector<std::tuple<std::string, bool>> check_node_list;
  std::vector<DebugServices::parameter_t> parameter_list;

  std::transform(nodes.begin(), nodes.end(), std::back_inserter(check_node_list),
                 [](const WatchNode &node) -> std::tuple<std::string, bool> {
                   return make_tuple(node.node_name(), node.node_type() == "scope");
                 });

  std::transform(
    parameters.begin(), parameters.end(), std::back_inserter(parameter_list),
    [](const WatchCondition_Parameter &parameter) -> DebugServices::parameter_t {
      return DebugServices::parameter_t{parameter.name(), parameter.disabled(), parameter.value(), parameter.hit()};
    });
  debug_services_->AddWatchpoint(id, static_cast<int>(condition.condition()), condition.value(), check_node_list,
                                 parameter_list);
}

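// Watchpoint lifecycle as wired in this file: MindInsight sends a SetCMD to add or delete
// a watchpoint (ProcessKSetCMD), the backend evaluates registered watchpoints in
// CheckWatchpoints() after kernels/steps, and hits are reported back via SendWatchpoints().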
void Debugger::RemoveWatchpoint(const int32_t id) { debug_services_->RemoveWatchpoint(id); }

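// Reads the requested tensors from the TensorLoader and packs them into TensorProto
// chunks no larger than g_chunk_size; a tensor that is missing comes back as a single
// empty proto with finished == true.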
std::list<TensorProto> Debugger::LoadTensors(const ProtoVector<TensorProto> &tensors) const {
  std::vector<std::string> name;
  std::vector<std::string> ret_name;
  std::vector<const char *> data_ptr;
  std::vector<ssize_t> data_size;
  std::vector<unsigned int> dtype;
  std::vector<std::vector<int64_t>> shape;

  std::transform(tensors.begin(), tensors.end(), std::back_inserter(name), GetTensorFullName);

  // ret_name will contain tensor names that are found in TensorLoader;
  // items in ret_name will be in the same order as in tensors, if found
  debug_services_->ReadNodesTensors(name, &ret_name, &data_ptr, &data_size, &dtype, &shape);
  std::list<TensorProto> tensor_list;
  size_t result_index = 0;

  for (auto tensor : tensors) {
    ssize_t size_iter = 0;
    if (result_index >= ret_name.size() || ret_name[result_index] != GetTensorFullName(tensor)) {
      // return an empty tensor if the requested tensor was not found
      TensorProto tensor_item;
      tensor_item.set_finished(true);
      AddTensorProtoInfo(&tensor_item, tensor);
      tensor_list.push_back(tensor_item);
      continue;
    }
    ssize_t tensor_size = data_size[result_index];
    while (size_iter < tensor_size) {
      ssize_t chunk_size = g_chunk_size;
      TensorProto tensor_item;
      tensor_item.set_finished(false);
      if (tensor_size - size_iter <= g_chunk_size) {
        chunk_size = tensor_size - size_iter;
        tensor_item.set_finished(true);
      }
      AddTensorProtoInfo(&tensor_item, tensor);

      tensor_item.set_tensor_content(data_ptr[result_index] + size_iter, chunk_size);

      tensor_item.set_data_type(static_cast<debugger::DataType>(dtype[result_index]));
      for (auto &elem : shape[result_index]) {
        tensor_item.add_dims(elem);
      }
      // add tensor to result list and increment result_index to check next item in ret_name
      tensor_list.push_back(tensor_item);
      if (size_iter > INT_MAX - g_chunk_size) {
        MS_EXCEPTION(ValueError) << size_iter << " + " << g_chunk_size << " would lead to integer overflow!";
      }
      size_iter += g_chunk_size;
    }
    result_index++;
  }
  return tensor_list;
}

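// Worked example of the LoadTensors chunking above: with g_chunk_size = 3 MB, a 7 MB
// tensor is returned as three TensorProto chunks of 3 MB, 3 MB and 1 MB, and only the
// final chunk is marked finished.
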
LoadTensorsBase(const ProtoVector<TensorProto> & tensors) const1237 std::list<TensorBase> Debugger::LoadTensorsBase(const ProtoVector<TensorProto> &tensors) const {
1238   std::list<TensorBase> tensor_base_list;
1239   std::vector<std::string> name;
1240   std::transform(tensors.begin(), tensors.end(), std::back_inserter(name), GetTensorFullName);
1241   std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
1242   debug_services_->SearchNodesTensors(name, &result_list);
1243   for (auto result : result_list) {
1244     auto tensor = std::get<1>(result);
1245     if (!tensor || ((cur_root_graph_id_ != tensor->GetRootGraphId()) &&
1246                     MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT))) {
1247       // tensor was not found or tensor's graph was not executed in the current step, creating empty tensor base.
1248       TensorBase tensor_base_item;
1249       tensor_base_item.set_data_size(0);
1250       tensor_base_item.set_data_type(0);
1251       tensor_base_item.add_shape(0);
1252       tensor_base_list.push_back(tensor_base_item);
1253       continue;
1254     }
1255     // tensor was found creating tensor base object.
1256     TensorBase tensor_base_item;
1257     tensor_base_item.set_data_size(static_cast<int64_t>(tensor->GetByteSize()));
1258     tensor_base_item.set_data_type(static_cast<int32_t>(tensor->GetType()));
1259     for (auto elem : tensor->GetShape()) {
1260       tensor_base_item.add_shape(elem);
1261     }
1262     tensor_base_list.push_back(tensor_base_item);
1263   }
1264   return tensor_base_list;
1265 }
1266 
LoadTensorsStat(const ProtoVector<TensorProto> & tensors) const1267 std::list<TensorSummary> Debugger::LoadTensorsStat(const ProtoVector<TensorProto> &tensors) const {
1268   std::list<TensorSummary> tensor_summary_list;
1269   std::vector<std::string> name;
1270   std::transform(tensors.begin(), tensors.end(), std::back_inserter(name), GetTensorFullName);
1271   std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
1272   debug_services_->SearchNodesTensors(name, &result_list);
1273   for (auto result : result_list) {
1274     auto tensor = std::get<1>(result);
1275     if (!tensor || ((cur_root_graph_id_ != tensor->GetRootGraphId()) &&
1276                     MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT))) {
1277       // tensor was not found or tensor's graph was not executed in the current step, creating empty tensor summary.
1278       DebugServices::TensorStat tensor_stat;
1279       AddTensorStatInfo(tensor_stat, &tensor_summary_list);
1280       continue;
1281     }
1282     // tensor was found creating tensor summary object.
1283     DebugServices::TensorStat tensor_stat = DebugServices::GetTensorStatistics(tensor);
1284     AddTensorStatInfo(tensor_stat, &tensor_summary_list);
1285   }
1286   return tensor_summary_list;
1287 }
1288 
GetTensor(const std::string & tensor_name) const1289 std::shared_ptr<TensorData> Debugger::GetTensor(const std::string &tensor_name) const {
1290   return debug_services_->GetTensor(tensor_name);
1291 }
1292 
Exit(bool exit_success)1293 void Debugger::Exit(bool exit_success) {
1294   // debugger will notify main thread to exit because main thread can only exit at step boundary.
1295   MS_LOG(INFO) << "Exit Debugger";
1296   SetEnableHeartbeat(false);
1297   Common::DebugTerminate(true, exit_success);
1298 }

std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode, const CNodePtr &kernel,
                                                    bool recheck) {
  std::vector<std::string> name;
  std::vector<std::string> slot;
  std::vector<int> condition;
  std::vector<unsigned int> watchpoint_id;
  std::vector<std::vector<DebugServices::parameter_t>> parameters;
  std::vector<int32_t> error_codes;
  std::vector<std::shared_ptr<TensorData>> tensor_list;
  if (watchnode.empty()) {
    tensor_list = debug_services_->GetTensor();
  } else {
    tensor_list = debug_services_->GetNodeTensor(kernel);
  }
  DebugServices::ProcessedNPYFiles processed_npy_files;
  MS_LOG(INFO) << "CheckWatchpoints called for step " << num_step_;
  debug_services_->CheckWatchpoints(&name, &slot, &condition, &watchpoint_id, &parameters, &error_codes,
                                    &processed_npy_files, &tensor_list, initial_suspend_, watchnode.empty(), recheck);
  std::list<WatchpointHit> hits;
  for (unsigned int i = 0; i < name.size(); i++) {
    WatchpointHit hit;
    std::vector<DebugServices::parameter_t> &parameter = parameters[i];
    hit.set_id(watchpoint_id[i]);
    hit.set_error_code(error_codes[i]);
    // Here TensorProto acts as a tensor indicator; the tensor content itself is not sent.
    TensorProto *tensor_item = hit.mutable_tensor();
    tensor_item->set_node_name(name[i]);
    tensor_item->set_slot(slot[i]);
    tensor_item->set_finished(true);

    WatchCondition *condition_item = hit.mutable_watch_condition();
    condition_item->set_condition(debugger::WatchCondition_Condition(condition[i]));
    for (const auto &p : parameter) {
      auto x = condition_item->mutable_params()->Add();
      x->set_name(p.name);
      x->set_disabled(p.disabled);
      x->set_value(p.value);
      x->set_hit(p.hit);
      x->set_actual_value(p.actual_value);
    }
    hits.push_back(hit);
  }
  return hits;
}
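
// Output-layout note (inferred from the loop above): DebugServices::CheckWatchpoints
// fills its out-parameters as parallel vectors, so for hit i the tuple
//   (name[i], slot[i], condition[i], watchpoint_id[i], parameters[i], error_codes[i])
// describes one WatchpointHit. Iterating any one of them past name.size() is invalid.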

void Debugger::SendWatchpoints(const std::list<WatchpointHit> &points) {
  // Send info about watchpoint hits to the debugger client.
  if (!points.empty()) {
    MS_EXCEPTION_IF_NULL(grpc_client_);
    EventReply reply = grpc_client_->SendWatchpointHits(points);
    if (reply.status() != EventReply::OK) {
      MS_LOG(ERROR) << "Error: SendWatchpointHits failed";
    }
  }
}

bool Debugger::DumpTensorToFile(const std::string &filepath, const std::string &tensor_name, size_t slot) const {
  if (debug_services_ == nullptr) {
    MS_LOG(INFO) << "debug_services_ is nullptr.";
    return false;
  }
  return debug_services_->DumpTensorToFile(filepath, tensor_name, slot);
}

bool Debugger::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev) {
  if (debug_services_ == nullptr) {
    debug_services_ = std::make_unique<DebugServices>();
  }
  return debug_services_->LoadNewTensor(tensor, keep_prev);
}

bool Debugger::debugger_enabled() const { return debugger_enabled_; }

bool Debugger::partial_memory() const { return partial_memory_; }

void Debugger::SetEnableHeartbeat(bool enabled) { enable_heartbeat_ = enabled; }

void Debugger::SetCurNode(const std::string &cur_name) {
  // Access lock for public method.
  std::lock_guard<std::mutex> a_lock(access_lock_);
  cur_name_ = cur_name;
}

std::string Debugger::run_level() const { return run_level_; }

void Debugger::SetTrainingDone(bool training_done) { training_done_ = training_done; }

bool Debugger::CheckPort(const std::string &port) const {
  int num = 0;
  const int min_port_num = 1;
  const int max_port_num = 65535;
  const int decimal = 10;
  // Reject leading zeros (e.g. "0123"); the single character "0" falls through
  // to the range check below.
  if (port[0] == '0' && port[1] != '\0') {
    return false;
  }
  size_t i = 0;
  while (port[i] != '\0') {
    // Reject any non-digit character.
    if (port[i] < '0' || port[i] > '9') {
      return false;
    }
    num = num * decimal + (port[i] - '0');
    if (num > max_port_num) {
      return false;
    }
    i++;
  }
  if (num < min_port_num) {
    return false;
  }
  return true;
}
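
// Illustrative behavior (comment only): CheckPort accepts decimal strings in
// the range [1, 65535] with no leading zeros, e.g.
//   CheckPort("50051")  -> true
//   CheckPort("0")      -> false  (below min_port_num)
//   CheckPort("065535") -> false  (leading zero)
//   CheckPort("70000")  -> false  (above max_port_num)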

bool Debugger::CheckIp(const std::string &host) const {
  std::regex reg_ip(
    "(25[0-4]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[1-9])"
    "[.](25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])"
    "[.](25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])"
    "[.](25[0-4]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[1-9])");
  std::smatch smat;
  std::string host_str = host;
  return std::regex_match(host_str, smat, reg_ip);
}
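
// Illustrative behavior (comment only): the pattern matches dotted-decimal IPv4
// host addresses -- the first and last octets must be 1-254, the middle two 0-255:
//   CheckIp("192.168.0.1")  -> true
//   CheckIp("0.0.0.0")      -> false  (first/last octet cannot be 0)
//   CheckIp("127.0.0.255")  -> false  (last octet cannot be 255)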

uint32_t Debugger::GetFirstRunGraphId() const { return rungraph_id_list_.front(); }

/*
 * Feature group: Dump.
 * Target device group: Ascend, GPU.
 * Runtime category: Old runtime, MindRT.
 * Description: Load a single parameter or value node.
 */
void Debugger::LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output_index, uint32_t root_graph_id) {
  MS_EXCEPTION_IF_NULL(anf_node);
  if (!anf_node->isa<Parameter>() && !anf_node->isa<ValueNode>()) {
    return;
  }
  // When MindRT is used, only ValueNodes and parameter weights can be loaded from device to host.
  if (MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
    if (!anf_node->isa<ValueNode>() &&
        !(anf_node->isa<Parameter>() && common::AnfAlgo::IsParameterWeight(anf_node->cast<ParameterPtr>()))) {
      return;
    }
  }
  // For parameters and value nodes, set the execution order to 0.
  int exec_order = 0;
  std::string node_name = GetKernelNodeName(anf_node);
  GetFileKernelName(NOT_NULL(&node_name));
  // Check whether the output address exists; if not, return.
  if (!AnfAlgo::OutputAddrExist(anf_node, output_index)) {
    return;
  }
  auto addr = AnfAlgo::GetOutputAddr(anf_node, output_index);
  MS_EXCEPTION_IF_NULL(addr);
  auto type = common::AnfAlgo::GetOutputInferDataType(anf_node, output_index);
  if (!IsTypeDebuggerSupported(type)) {
    return;
  }
  auto format = kOpFormat_DEFAULT;
  string tensor_name = node_name + ":0";
  ShapeVector int_shapes = trans::GetRuntimePaddingShape(anf_node, output_index);
  bool keep_prev;
  if (anf_node->isa<Parameter>()) {
    // Parameters keep the previous iteration's copy (moved to "prev") for cross-step comparison.
    keep_prev = true;
    debug_services_->MoveTensorCurrentToPrev(tensor_name);
  } else {
    keep_prev = false;
  }
  bool ret =
    addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, keep_prev, root_graph_id, false, true);
  if (!ret) {
    MS_LOG(ERROR) << "LoadMemToHost failed, tensor_name: " << tensor_name << ", host_format: " << format << ".";
  }
}
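
// Call-shape note (inferred from the three call sites in this file, not from the
// DeviceAddress header): argument six of LoadMemToHost is the output slot,
// argument seven is keep_prev, and the second-to-last flag is set only in the
// "force update for parameters" path below, suggesting a force-update switch.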

void Debugger::LoadSingleParameterMindRT(const AnfNodePtr &node) {
  MS_EXCEPTION_IF_NULL(node);
  auto root_graph_id = cur_root_graph_id_;
  // This function is only for loading parameters under MindRT.
  std::string node_name = GetKernelNodeName(node);
  GetFileKernelName(NOT_NULL(&node_name));
  TypeId type;
  TypeId device_type;
  ShapeVector int_shapes;
  auto device_addr = GetParameterInfo(node, NOT_NULL(&int_shapes), NOT_NULL(&type), NOT_NULL(&device_type));
  if (device_addr == nullptr || device_addr->GetPtr() == nullptr) {
    MS_LOG(DEBUG) << "Skip node: " << node_name << ". Parameter data is not available for MindRT.";
    return;
  }
  if (!IsTypeDebuggerSupported(type)) {
    return;
  }
  auto format = kOpFormat_DEFAULT;
  string tensor_name = node_name + ":0";
  if (debug_services_ != nullptr) {
    debug_services_->MoveTensorCurrentToPrev(tensor_name);
  }
  // keep_prev is true for parameters; the load is also force-updated for parameters.
  bool ret = device_addr->LoadMemToHost(tensor_name, 0, format, int_shapes, type, 0, true, root_graph_id, true, true);
  if (!ret) {
    MS_LOG(ERROR) << "LoadMemToHost failed, tensor_name: " << tensor_name << ", host_format: " << format << ".";
  }
}
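
// Unlike LoadSingleAnfnode above, this MindRT path obtains the device address
// through GetParameterInfo rather than AnfAlgo::GetOutputAddr, and always loads
// with keep_prev and force-update set (see the call above) -- a behavioral note
// inferred from the two functions, not a documented contract.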

/*
 * Feature group: Dump, Online debugger.
 * Target device group: Ascend, GPU.
 * Runtime category: Old runtime, MindRT.
 * Description: Load all the parameters and value nodes for the last loaded graph.
 */
void Debugger::LoadParametersAndConst() {
  if (!(debugger_enabled_ || CheckDebuggerDumpEnabled())) {
    return;
  }
  MS_EXCEPTION_IF_NULL(graph_ptr_);
  // Load parameters.
  MS_LOG(INFO) << "Start to load Parameters for graph " << graph_ptr_->graph_id() << ".";
  auto root_graph_id = graph_ptr_->root_graph_id();
  const auto &parameters = graph_ptr_->inputs();
  for (auto &item : parameters) {
    LoadSingleAnfnode(item, kParameterOutputIndex, root_graph_id);
  }
  // Load value nodes: get all constant values from the graph.
  MS_LOG(INFO) << "Start to load value nodes for graph " << graph_ptr_->graph_id() << ".";
  const auto value_nodes = graph_ptr_->graph_value_nodes();
  for (auto &item : value_nodes) {
    LoadSingleAnfnode(item, kValueNodeOutputIndex, root_graph_id);
  }
}

/*
 * Feature group: Dump, Online debugger.
 * Target device group: Ascend, GPU.
 * Runtime category: Old runtime, MindRT.
 * Description: Load all the parameters and value nodes for the given graph.
 */
void Debugger::LoadParametersAndConst(const KernelGraphPtr &graph) {
  if (!(debugger_enabled_ || CheckDebuggerDumpEnabled())) {
    return;
  }
  MS_EXCEPTION_IF_NULL(graph);
  // Load parameters.
  MS_LOG(INFO) << "Start to load Parameters for graph " << graph->graph_id() << ".";
  auto root_graph_id = graph->root_graph_id();
  const auto &parameters = graph->inputs();
  for (auto &item : parameters) {
    LoadSingleAnfnode(item, kParameterOutputIndex, root_graph_id);
  }
  // Load value nodes: get all constant values from the graph.
  MS_LOG(INFO) << "Start to load value nodes for graph " << graph->graph_id() << ".";
  const auto value_nodes = graph->graph_value_nodes();
  for (auto &item : value_nodes) {
    LoadSingleAnfnode(item, kValueNodeOutputIndex, root_graph_id);
  }
}
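
// Note: this overload repeats the logic of the no-argument version above with an
// explicitly supplied graph; any change to which node kinds are loaded should be
// mirrored in both.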

/*
 * Feature group: Dump.
 * Target device group: GPU.
 * Runtime category: MindRT.
 * Description: Loads parameter data from device to host into tensor_list_map_ for GPU dump.
 * Ascend dump does not use tensor_list_map_, so this is not needed there.
 */
void Debugger::LoadParametersAllGraphs() {
  if (!(device_target_ == kGPUDevice && CheckDebuggerDumpEnabled())) {
    return;
  }
  for (auto &node : parameters_mindRT_) {
    LoadSingleParameterMindRT(node);
  }
}

/*
 * Feature group: Dump.
 * Target device group: GPU.
 * Runtime category: MindRT.
 * Description: Loads constant data from device to host into tensor_list_map_ for GPU dump.
 * Ascend dump does not use tensor_list_map_, so this is not needed there.
 */
void Debugger::LoadConstsForGraph(const KernelGraphPtr &graph) {
  if (!(device_target_ == kGPUDevice && CheckDebuggerDumpEnabled())) {
    return;
  }
  MS_EXCEPTION_IF_NULL(graph);
  // Load value nodes: get all constant values from the graph.
  MS_LOG(INFO) << "Start to load value nodes for graph " << graph->graph_id() << ".";
  auto root_graph_id = graph->root_graph_id();
  const auto value_nodes = graph->graph_value_nodes();
  for (auto &item : value_nodes) {
    LoadSingleAnfnode(item, kValueNodeOutputIndex, root_graph_id);
  }
}

/*
 * Feature group: Online debugger.
 * Target device group: Ascend.
 * Runtime category: Old runtime, MindRT.
 * Description: Load all the kernel outputs for the last loaded graph.
 */
void Debugger::LoadGraphOutputs() {
  if (!(debugger_enabled() && device_target_ == kAscendDevice)) {
    return;
  }
  MS_EXCEPTION_IF_NULL(graph_ptr_);
  const auto &apply_kernels = graph_ptr_->execution_order();
  auto root_graph_id = graph_ptr_->root_graph_id();
  // For kernels, execution order starts from 1.
  int exec_order = 1;
  for (const auto &node : apply_kernels) {
    MS_EXCEPTION_IF_NULL(node);
    std::string kernel_name = GetKernelNodeName(node);
    auto output_size = AnfAlgo::GetOutputTensorNum(node);
    // With partial memory reuse, only nodes that have a watchpoint set are loaded.
    if (partial_memory_ && !debug_services_->IsWatchPoint(kernel_name, node)) {
      continue;
    }
    for (size_t j = 0; j < output_size; ++j) {
      if (!AnfAlgo::OutputAddrExist(node, j)) {
        MS_LOG(INFO) << "Cannot find output addr for slot " << j << " for " << kernel_name;
        continue;
      }
      auto addr = AnfAlgo::GetOutputAddr(node, j);
      MS_EXCEPTION_IF_NULL(addr);
      auto type = common::AnfAlgo::GetOutputInferDataType(node, j);
      if (!IsTypeDebuggerSupported(type)) {
        continue;
      }
      auto format = kOpFormat_DEFAULT;
      string tensor_name = kernel_name + ':' + std::to_string(j);
      ShapeVector int_shapes = trans::GetRuntimePaddingShape(node, j);
      auto ret =
        addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false, root_graph_id, false, true);
      if (!ret) {
        MS_LOG(ERROR) << "LoadMemToHost failed, tensor_name: " << tensor_name << ", host_format: " << format << ".";
      }
    }
    ++exec_order;
  }
}
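
// Ordering note (from the code above and LoadSingleAnfnode): kernel outputs are
// tagged with execution order 1, 2, ..., while parameters and value nodes use 0,
// so constants and weights sort ahead of kernel outputs when tensors are listed
// by execution order.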

/*
 * Feature group: Online debugger.
 * Target device group: GPU.
 * Runtime category: Old runtime.
 * Description: Update the step number if we are processing the first graph (to support multigraph).
 */
void Debugger::UpdateStepNum(const session::KernelGraph *graph) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(debugger_);
  if (device_target_ == kGPUDevice && (debugger_enabled_ || device::KernelRuntime::DumpDataEnabledIteration()) &&
      (graph->graph_id() == debugger_->GetFirstRunGraphId())) {
    // Access lock for public method.
    std::lock_guard<std::mutex> a_lock(access_lock_);
    ++num_step_;
  }
}
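
// In the old runtime the counter advances when the first graph of a multigraph
// run is seen again; under MindRT, UpdateStepNumGPU below is driven by
// DebugActor::DebugOnStepEnd at the end of each step instead.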

/*
 * Feature group: Online debugger.
 * Target device group: GPU.
 * Runtime category: MindRT.
 * Description: Update the step number when DebugActor::DebugOnStepEnd is called at the end of each step.
 */
void Debugger::UpdateStepNumGPU() {
  auto &dump_json_parser = DumpJsonParser::GetInstance();
  if (device_target_ == kGPUDevice && (debugger_enabled_ || dump_json_parser.DumpEnabledForIter())) {
    // Access lock for public method.
    std::lock_guard<std::mutex> a_lock(access_lock_);
    ++num_step_;
    MS_LOG(DEBUG) << "Update step for GPU, current step: " << num_step_;
  }
}

void Debugger::ClearCurrentData() {
  if (device::KernelRuntime::DumpDataEnabledIteration()) {
    if (debug_services_) {
      debug_services_->EmptyCurrentTensor();
    } else {
      MS_LOG(WARNING) << "debug_services_ is nullptr";
    }
  }
}

bool Debugger::TensorExistsInCurrent(const std::string &tensor_name) {
  if (debug_services_ != nullptr) {
    return debug_services_->TensorExistsInCurrent(tensor_name);
  }
  return false;
}
}  // namespace mindspore