/**
 * Copyright 2021-2024 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "debug/debugger/debugger_utils.h"
#include <algorithm>
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "backend/common/session/session_basic.h"
#include "debug/data_dump/tensor_statistic.h"
#include "include/backend/anf_runtime_algorithm.h"
#include "include/backend/debug/data_dump/dump_json_parser.h"
#include "include/backend/debug/data_dump/e2e_dump.h"
#include "include/backend/debug/debugger/debugger.h"
#include "include/common/debug/anf_dump_utils.h"
#include "include/common/utils/anfalgo.h"
#include "include/common/utils/config_manager.h"
#include "kernel/kernel.h"

constexpr int kFailure = 1;

using mindspore::kernel::AddressPtr;
using mindspore::kernel::KernelLaunchAddr;
using AddressPtrList = std::vector<mindspore::kernel::AddressPtr>;
using KernelGraph = mindspore::session::KernelGraph;
using AnfAlgo = mindspore::session::AnfRuntimeAlgorithm;

namespace mindspore {
/*
 * Feature group: Online debugger.
 * Target device group: GPU.
 * Runtime category: MindRT.
 * Description: Returns a vector containing the indexes of the real outputs.
 */
std::vector<size_t> CheckRealOutput(const std::string &node_name, const size_t &output_size) {
  std::vector<size_t> real_outputs;
  // P.BatchNorm is used for training and inference
  // can add the filter list for more operators here....
  if (node_name == "BatchNorm") {
    MS_LOG(INFO) << "loading node named " << node_name;
    (void)real_outputs.insert(real_outputs.cend(), {0, 3, 4});
  } else {
    // by default, TensorLoader will load all outputs
    for (size_t j = 0; j < output_size; ++j) {
      real_outputs.push_back(j);
    }
  }
  return real_outputs;
}

/*
 * Feature group: Dump, Online debugger.
 * Target device group: GPU, Ascend.
 * Runtime category: MindRT.
 * Description: Gets the valid tensor indexes for dump.
 */
vector<size_t> GetValidDumpIndex(const CNodePtr &cnode, size_t index_size, bool is_input) {
  std::vector<size_t> valid_indexes;
  valid_indexes.reserve(index_size);
  if (is_input) {
    std::vector<size_t> ignored_address;
    auto kernel_mod = AnfAlgo::GetKernelMod(cnode);
    if (kernel_mod != nullptr) {
      ignored_address = kernel_mod->GetLaunchIgnoredInputAddressIdx();
    }
    std::set<size_t> ignored_address_set(ignored_address.begin(), ignored_address.end());
    for (size_t index = 0; index < index_size; ++index) {
      if (ignored_address_set.find(index) != ignored_address_set.end()) {
        continue;
      }
      valid_indexes.push_back(index);
    }
  } else {
    auto node_name = common::AnfAlgo::GetCNodeName(cnode);
    valid_indexes = CheckRealOutput(node_name, index_size);
  }
  return valid_indexes;
}

/*
 * Feature group: Dump, Online debugger.
 * Target device group: GPU, Ascend.
 * Runtime category: MindRT.
 * Description: Get kernel inputs from device_tensors and load the inputs from device to host.
 */
void LoadInputs(const CNodePtr &cnode, std::vector<device::DeviceAddress *> device_tensors, uint32_t exec_order,
                uint32_t root_graph_id, const DeviceContext *device_context, const bool trans_flag,
                const uint32_t sample_mode, const uint32_t sample_num, const bool async_copy) {
  MS_EXCEPTION_IF_NULL(cnode);
  MS_EXCEPTION_IF_NULL(device_context);
  auto kernel_mod = AnfAlgo::GetKernelMod(cnode);
  std::vector<size_t> ignored_address;
  if (kernel_mod != nullptr) {
    ignored_address = kernel_mod->GetLaunchIgnoredInputAddressIdx();
  }

  auto input_size = device_tensors.size();
  for (size_t j = 0; j < input_size; ++j) {
    // Ignore the input address that is not used in the kernel launch.
    if (std::find(ignored_address.begin(), ignored_address.end(), j) != ignored_address.end()) {
      MS_LOG(INFO) << "Ignore dump input data for kernel:" << cnode->fullname_with_scope() << " with input index:" << j;
      continue;
    }
    auto input_kernel = cnode->input(j + 1);
    std::string input_kernel_name = GetKernelNodeName(input_kernel);
    auto device_type = AnfAlgo::GetOutputDeviceDataType(input_kernel, kParameterOutputIndex);
    auto host_type = common::AnfAlgo::GetOutputInferDataType(input_kernel, kParameterOutputIndex);
    auto type = trans_flag ? host_type : device_type;
    // For example, this happens with the Depend op
    if (type == kMetaTypeNone) {
      continue;
    }
    auto host_format = kOpFormat_DEFAULT;
    auto device_format =
      E2eDump::IsDeviceTargetGPU() ? kOpFormat_DEFAULT : AnfAlgo::GetOutputFormat(input_kernel, kParameterOutputIndex);

    string input_tensor_name = input_kernel_name + ':' + "0";
    auto device_addr = device_tensors[j];

    auto dump_shape = device_addr->kernel_tensor()->GetShapeVector();  // host shape
    if (!trans_flag) {
      dump_shape = AnfAlgo::GetOutputDeviceShape(input_kernel, kParameterOutputIndex, dump_shape);  // device shape
    }
    if (sample_mode == DumpJsonParser::DUMP_HEAD_AND_TAIL && SizeOf(dump_shape) > sample_num) {
      dump_shape = {sample_num};
    }
    auto ret = device_addr->LoadMemToHost(input_tensor_name, UintToInt(exec_order), host_format, dump_shape, type, 0,
                                          true, root_graph_id, false, trans_flag, async_copy);
    if (!ret) {
      MS_LOG(WARNING) << "LoadMemToHost failed: tensor_name:" << input_tensor_name << ", host_format:" << host_format
                      << ", device_format:" << device_format << ".";
    }
  }
}

/*
 * Feature group: Dump, Online debugger.
 * Target device group: GPU, Ascend.
 * Runtime category: MindRT.
 * Description: Get kernel outputs from device_tensors and load the outputs from device to host.
 */
void LoadOutputs(const CNodePtr &cnode, std::vector<device::DeviceAddress *> device_tensors, uint32_t exec_order,
                 uint32_t root_graph_id, const DeviceContext *device_context, const bool trans_flag,
                 const uint32_t sample_mode, const uint32_t sample_num) {
  auto output_size = AnfAlgo::GetOutputTensorNum(cnode);
  auto node_name = common::AnfAlgo::GetCNodeName(cnode);
  std::string kernel_name = GetKernelNodeName(cnode);
  std::vector<size_t> real_outputs = CheckRealOutput(node_name, output_size);
  for (size_t j : real_outputs) {
    auto device_type = AnfAlgo::GetOutputDeviceDataType(cnode, j);
    auto host_type = common::AnfAlgo::GetOutputInferDataType(cnode, j);
    auto type = trans_flag ? host_type : device_type;
    // For example, this happens with the Depend op
    if (type == kMetaTypeNone) {
      continue;
    }

    auto host_format = kOpFormat_DEFAULT;
    auto device_format = E2eDump::IsDeviceTargetGPU() ? kOpFormat_DEFAULT : AnfAlgo::GetOutputFormat(cnode, j);

    string tensor_name = kernel_name + ':' + std::to_string(j);
    auto device_addr = device_tensors[j];
    auto dump_shape = device_addr->kernel_tensor()->GetShapeVector();
    if (!trans_flag) {
      dump_shape = AnfAlgo::GetOutputDeviceShape(cnode, j, dump_shape);
    }
    if (sample_mode == DumpJsonParser::DUMP_HEAD_AND_TAIL && SizeOf(dump_shape) > sample_num) {
      dump_shape = {sample_num};
    }
    auto ret = device_addr->LoadMemToHost(tensor_name, UintToInt(exec_order), host_format, dump_shape, type, j, false,
                                          root_graph_id, false, trans_flag);
    if (!ret) {
      MS_LOG(WARNING) << "LoadMemToHost failed: tensor_name:" << tensor_name << ", host_format:" << host_format
                      << ", device_format:" << device_format << ".";
    }
  }
}

/*
 * Feature group: Dump, Online debugger.
 * Target device group: Ascend, GPU.
 * Runtime category: MindRT.
 * Description: Returns true if the node needs to be read for Dump or online debugger. This function is used by GPU
 * and Ascend kernel-by-kernel MindRT.
 */
bool CheckReadData(const CNodePtr &cnode) {
  auto debugger = Debugger::GetInstance();
  if (!debugger) {
    return false;
  }
  bool read_data = false;
  auto &dump_json_parser = DumpJsonParser::GetInstance();
  bool dump_enabled = dump_json_parser.DumpEnabledForIter();
  MS_LOG(DEBUG) << "dump_enabled: " << dump_enabled;
  std::string kernel_name = GetKernelNodeName(cnode);
  if (dump_enabled) {
    if (dump_json_parser.NeedDump(kernel_name)) {
      read_data = true;
    }
  }
  if (debugger->debugger_enabled()) {
    read_data = debugger->ReadNodeDataRequired(cnode);
  }
  return read_data;
}

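// Returns true when the configured device target is GPU.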
bool IsDeviceTargetGPU() {
  auto context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context);
  return context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice;
}

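// The trans flag is forced to true when the online debugger is enabled or the target is GPU; otherwise it is taken
// from the dump JSON configuration.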
bool GetTransFlag() {
  auto debugger = Debugger::GetInstance();
  MS_EXCEPTION_IF_NULL(debugger);
  if (debugger->debugger_enabled() || IsDeviceTargetGPU()) {
    return true;
  }
  return DumpJsonParser::GetInstance().trans_flag();
}

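// The sample mode is forced to 0 when the online debugger is enabled or the target is GPU; otherwise it is taken
// from the dump JSON configuration.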
uint32_t GetSampleMode() {
  auto debugger = Debugger::GetInstance();
  MS_EXCEPTION_IF_NULL(debugger);
  if (debugger->debugger_enabled() || IsDeviceTargetGPU()) {
    return 0;
  }
  return DumpJsonParser::GetInstance().sample_mode();
}

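// Same policy as GetSampleMode, but for the number of sampled elements per tensor.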
uint32_t GetSampleNum() {
  auto debugger = Debugger::GetInstance();
  MS_EXCEPTION_IF_NULL(debugger);
  if (debugger->debugger_enabled() || IsDeviceTargetGPU()) {
    return 0;
  }
  return DumpJsonParser::GetInstance().sample_num();
}

/*
 * Feature group: Dump, Online debugger.
 * Target device group: Ascend, GPU.
 * Runtime category: MindRT.
 * Description: Loads the inputs and outputs of the given node if needed, dumps them if dump is enabled, and then
 * calls PostExecuteNode on the given node for GPU.
 */
void ReadDataAndDump(const CNodePtr &cnode, std::vector<device::DeviceAddress *> input_device_tensors,
                     std::vector<device::DeviceAddress *> output_device_tensors, uint32_t exec_order,
                     const DeviceContext *device_context, const bool abnormal_dump) {
  auto debugger = Debugger::GetInstance();
  if (!debugger) {
    return;
  }
  auto &dump_json_parser = DumpJsonParser::GetInstance();
  bool dump_enabled = dump_json_parser.DumpEnabledForIter();
  MS_LOG(DEBUG) << "dump_enabled: " << dump_enabled;
  auto kernel_graph = std::dynamic_pointer_cast<KernelGraph>(cnode->func_graph());
  MS_EXCEPTION_IF_NULL(kernel_graph);
  auto root_graph_id = kernel_graph->root_graph_id();
  bool trans_flag = GetTransFlag();
  uint32_t sample_mode = GetSampleMode();
  uint32_t sample_num = GetSampleNum();
  if (debugger->debugger_enabled() || dump_json_parser.InputNeedDump()) {
    if (DumpJsonParser::GetInstance().IsDeviceCalcStats() && dump_enabled) {
      datadump::DumpKernelTensorStats(device_context, input_device_tensors, true, cnode, root_graph_id);
    } else {
      bool async_copy = !abnormal_dump;
      LoadInputs(cnode, input_device_tensors, exec_order, root_graph_id, device_context, trans_flag, sample_mode,
                 sample_num, async_copy);
    }
  }
  if (debugger->debugger_enabled() || dump_json_parser.OutputNeedDump()) {
    if (DumpJsonParser::GetInstance().IsDeviceCalcStats() && dump_enabled) {
      datadump::DumpKernelTensorStats(device_context, output_device_tensors, false, cnode, root_graph_id);
    } else if (!abnormal_dump) {
      LoadOutputs(cnode, output_device_tensors, exec_order, root_graph_id, device_context, trans_flag, sample_mode,
                  sample_num);
    }
  }
  // Dump kernel
  if (dump_enabled && !DumpJsonParser::GetInstance().IsDeviceCalcStats()) {
    MS_EXCEPTION_IF_NULL(kernel_graph);
    auto graph_id = kernel_graph->graph_id();
    // for GPU, nodes are dumped in the graph_id directory.
    if (IsDeviceTargetGPU()) {
      debugger->DumpSingleNode(cnode, graph_id);
    } else {
      // for Ascend, nodes are dumped in the root_graph_id directory.
      debugger->DumpSingleNode(cnode, root_graph_id);
    }
    // Clear dumped data when the online debugger is not enabled
    if (!debugger->debugger_enabled()) {
      debugger->ClearCurrentData();
    }
  }
  if (IsDeviceTargetGPU()) {
    // check if the node is the last kernel
    bool last_kernel = !common::AnfAlgo::IsInplaceNode(cnode, "skip");
    debugger->PostExecuteNode(cnode, last_kernel);
  }
}

/*
 * Feature group: Dump, Online Debugger.
 * Target device group: Ascend, GPU.
 * Runtime category: MindRT.
 * Description: Returns the error_info when sink_mode is true and we are in online debugger mode or dump mode for
 * GPU; if everything is normal, the error_info string is empty.
 */
std::string CheckDatasetSinkMode(const KernelGraphPtr &graph_ptr) {
  std::string error_info = "";
  bool sink_mode =
    ConfigManager::GetInstance().dataset_mode() == DatasetMode::DS_SINK_MODE || graph_ptr->IsDatasetGraph();
  auto debugger = Debugger::GetInstance();
  MS_EXCEPTION_IF_NULL(debugger);
  if (debugger->CheckDebuggerDumpEnabled() && sink_mode && IsDeviceTargetGPU()) {
    error_info = "e2e_dump is not supported on GPU with dataset_sink_mode=True. Please set dataset_sink_mode=False";
  }
  if (debugger->CheckDebuggerEnabled() && sink_mode) {
    error_info = "Debugger is not supported with dataset_sink_mode=True. Please set dataset_sink_mode=False";
  }
  return error_info;
}

/*
 * Feature group: Online Debugger.
 * Target device group: Ascend.
 * Runtime category: MindRT.
 * Description: Loads graph's outputs and parameters for Ascend super kernel mode.
 */
void LoadDataForDebugger(const KernelGraphPtr &graph_ptr) {
  auto context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context);
  if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) != kAscendDevice) {
    return;
  }
#ifdef ENABLE_DEBUGGER
  auto debugger = Debugger::GetInstance();
  MS_EXCEPTION_IF_NULL(debugger);
  if (!debugger->CheckDebuggerEnabled()) {
    return;
  }
  MS_LOG(INFO) << "Start load step";
  debugger->SetGraphPtr(graph_ptr);
  // load output
  debugger->LoadGraphOutputs();
  // load parameters
  debugger->LoadParametersAndConst();

#endif
}

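// Runs the end-to-end dump for the given kernel graph and rank.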
void Dump(const KernelGraphPtr &graph, uint32_t rank_id) {
  MS_LOG(DEBUG) << "Start!";
  MS_EXCEPTION_IF_NULL(graph);
  E2eDump::DumpData(graph.get(), rank_id);
  MS_LOG(DEBUG) << "Finish!";
}

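// Returns the rank id: 0 unless HCCL is enabled and the RANK_ID environment variable is set, in which case the
// actual rank id is queried.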
uint32_t GetRankID() {
  uint32_t rank_id = 0;
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  auto env_rank_id = common::GetEnv("RANK_ID");
  if (ms_context->get_param<bool>(MS_CTX_ENABLE_HCCL) && !env_rank_id.empty()) {
    // get the actual rank id in the distributed training case.
    rank_id = GetRankId();
  }
  return rank_id;
}

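// End-to-end dump entry for super kernel mode; a no-op when ENABLE_SECURITY is defined.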
void SuperKernelE2eDump(const KernelGraphPtr &graph) {
#ifndef ENABLE_SECURITY
  Dump(graph, GetRankID());
#endif
}

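// Maps the command case of a debugger::EventReply to the corresponding DebuggerCommand value.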
DebuggerCommand GetCommand(const debugger::EventReply &reply) {
  DebuggerCommand cmd = DebuggerCommand::kUnknownCMD;
  switch (reply.cmd_case()) {
    case debugger::EventReply::CmdCase::kExit:
      cmd = DebuggerCommand::kExitCMD;
      break;
    case debugger::EventReply::CmdCase::kRunCmd:
      cmd = DebuggerCommand::kRunCMD;
      break;
    case debugger::EventReply::CmdCase::kSetCmd:
      cmd = DebuggerCommand::kSetCMD;
      break;
    case debugger::EventReply::CmdCase::kViewCmd:
      cmd = DebuggerCommand::kViewCMD;
      break;
    case debugger::EventReply::CmdCase::kVersionMatched:
      cmd = DebuggerCommand::kVersionMatchedCMD;
      break;
    default:
      MS_LOG(DEBUG) << "Debug: UnknownCMD";
      break;
  }
  return cmd;
}

ProtoVector<debugger::WatchCondition_Parameter> GetParameters(const debugger::EventReply &reply) {
  if (!reply.has_set_cmd() || !reply.set_cmd().has_watch_condition()) {
    MS_LOG(ERROR) << "Error: Can not get Parameters from command. Returning default value: ProtoVector<Parameter>().";
    return ProtoVector<debugger::WatchCondition_Parameter>();
  }
  return reply.set_cmd().watch_condition().params();
}

ProtoVector<debugger::WatchNode> GetWatchnodes(const debugger::EventReply &reply) {
  if (!reply.has_set_cmd()) {
    MS_LOG(ERROR) << "Error: Not SetCMD, can not get WatchNodes. Returning default value: ProtoVector<WatchNode>().";
    return ProtoVector<debugger::WatchNode>();
  }
  return reply.set_cmd().watch_nodes();
}

std::string GetNodeName(const debugger::EventReply &reply) {
  if (!reply.has_run_cmd()) {
    MS_LOG(ERROR) << "Error: Not RunCMD, can not get NodeName. Returning default value: "
                     "";
    return "";
  }
  return reply.run_cmd().node_name();
}

std::string GetRunLevel(const debugger::EventReply &reply) {
  if (!reply.has_run_cmd()) {
    MS_LOG(ERROR) << "Error: Not RunCMD, can not get RunLevel. Returning default value: "
                     "";
    return "";
  }
  return reply.run_cmd().run_level();
}

debugger::WatchCondition GetWatchcondition(const debugger::EventReply &reply) {
  if (!reply.has_set_cmd() || !reply.set_cmd().has_watch_condition()) {
    MS_LOG(ERROR) << "Error: Can not get WatchCondition from command. Returning default value: WatchCondition().";
    return debugger::WatchCondition();
  }
  return reply.set_cmd().watch_condition();
}

int32_t GetWatchpointID(const debugger::EventReply &reply) {
  if (!reply.has_set_cmd()) {
    MS_LOG(ERROR) << "Error: Not SetCMD, can not get Watchpoint ID. Returning default value: 0.";
    return 0;
  }
  return reply.set_cmd().id();
}

bool GetWatchpointDelete(const debugger::EventReply &reply) {
  if (!reply.has_set_cmd()) {
    MS_LOG(ERROR) << "Error: Not SetCMD, can not get Watchpoint delete flag. Returning default value: false.";
    return false;
  }
  return reply.set_cmd().delete_();
}

ProtoVector<debugger::TensorProto> GetTensors(const debugger::EventReply &reply) {
  if (!reply.has_view_cmd()) {
    MS_LOG(ERROR) << "Error: Not ViewCMD, can not get Tensors. Returning default value: ProtoVector<TensorProto>().";
    return ProtoVector<debugger::TensorProto>();
  }
  return reply.view_cmd().tensors();
}

bool GetMiVersionMatched(const debugger::EventReply &reply) { return reply.version_matched(); }

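// Builds the tensor's full name as "<node_name>:<slot>" (plus ":<iter>" when iter is non-empty); when truncate is
// set, the scope prefix of the node name is dropped. Illustrative example: a proto with node_name
// "Default/network/Conv2D-op1", slot "0", empty iter, and truncate=true yields "Conv2D-op1:0".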
std::string GetTensorFullName(const debugger::TensorProto &tensor) {
  string node_name = tensor.node_name();
  if (tensor.truncate()) {
    // scopes in node name are separated by '/'
    // use the name without scope if truncate is true
    std::size_t found = node_name.find_last_of("/");
    node_name = node_name.substr(found + 1);
  }
  return node_name + ":" + tensor.slot() + (tensor.iter() == "" ? "" : ":" + tensor.iter());
}
}  // namespace mindspore