/**
 * Copyright 2021-2024 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "debug/debugger/debugger_utils.h"
#include <algorithm>
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "backend/common/session/session_basic.h"
#include "debug/data_dump/tensor_statistic.h"
#include "include/backend/anf_runtime_algorithm.h"
#include "include/backend/debug/data_dump/dump_json_parser.h"
#include "include/backend/debug/data_dump/e2e_dump.h"
#include "include/backend/debug/debugger/debugger.h"
#include "include/common/debug/anf_dump_utils.h"
#include "include/common/utils/anfalgo.h"
#include "include/common/utils/config_manager.h"
#include "kernel/kernel.h"

constexpr int kFailure = 1;

using mindspore::kernel::AddressPtr;
using mindspore::kernel::KernelLaunchAddr;
using AddressPtrList = std::vector<mindspore::kernel::AddressPtr>;
using KernelGraph = mindspore::session::KernelGraph;
using AnfAlgo = mindspore::session::AnfRuntimeAlgorithm;
namespace mindspore {
/*
 * Feature group: Online debugger.
 * Target device group: GPU.
 * Runtime category: MindRT.
 * Description: Returns a vector containing the indexes of the real outputs of the node.
 */
std::vector<size_t> CheckRealOutput(const std::string &node_name, const size_t &output_size) {
  std::vector<size_t> real_outputs;
  // P.BatchNorm is used for both training and inference;
  // more operators can be added to this filter list here.
  if (node_name == "BatchNorm") {
    MS_LOG(INFO) << "Loading node named " << node_name;
    (void)real_outputs.insert(real_outputs.cend(), {0, 3, 4});
  } else {
    // By default, TensorLoader will load all outputs.
    for (size_t j = 0; j < output_size; ++j) {
      real_outputs.push_back(j);
    }
  }
  return real_outputs;
}

/*
 * Feature group: Dump, Online debugger.
 * Target device group: GPU, Ascend.
 * Runtime category: MindRT.
 * Description: Gets the valid tensor indexes for dump, skipping input addresses that the kernel launch ignores.
 */
std::vector<size_t> GetValidDumpIndex(const CNodePtr &cnode, size_t index_size, bool is_input) {
  std::vector<size_t> valid_indexes;
  valid_indexes.reserve(index_size);
  if (is_input) {
    std::vector<size_t> ignored_address;
    auto kernel_mod = AnfAlgo::GetKernelMod(cnode);
    if (kernel_mod != nullptr) {
      ignored_address = kernel_mod->GetLaunchIgnoredInputAddressIdx();
    }
    std::set<size_t> ignored_address_set(ignored_address.begin(), ignored_address.end());
    for (size_t index = 0; index < index_size; ++index) {
      if (ignored_address_set.find(index) != ignored_address_set.end()) {
        continue;
      }
      valid_indexes.push_back(index);
    }
  } else {
    auto node_name = common::AnfAlgo::GetCNodeName(cnode);
    valid_indexes = CheckRealOutput(node_name, index_size);
  }
  return valid_indexes;
}

/*
 * Feature group: Dump, Online debugger.
 * Target device group: GPU, Ascend.
 * Runtime category: MindRT.
 * Description: Gets kernel inputs from device_tensors and loads them from device to host.
 */
void LoadInputs(const CNodePtr &cnode, std::vector<device::DeviceAddress *> device_tensors, uint32_t exec_order,
                uint32_t root_graph_id, const DeviceContext *device_context, const bool trans_flag,
                const uint32_t sample_mode, const uint32_t sample_num, const bool async_copy) {
  MS_EXCEPTION_IF_NULL(cnode);
  MS_EXCEPTION_IF_NULL(device_context);
  auto kernel_mod = AnfAlgo::GetKernelMod(cnode);
  std::vector<size_t> ignored_address;
  if (kernel_mod != nullptr) {
    ignored_address = kernel_mod->GetLaunchIgnoredInputAddressIdx();
  }

  auto input_size = device_tensors.size();
  for (size_t j = 0; j < input_size; ++j) {
    // Ignore input addresses that are not used in the kernel launch.
    if (std::find(ignored_address.begin(), ignored_address.end(), j) != ignored_address.end()) {
      MS_LOG(INFO) << "Ignore dump input data for kernel:" << cnode->fullname_with_scope() << " with input index:" << j;
      continue;
    }
    auto input_kernel = cnode->input(j + 1);
    std::string input_kernel_name = GetKernelNodeName(input_kernel);
    auto device_type = AnfAlgo::GetOutputDeviceDataType(input_kernel, kParameterOutputIndex);
    auto host_type = common::AnfAlgo::GetOutputInferDataType(input_kernel, kParameterOutputIndex);
    auto type = trans_flag ? host_type : device_type;
    // A node may have no output type; this happens, for example, with the Depend op.
    if (type == kMetaTypeNone) {
      continue;
    }
    auto host_format = kOpFormat_DEFAULT;
    auto device_format =
      E2eDump::IsDeviceTargetGPU() ? kOpFormat_DEFAULT : AnfAlgo::GetOutputFormat(input_kernel, kParameterOutputIndex);

    std::string input_tensor_name = input_kernel_name + ':' + "0";
    auto device_addr = device_tensors[j];

    auto dump_shape = device_addr->kernel_tensor()->GetShapeVector();  // host shape
    if (!trans_flag) {
      dump_shape = AnfAlgo::GetOutputDeviceShape(input_kernel, kParameterOutputIndex, dump_shape);  // device shape
    }
    if (sample_mode == DumpJsonParser::DUMP_HEAD_AND_TAIL && SizeOf(dump_shape) > sample_num) {
      dump_shape = {sample_num};
    }
    auto ret = device_addr->LoadMemToHost(input_tensor_name, UintToInt(exec_order), host_format, dump_shape, type, 0,
                                          true, root_graph_id, false, trans_flag, async_copy);
    if (!ret) {
      MS_LOG(WARNING) << "LoadMemToHost failed: tensor_name:" << input_tensor_name << ", host_format:" << host_format
                      << ", device_format:" << device_format << ".";
    }
  }
}

/*
 * Feature group: Dump, Online debugger.
 * Target device group: GPU, Ascend.
 * Runtime category: MindRT.
 * Description: Gets kernel outputs from device_tensors and loads them from device to host.
 */
void LoadOutputs(const CNodePtr &cnode, std::vector<device::DeviceAddress *> device_tensors, uint32_t exec_order,
                 uint32_t root_graph_id, const DeviceContext *device_context, const bool trans_flag,
                 const uint32_t sample_mode, const uint32_t sample_num) {
  auto output_size = AnfAlgo::GetOutputTensorNum(cnode);
  auto node_name = common::AnfAlgo::GetCNodeName(cnode);
  std::string kernel_name = GetKernelNodeName(cnode);
  std::vector<size_t> real_outputs = CheckRealOutput(node_name, output_size);
  for (size_t j : real_outputs) {
    auto device_type = AnfAlgo::GetOutputDeviceDataType(cnode, j);
    auto host_type = common::AnfAlgo::GetOutputInferDataType(cnode, j);
    auto type = trans_flag ? host_type : device_type;
    // A node may have no output type; this happens, for example, with the Depend op.
    if (type == kMetaTypeNone) {
      continue;
    }

    auto host_format = kOpFormat_DEFAULT;
    auto device_format = E2eDump::IsDeviceTargetGPU() ? kOpFormat_DEFAULT : AnfAlgo::GetOutputFormat(cnode, j);

    std::string tensor_name = kernel_name + ':' + std::to_string(j);
    auto device_addr = device_tensors[j];
    auto dump_shape = device_addr->kernel_tensor()->GetShapeVector();  // host shape
    if (!trans_flag) {
      dump_shape = AnfAlgo::GetOutputDeviceShape(cnode, j, dump_shape);  // device shape
    }
    if (sample_mode == DumpJsonParser::DUMP_HEAD_AND_TAIL && SizeOf(dump_shape) > sample_num) {
      dump_shape = {sample_num};
    }
    auto ret = device_addr->LoadMemToHost(tensor_name, UintToInt(exec_order), host_format, dump_shape, type, j, false,
                                          root_graph_id, false, trans_flag);
    if (!ret) {
      MS_LOG(WARNING) << "LoadMemToHost failed: tensor_name:" << tensor_name << ", host_format:" << host_format
                      << ", device_format:" << device_format << ".";
    }
  }
}

/*
 * Feature group: Dump, Online debugger.
 * Target device group: Ascend, GPU.
 * Runtime category: MindRT.
 * Description: Returns true if the node needs to be read for dump or for the online debugger. This function is used
 * by GPU and by Ascend kernel-by-kernel MindRT.
 */
bool CheckReadData(const CNodePtr &cnode) {
  auto debugger = Debugger::GetInstance();
  if (!debugger) {
    return false;
  }
  bool read_data = false;
  auto &dump_json_parser = DumpJsonParser::GetInstance();
  bool dump_enabled = dump_json_parser.DumpEnabledForIter();
  MS_LOG(DEBUG) << "dump_enabled: " << dump_enabled;
  std::string kernel_name = GetKernelNodeName(cnode);
  if (dump_enabled && dump_json_parser.NeedDump(kernel_name)) {
    read_data = true;
  }
  if (debugger->debugger_enabled()) {
    read_data = debugger->ReadNodeDataRequired(cnode);
  }
  return read_data;
}
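
// Returns true if the current device target in the MindSpore context is GPU.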
bool IsDeviceTargetGPU() {
  auto context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context);
  return context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice;
}
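
// Tensors are loaded in host format whenever the online debugger is enabled or the target is GPU;
// otherwise the trans_flag from the dump json configuration decides.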
bool GetTransFlag() {
  auto debugger = Debugger::GetInstance();
  MS_EXCEPTION_IF_NULL(debugger);
  if (debugger->debugger_enabled() || IsDeviceTargetGPU()) {
    return true;
  }
  return DumpJsonParser::GetInstance().trans_flag();
}
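
// Tensor sampling is disabled (mode 0) when the online debugger is enabled or the target is GPU;
// otherwise the sample_mode from the dump json configuration is used.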
uint32_t GetSampleMode() {
  auto debugger = Debugger::GetInstance();
  MS_EXCEPTION_IF_NULL(debugger);
  if (debugger->debugger_enabled() || IsDeviceTargetGPU()) {
    return 0;
  }
  return DumpJsonParser::GetInstance().sample_mode();
}
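
// Returns the number of tensor elements to sample, or 0 when sampling is disabled
// (online debugger enabled or GPU target).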
uint32_t GetSampleNum() {
  auto debugger = Debugger::GetInstance();
  MS_EXCEPTION_IF_NULL(debugger);
  if (debugger->debugger_enabled() || IsDeviceTargetGPU()) {
    return 0;
  }
  return DumpJsonParser::GetInstance().sample_num();
}

/*
 * Feature group: Dump, Online debugger.
 * Target device group: Ascend, GPU.
 * Runtime category: MindRT.
 * Description: Loads the inputs and outputs of the given node if needed and dumps them if dump is enabled, then
 * performs the PostExecuteNode function on the given node for GPU.
 */
void ReadDataAndDump(const CNodePtr &cnode, std::vector<device::DeviceAddress *> input_device_tensors,
                     std::vector<device::DeviceAddress *> output_device_tensors, uint32_t exec_order,
                     const DeviceContext *device_context, const bool abnormal_dump) {
  auto debugger = Debugger::GetInstance();
  if (!debugger) {
    return;
  }
  auto &dump_json_parser = DumpJsonParser::GetInstance();
  bool dump_enabled = dump_json_parser.DumpEnabledForIter();
  MS_LOG(DEBUG) << "dump_enabled: " << dump_enabled;
  auto kernel_graph = std::dynamic_pointer_cast<KernelGraph>(cnode->func_graph());
  MS_EXCEPTION_IF_NULL(kernel_graph);
  auto root_graph_id = kernel_graph->root_graph_id();
  bool trans_flag = GetTransFlag();
  uint32_t sample_mode = GetSampleMode();
  uint32_t sample_num = GetSampleNum();
  if (debugger->debugger_enabled() || dump_json_parser.InputNeedDump()) {
    if (dump_json_parser.IsDeviceCalcStats() && dump_enabled) {
      datadump::DumpKernelTensorStats(device_context, input_device_tensors, true, cnode, root_graph_id);
    } else {
      bool async_copy = !abnormal_dump;
      LoadInputs(cnode, input_device_tensors, exec_order, root_graph_id, device_context, trans_flag, sample_mode,
                 sample_num, async_copy);
    }
  }
  if (debugger->debugger_enabled() || dump_json_parser.OutputNeedDump()) {
    if (dump_json_parser.IsDeviceCalcStats() && dump_enabled) {
      datadump::DumpKernelTensorStats(device_context, output_device_tensors, false, cnode, root_graph_id);
    } else if (!abnormal_dump) {
      LoadOutputs(cnode, output_device_tensors, exec_order, root_graph_id, device_context, trans_flag, sample_mode,
                  sample_num);
    }
  }
  // Dump the kernel data.
  if (dump_enabled && !dump_json_parser.IsDeviceCalcStats()) {
    auto graph_id = kernel_graph->graph_id();
    if (IsDeviceTargetGPU()) {
      // For GPU, nodes are dumped in the graph_id directory.
      debugger->DumpSingleNode(cnode, graph_id);
    } else {
      // For Ascend, nodes are dumped in the root_graph_id directory.
      debugger->DumpSingleNode(cnode, root_graph_id);
    }
    // Clear the dumped data when the online debugger is not enabled.
    if (!debugger->debugger_enabled()) {
      debugger->ClearCurrentData();
    }
  }
  if (IsDeviceTargetGPU()) {
    // Check if the node is the last kernel.
    bool last_kernel = !common::AnfAlgo::IsInplaceNode(cnode, "skip");
    debugger->PostExecuteNode(cnode, last_kernel);
  }
}

/*
 * Feature group: Dump, Online debugger.
 * Target device group: Ascend, GPU.
 * Runtime category: MindRT.
 * Description: Returns the error_info when sink_mode is true and the online debugger or GPU dump mode is active.
 * If everything is normal, the error_info string is empty.
 */
std::string CheckDatasetSinkMode(const KernelGraphPtr &graph_ptr) {
  std::string error_info = "";
  bool sink_mode =
    ConfigManager::GetInstance().dataset_mode() == DatasetMode::DS_SINK_MODE || graph_ptr->IsDatasetGraph();
  auto debugger = Debugger::GetInstance();
  MS_EXCEPTION_IF_NULL(debugger);
  if (debugger->CheckDebuggerDumpEnabled() && sink_mode && IsDeviceTargetGPU()) {
    error_info = "e2e_dump is not supported on GPU with dataset_sink_mode=True. Please set dataset_sink_mode=False";
  }
  if (debugger->CheckDebuggerEnabled() && sink_mode) {
    error_info = "Debugger is not supported with dataset_sink_mode=True. Please set dataset_sink_mode=False";
  }
  return error_info;
}

/*
 * Feature group: Online debugger.
 * Target device group: Ascend.
 * Runtime category: MindRT.
 * Description: Loads the graph's outputs and parameters for Ascend super kernel mode.
 */
void LoadDataForDebugger(const KernelGraphPtr &graph_ptr) {
  auto context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context);
  if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) != kAscendDevice) {
    return;
  }
#ifdef ENABLE_DEBUGGER
  auto debugger = Debugger::GetInstance();
  MS_EXCEPTION_IF_NULL(debugger);
  if (!debugger->CheckDebuggerEnabled()) {
    return;
  }
  MS_LOG(INFO) << "Start load step";
  debugger->SetGraphPtr(graph_ptr);
  // Load the graph outputs.
  debugger->LoadGraphOutputs();
  // Load the parameters and constants.
  debugger->LoadParametersAndConst();
#endif
}
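
// Runs E2E dump on the whole kernel graph for the given rank.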
void Dump(const KernelGraphPtr &graph, uint32_t rank_id) {
  MS_LOG(DEBUG) << "Start!";
  MS_EXCEPTION_IF_NULL(graph);
  E2eDump::DumpData(graph.get(), rank_id);
  MS_LOG(DEBUG) << "Finish!";
}
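
// Returns the actual rank id in the distributed training case (HCCL enabled and RANK_ID set); otherwise returns 0.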
uint32_t GetRankID() {
  uint32_t rank_id = 0;
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  auto env_rank_id = common::GetEnv("RANK_ID");
  if (ms_context->get_param<bool>(MS_CTX_ENABLE_HCCL) && !env_rank_id.empty()) {
    // Get the actual rank id in the distributed training case.
    rank_id = GetRankId();
  }
  return rank_id;
}
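
// Entry point for E2E dump in super kernel mode; compiled to a no-op when security is enabled.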
void SuperKernelE2eDump(const KernelGraphPtr &graph) {
#ifndef ENABLE_SECURITY
  Dump(graph, GetRankID());
#endif
}
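
// Maps the cmd_case of an EventReply from the debugger client to the corresponding DebuggerCommand.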
DebuggerCommand GetCommand(const debugger::EventReply &reply) {
  DebuggerCommand cmd = DebuggerCommand::kUnknownCMD;
  switch (reply.cmd_case()) {
    case debugger::EventReply::CmdCase::kExit:
      cmd = DebuggerCommand::kExitCMD;
      break;
    case debugger::EventReply::CmdCase::kRunCmd:
      cmd = DebuggerCommand::kRunCMD;
      break;
    case debugger::EventReply::CmdCase::kSetCmd:
      cmd = DebuggerCommand::kSetCMD;
      break;
    case debugger::EventReply::CmdCase::kViewCmd:
      cmd = DebuggerCommand::kViewCMD;
      break;
    case debugger::EventReply::CmdCase::kVersionMatched:
      cmd = DebuggerCommand::kVersionMatchedCMD;
      break;
    default:
      MS_LOG(DEBUG) << "Debug: UnknownCMD";
      break;
  }
  return cmd;
}
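
// The following helpers extract fields from an EventReply; each one checks that the reply carries the expected
// command and returns a default value (logging an error) when it does not.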
ProtoVector<debugger::WatchCondition_Parameter> GetParameters(const debugger::EventReply &reply) {
  if (!reply.has_set_cmd() || !reply.set_cmd().has_watch_condition()) {
    MS_LOG(ERROR) << "Error: Cannot get Parameters from command. Returning default value: ProtoVector<Parameter>().";
    return ProtoVector<debugger::WatchCondition_Parameter>();
  }
  return reply.set_cmd().watch_condition().params();
}

ProtoVector<debugger::WatchNode> GetWatchnodes(const debugger::EventReply &reply) {
  if (!reply.has_set_cmd()) {
    MS_LOG(ERROR) << "Error: Not SetCMD, cannot get WatchNodes. Returning default value: ProtoVector<WatchNode>().";
    return ProtoVector<debugger::WatchNode>();
  }
  return reply.set_cmd().watch_nodes();
}

std::string GetNodeName(const debugger::EventReply &reply) {
  if (!reply.has_run_cmd()) {
    MS_LOG(ERROR) << "Error: Not RunCMD, cannot get NodeName. Returning default value: \"\".";
    return "";
  }
  return reply.run_cmd().node_name();
}

std::string GetRunLevel(const debugger::EventReply &reply) {
  if (!reply.has_run_cmd()) {
    MS_LOG(ERROR) << "Error: Not RunCMD, cannot get RunLevel. Returning default value: \"\".";
    return "";
  }
  return reply.run_cmd().run_level();
}

debugger::WatchCondition GetWatchcondition(const debugger::EventReply &reply) {
  if (!reply.has_set_cmd() || !reply.set_cmd().has_watch_condition()) {
    MS_LOG(ERROR) << "Error: Cannot get WatchCondition from command. Returning default value: WatchCondition().";
    return debugger::WatchCondition();
  }
  return reply.set_cmd().watch_condition();
}

int32_t GetWatchpointID(const debugger::EventReply &reply) {
  if (!reply.has_set_cmd()) {
    MS_LOG(ERROR) << "Error: Not SetCMD, cannot get Watchpoint ID. Returning default value: 0.";
    return 0;
  }
  return reply.set_cmd().id();
}

bool GetWatchpointDelete(const debugger::EventReply &reply) {
  if (!reply.has_set_cmd()) {
    MS_LOG(ERROR) << "Error: Not SetCMD, cannot get Watchpoint delete flag. Returning default value: false.";
    return false;
  }
  return reply.set_cmd().delete_();
}

ProtoVector<debugger::TensorProto> GetTensors(const debugger::EventReply &reply) {
  if (!reply.has_view_cmd()) {
    MS_LOG(ERROR) << "Error: Not ViewCMD, cannot get Tensors. Returning default value: ProtoVector<TensorProto>().";
    return ProtoVector<debugger::TensorProto>();
  }
  return reply.view_cmd().tensors();
}

bool GetMiVersionMatched(const debugger::EventReply &reply) { return reply.version_matched(); }
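
// Builds the full tensor name as "node_name:slot[:iter]", dropping the scope from the node name when truncate is set.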
std::string GetTensorFullName(const debugger::TensorProto &tensor) {
  std::string node_name = tensor.node_name();
  if (tensor.truncate()) {
    // Scopes in the node name are separated by '/'.
    // Use the name without the scope if truncate is true.
    std::size_t found = node_name.find_last_of('/');
    node_name = node_name.substr(found + 1);
  }
  return node_name + ":" + tensor.slot() + (tensor.iter() == "" ? "" : ":" + tensor.iter());
}
}  // namespace mindspore