• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020-2021 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "debug/data_dump/e2e_dump.h"
18 
19 #include <unistd.h>
20 #include <algorithm>
21 #include <map>
22 #include <vector>
23 #include "debug/data_dump/dump_json_parser.h"
24 #include "common/trans.h"
25 #include "debug/anf_ir_utils.h"
26 #include "debug/common.h"
27 #include "backend/session/anf_runtime_algorithm.h"
28 #include "utils/ms_context.h"
29 #include "runtime/device/kernel_runtime_manager.h"
30 #include "utils/config_manager.h"
31 #include "utils/file_utils.h"
32 #ifdef ENABLE_DEBUGGER
33 #include "debug/debug_services.h"
34 #include "debug/tensor_load.h"
35 #include "debug/debugger/debugger.h"
36 #endif
37 
38 namespace mindspore {
IsDeviceTargetGPU()39 bool E2eDump::IsDeviceTargetGPU() {
40   auto context = MsContext::GetInstance();
41   MS_EXCEPTION_IF_NULL(context);
42   return context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice;
43 }
44 
DumpGPUMemToFile(const std::string & file_path,const std::string & original_kernel_name,const device::DeviceAddress & addr,const ShapeVector & int_shapes,const TypeId & host_type,const TypeId & device_type,bool trans_flag,size_t slot,const Debugger * debugger)45 void E2eDump::DumpGPUMemToFile(const std::string &file_path, const std::string &original_kernel_name,
46                                const device::DeviceAddress &addr, const ShapeVector &int_shapes,
47                                const TypeId &host_type, const TypeId &device_type, bool trans_flag, size_t slot,
48                                const Debugger *debugger) {
49 #ifdef ENABLE_DEBUGGER
50   auto format = kOpFormat_DEFAULT;
51   MS_EXCEPTION_IF_NULL(debugger);
52   auto ret = debugger->DumpTensorToFile(original_kernel_name, trans_flag, file_path, format, int_shapes, host_type,
53                                         device_type, addr.format(), slot);
54   if (!ret) {
55     MS_LOG(ERROR) << "DumpTensorToFile Failed: flag:" << trans_flag << ", path:" << file_path
56                   << ", host_format:" << format;
57   }
58 #endif
59 }
60 
DumpOutput(const session::KernelGraph * graph,const std::string & dump_path,const Debugger * debugger)61 void E2eDump::DumpOutput(const session::KernelGraph *graph, const std::string &dump_path, const Debugger *debugger) {
62   MS_EXCEPTION_IF_NULL(graph);
63   auto &dump_json_parser = DumpJsonParser::GetInstance();
64   if (!dump_json_parser.OutputNeedDump()) {
65     return;
66   }
67   MS_LOG(INFO) << "Start e2e dump output";
68   bool trans_flag = dump_json_parser.trans_flag();
69   const auto &apply_kernels = graph->execution_order();
70   for (const auto &node : apply_kernels) {
71     MS_EXCEPTION_IF_NULL(node);
72     std::string kernel_name = GetKernelNodeName(node);
73     if (!dump_json_parser.NeedDump(kernel_name)) {
74       continue;
75     }
76     DumpJsonParser::GetInstance().MatchKernel(kernel_name);
77     DumpOutputImpl(node, trans_flag, dump_path, &kernel_name, debugger);
78   }
79 }
80 
DumpOutputSingleNode(const CNodePtr & node,const std::string & dump_path,const Debugger * debugger)81 void E2eDump::DumpOutputSingleNode(const CNodePtr &node, const std::string &dump_path, const Debugger *debugger) {
82   auto &dump_json_parser = DumpJsonParser::GetInstance();
83   if (!dump_json_parser.OutputNeedDump()) {
84     return;
85   }
86   bool trans_flag = dump_json_parser.trans_flag();
87   MS_EXCEPTION_IF_NULL(node);
88   std::string kernel_name = GetKernelNodeName(node);
89   if (!dump_json_parser.NeedDump(kernel_name)) {
90     return;
91   }
92   DumpJsonParser::GetInstance().MatchKernel(kernel_name);
93   DumpOutputImpl(node, trans_flag, dump_path, &kernel_name, debugger);
94 }
95 
DumpOutputImpl(const CNodePtr & node,bool trans_flag,const std::string & dump_path,std::string * kernel_name,const Debugger * debugger)96 void E2eDump::DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
97                              std::string *kernel_name, const Debugger *debugger) {
98   MS_EXCEPTION_IF_NULL(node);
99   GetFileKernelName(NOT_NULL(kernel_name));
100   auto output_size = AnfAlgo::GetOutputTensorNum(node);
101   for (size_t j = 0; j < output_size; ++j) {
102     if (!AnfAlgo::OutputAddrExist(node, j)) {
103       continue;
104     }
105     auto addr = AnfAlgo::GetOutputAddr(node, j);
106     MS_EXCEPTION_IF_NULL(addr);
107     ShapeVector int_shapes;
108     GetDumpIntShape(node, j, NOT_NULL(&int_shapes), trans_flag);
109     auto type = AnfAlgo::GetOutputInferDataType(node, j);
110     auto device_type = AnfAlgo::GetOutputDeviceDataType(node, j);
111     std::string op_type = AnfAlgo::GetCNodeName(node);
112     std::string op_name = GetOpNameWithoutScope(*kernel_name);
113     uint32_t task_id = 0;
114     uint32_t stream_id = 0;
115     uint64_t timestamp = GetTimeStamp();
116     std::string file_path = dump_path + '/' + op_type + '.' + op_name + '.' + std::to_string(task_id) + '.' +
117                             std::to_string(stream_id) + '.' + std::to_string(timestamp) + ".output." +
118                             std::to_string(j);
119     if (IsDeviceTargetGPU()) {
120       DumpGPUMemToFile(file_path, GetKernelNodeName(node), *addr, int_shapes, type, device_type, trans_flag, j,
121                        debugger);
122     } else {
123       DumpMemToFile(file_path, *addr, int_shapes, type, trans_flag);
124     }
125   }
126 }
127 
DumpInput(const session::KernelGraph * graph,const std::string & dump_path,const Debugger * debugger)128 void E2eDump::DumpInput(const session::KernelGraph *graph, const std::string &dump_path, const Debugger *debugger) {
129   MS_EXCEPTION_IF_NULL(graph);
130   auto &dump_json_parser = DumpJsonParser::GetInstance();
131   if (!dump_json_parser.InputNeedDump()) {
132     return;
133   }
134   MS_LOG(INFO) << "Start e2e dump input";
135   bool trans_flag = dump_json_parser.trans_flag();
136   const auto &apply_kernels = graph->execution_order();
137   for (const auto &node : apply_kernels) {
138     MS_EXCEPTION_IF_NULL(node);
139     std::string kernel_name = GetKernelNodeName(node);
140     if (!dump_json_parser.NeedDump(kernel_name)) {
141       continue;
142     }
143     DumpJsonParser::GetInstance().MatchKernel(kernel_name);
144     DumpInputImpl(node, trans_flag, dump_path, &kernel_name, debugger);
145   }
146 }
147 
DumpInputSingleNode(const CNodePtr & node,const std::string & dump_path,const Debugger * debugger)148 void E2eDump::DumpInputSingleNode(const CNodePtr &node, const std::string &dump_path, const Debugger *debugger) {
149   auto &dump_json_parser = DumpJsonParser::GetInstance();
150   if (!dump_json_parser.InputNeedDump()) {
151     return;
152   }
153   bool trans_flag = dump_json_parser.trans_flag();
154   MS_EXCEPTION_IF_NULL(node);
155   std::string kernel_name = GetKernelNodeName(node);
156   if (!dump_json_parser.NeedDump(kernel_name)) {
157     return;
158   }
159   DumpJsonParser::GetInstance().MatchKernel(kernel_name);
160   DumpInputImpl(node, trans_flag, dump_path, &kernel_name, debugger);
161 }
162 
DumpInputImpl(const CNodePtr & node,bool trans_flag,const std::string & dump_path,std::string * kernel_name,const Debugger * debugger)163 void E2eDump::DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
164                             std::string *kernel_name, const Debugger *debugger) {
165   MS_EXCEPTION_IF_NULL(node);
166   GetFileKernelName(NOT_NULL(kernel_name));
167   auto input_size = AnfAlgo::GetInputTensorNum(node);
168   for (size_t j = 0; j < input_size; ++j) {
169     auto kernel_with_index = AnfAlgo::GetPrevNodeOutput(node, j);
170     auto input = kernel_with_index.first;
171     auto index = kernel_with_index.second;
172     if (!AnfAlgo::OutputAddrExist(input, index)) {
173       continue;
174     }
175     auto addr = AnfAlgo::GetOutputAddr(input, index);
176     MS_EXCEPTION_IF_NULL(addr);
177 
178     std::string tensor_name = GetKernelNodeName(node);
179     size_t slot = j;
180     if (IsDeviceTargetGPU()) {
181       auto input_kernel = node->input(j + 1);
182       std::string input_kernel_name = GetKernelNodeName(input_kernel);
183       tensor_name = input_kernel_name;
184       slot = 0;
185     }
186     ShapeVector int_shapes;
187     GetDumpIntShape(input, index, NOT_NULL(&int_shapes), trans_flag);
188     auto type = AnfAlgo::GetOutputInferDataType(input, index);
189     auto device_type = AnfAlgo::GetOutputDeviceDataType(input, index);
190     std::string op_type = AnfAlgo::GetCNodeName(node);
191     std::string op_name = GetOpNameWithoutScope(*kernel_name);
192     uint64_t timestamp = GetTimeStamp();
193     uint32_t task_id = 0;
194     uint32_t stream_id = 0;
195     std::string file_path = dump_path + '/' + op_type + '.' + op_name + '.' + std::to_string(task_id) + '.' +
196                             std::to_string(stream_id) + '.' + std::to_string(timestamp) + ".input." + std::to_string(j);
197     MS_EXCEPTION_IF_NULL(addr);
198     if (IsDeviceTargetGPU()) {
199       DumpGPUMemToFile(file_path, tensor_name, *addr, int_shapes, type, device_type, trans_flag, slot, debugger);
200     } else {
201       DumpMemToFile(file_path, *addr, int_shapes, type, trans_flag);
202     }
203   }
204 }
205 
DumpSingleAnfNode(const AnfNodePtr & anf_node,const size_t output_index,const std::string & dump_path,bool trans_flag,std::map<std::string,size_t> * const_map,const Debugger * debugger)206 void E2eDump::DumpSingleAnfNode(const AnfNodePtr &anf_node, const size_t output_index, const std::string &dump_path,
207                                 bool trans_flag, std::map<std::string, size_t> *const_map, const Debugger *debugger) {
208   MS_EXCEPTION_IF_NULL(anf_node);
209   auto &dump_json_parser = DumpJsonParser::GetInstance();
210   if ((!anf_node->isa<Parameter>() && !anf_node->isa<ValueNode>()) || IsValueNode<StringImm>(anf_node)) {
211     return;
212   }
213   std::string node_name = GetKernelNodeName(anf_node);
214   std::string dump_name = node_name;
215   if (anf_node->isa<ValueNode>()) {
216     MS_EXCEPTION_IF_NULL(const_map);
217     auto iter = const_map->find(node_name);
218     if (iter == const_map->end()) {
219       return;
220     }
221     dump_name = std::string("cst") + std::to_string(iter->second);
222   }
223 
224   if (!dump_json_parser.NeedDump(node_name)) {
225     return;
226   }
227   DumpJsonParser::GetInstance().MatchKernel(node_name);
228   GetFileKernelName(NOT_NULL(&node_name));
229   // check if output address exists, if not, return;
230   if (!AnfAlgo::OutputAddrExist(anf_node, output_index)) {
231     return;
232   }
233   auto addr = AnfAlgo::GetOutputAddr(anf_node, output_index);
234   MS_EXCEPTION_IF_NULL(addr);
235   ShapeVector int_shapes;
236   GetDumpIntShape(anf_node, output_index, NOT_NULL(&int_shapes), trans_flag);
237   auto type = AnfAlgo::GetOutputInferDataType(anf_node, output_index);
238   auto device_type = AnfAlgo::GetOutputDeviceDataType(anf_node, output_index);
239   uint64_t timestamp = GetTimeStamp();
240   uint32_t task_id = 0;
241   uint32_t stream_id = 0;
242   std::string file_path = dump_path + "/Parameter." + dump_name + '.' + std::to_string(task_id) + '.' +
243                           std::to_string(stream_id) + '.' + std::to_string(timestamp) + ".output.0";
244   if (IsDeviceTargetGPU()) {
245     DumpGPUMemToFile(file_path, node_name, *addr, int_shapes, type, device_type, trans_flag, 0, debugger);
246   } else {
247     DumpMemToFile(file_path, *addr, int_shapes, type, trans_flag);
248   }
249 }
250 
DumpParametersAndConst(const session::KernelGraph * graph,const std::string & dump_path,const Debugger * debugger)251 void E2eDump::DumpParametersAndConst(const session::KernelGraph *graph, const std::string &dump_path,
252                                      const Debugger *debugger) {
253   MS_EXCEPTION_IF_NULL(graph);
254   auto &dump_json_parser = DumpJsonParser::GetInstance();
255   if (!dump_json_parser.OutputNeedDump()) {
256     return;
257   }
258   MS_LOG(INFO) << "Start e2e dump parameters and Const values";
259   bool trans_flag = dump_json_parser.trans_flag();
260   std::map<std::string, size_t> const_map;
261   GetConstantId(graph, &const_map);
262 
263   // dump parameters
264   const auto &parameters = graph->inputs();
265   for (auto &item : parameters) {
266     DumpSingleAnfNode(item, PARAMETER_OUTPUT_INDEX, dump_path, trans_flag, &const_map, debugger);
267   }
268   // dump const values
269   auto value_nodes = graph->graph_value_nodes();
270   for (const auto &value_node : value_nodes) {
271     DumpSingleAnfNode(value_node, VALUE_NODE_OUTPUT_INDEX, dump_path, trans_flag, &const_map, debugger);
272   }
273 }
274 
UpdateIterDumpSetup(const session::KernelGraph * graph,bool sink_mode)275 void E2eDump::UpdateIterDumpSetup(const session::KernelGraph *graph, bool sink_mode) {
276   uint32_t graph_id = graph->graph_id();
277   auto &dump_json_parser = DumpJsonParser::GetInstance();
278   if (IsDeviceTargetGPU()) {
279     if (starting_graph_id == INT32_MAX) {
280       starting_graph_id = graph_id;
281     } else if (starting_graph_id == graph_id && !MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
282       // Update dump iter for mindrt runtime is done using UpdateIterGPUDump().
283       // Update dump iter for GPU old runtime.
284       dump_json_parser.UpdateDumpIter();
285     }
286     return;
287   }
288   // If device target is Ascend
289   if (sink_mode && graph->IsDatasetGraph()) {
290     MS_LOG(INFO) << "No need to update iteration for dataset graph.";
291     return;
292   }
293   if (starting_graph_id == INT32_MAX) {
294     // Identify the first graph id and not increasing dump iter for the first iteration (initial dump iter = 0).
295     starting_graph_id = graph_id;
296   } else {
297     // In multi network scripts, dump iter is equal to the number of networks that have been run so far.
298     dump_json_parser.UpdateDumpIter();
299   }
300 }
301 
DumpSetup(const session::KernelGraph * graph)302 void E2eDump::DumpSetup(const session::KernelGraph *graph) {
303   auto &dump_json_parser = DumpJsonParser::GetInstance();
304   bool sink_mode = (ConfigManager::GetInstance().dataset_mode() || E2eDump::isDatasetGraph(graph));
305 
306   if (dump_json_parser.async_dump_enabled() || dump_json_parser.e2e_dump_enabled()) {
307     UpdateIterDumpSetup(graph, sink_mode);
308   }
309 }
310 
UpdateIterGPUDump()311 void E2eDump::UpdateIterGPUDump() {
312   if (starting_graph_id != INT32_MAX) {
313     DumpJsonParser::GetInstance().UpdateDumpIter();
314   }
315 }
316 
DumpData(const session::KernelGraph * graph,uint32_t rank_id,const Debugger * debugger)317 void E2eDump::DumpData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger) {
318   MS_EXCEPTION_IF_NULL(graph);
319   bool success = false;
320   auto &dump_json_parser = DumpJsonParser::GetInstance();
321   uint32_t graph_id = graph->graph_id();
322 
323   if (dump_json_parser.GetIterDumpFlag()) {
324     MS_LOG(INFO) << "Start e2e dump. Current iteration is " << dump_json_parser.cur_dump_iter();
325     MS_LOG(INFO) << "Current graph id is " << graph_id;
326     std::string dump_path = GenerateDumpPath(graph_id, rank_id);
327 
328     DumpInput(graph, dump_path, debugger);
329     DumpOutput(graph, dump_path, debugger);
330     DumpParametersAndConst(graph, dump_path, debugger);
331     success = true;
332   }
333 
334   if (success) {
335     MS_LOG(DEBUG) << "E2eDump Dump Data completed!";
336   } else {
337     MS_LOG(DEBUG) << "E2eDump Dump has not occurred!";
338   }
339 }
340 
DumpSingleNodeData(const CNodePtr & node,uint32_t graph_id,uint32_t rank_id,const Debugger * debugger)341 bool E2eDump::DumpSingleNodeData(const CNodePtr &node, uint32_t graph_id, uint32_t rank_id, const Debugger *debugger) {
342   bool success = false;
343   auto &dump_json_parser = DumpJsonParser::GetInstance();
344   if (dump_json_parser.GetIterDumpFlag()) {
345     std::string dump_path = GenerateDumpPath(graph_id, rank_id);
346     DumpInputSingleNode(node, dump_path, debugger);
347     DumpOutputSingleNode(node, dump_path, debugger);
348     success = true;
349   }
350   return success;
351 }
352 
DumpParametersAndConstData(const session::KernelGraph * graph,uint32_t rank_id,const Debugger * debugger)353 bool E2eDump::DumpParametersAndConstData(const session::KernelGraph *graph, uint32_t rank_id,
354                                          const Debugger *debugger) {
355   bool success = false;
356   uint32_t graph_id = graph->graph_id();
357   auto &dump_json_parser = DumpJsonParser::GetInstance();
358   if (dump_json_parser.GetIterDumpFlag()) {
359     MS_LOG(INFO) << "DumpParametersAndConst. Current iteration is " << dump_json_parser.cur_dump_iter();
360     MS_LOG(INFO) << "Current graph id is " << graph_id;
361     std::string dump_path = GenerateDumpPath(graph_id, rank_id);
362     DumpParametersAndConst(graph, dump_path, debugger);
363     success = true;
364   }
365   return success;
366 }
isDatasetGraph(const session::KernelGraph * graph)367 bool E2eDump::isDatasetGraph(const session::KernelGraph *graph) {
368   // check if there is GetNext or InitDataSetQueue node
369   const auto &nodes = graph->execution_order();
370   for (const auto &node : nodes) {
371     auto node_name = AnfAlgo::GetCNodeName(node);
372     if (node_name == prim::kPrimGetNext->name() || node_name == prim::kPrimInitDataSetQueue->name()) {
373       return true;
374     }
375   }
376   return false;
377 }
378 
DumpDirExists(const std::string & dump_path)379 bool E2eDump::DumpDirExists(const std::string &dump_path) {
380   DIR *dir = opendir(dump_path.c_str());
381   if (dir != nullptr) {
382     MS_LOG(INFO) << "Dump dir " << dump_path << " exists";
383     if (closedir(dir) == -1) {
384       MS_LOG(WARNING) << "Dump dir " << dump_path << " close failed!";
385     }
386     return true;
387   }
388   return false;
389 }
390 }  // namespace mindspore
391