1 /**
2 * Copyright 2020-2021 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
#include "debug/data_dump/e2e_dump.h"

#include <dirent.h>
#include <unistd.h>
#include <algorithm>
#include <map>
#include <vector>
#include "debug/data_dump/dump_json_parser.h"
#include "common/trans.h"
#include "debug/anf_ir_utils.h"
#include "debug/common.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "utils/ms_context.h"
#include "runtime/device/kernel_runtime_manager.h"
#include "utils/config_manager.h"
#include "utils/file_utils.h"
32 #ifdef ENABLE_DEBUGGER
33 #include "debug/debug_services.h"
34 #include "debug/tensor_load.h"
35 #include "debug/debugger/debugger.h"
36 #endif
37
38 namespace mindspore {
IsDeviceTargetGPU()39 bool E2eDump::IsDeviceTargetGPU() {
40 auto context = MsContext::GetInstance();
41 MS_EXCEPTION_IF_NULL(context);
42 return context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice;
43 }
44
DumpGPUMemToFile(const std::string & file_path,const std::string & original_kernel_name,const device::DeviceAddress & addr,const ShapeVector & int_shapes,const TypeId & host_type,const TypeId & device_type,bool trans_flag,size_t slot,const Debugger * debugger)45 void E2eDump::DumpGPUMemToFile(const std::string &file_path, const std::string &original_kernel_name,
46 const device::DeviceAddress &addr, const ShapeVector &int_shapes,
47 const TypeId &host_type, const TypeId &device_type, bool trans_flag, size_t slot,
48 const Debugger *debugger) {
49 #ifdef ENABLE_DEBUGGER
50 auto format = kOpFormat_DEFAULT;
51 MS_EXCEPTION_IF_NULL(debugger);
52 auto ret = debugger->DumpTensorToFile(original_kernel_name, trans_flag, file_path, format, int_shapes, host_type,
53 device_type, addr.format(), slot);
54 if (!ret) {
55 MS_LOG(ERROR) << "DumpTensorToFile Failed: flag:" << trans_flag << ", path:" << file_path
56 << ", host_format:" << format;
57 }
58 #endif
59 }
60
DumpOutput(const session::KernelGraph * graph,const std::string & dump_path,const Debugger * debugger)61 void E2eDump::DumpOutput(const session::KernelGraph *graph, const std::string &dump_path, const Debugger *debugger) {
62 MS_EXCEPTION_IF_NULL(graph);
63 auto &dump_json_parser = DumpJsonParser::GetInstance();
64 if (!dump_json_parser.OutputNeedDump()) {
65 return;
66 }
67 MS_LOG(INFO) << "Start e2e dump output";
68 bool trans_flag = dump_json_parser.trans_flag();
69 const auto &apply_kernels = graph->execution_order();
70 for (const auto &node : apply_kernels) {
71 MS_EXCEPTION_IF_NULL(node);
72 std::string kernel_name = GetKernelNodeName(node);
73 if (!dump_json_parser.NeedDump(kernel_name)) {
74 continue;
75 }
76 DumpJsonParser::GetInstance().MatchKernel(kernel_name);
77 DumpOutputImpl(node, trans_flag, dump_path, &kernel_name, debugger);
78 }
79 }
80
DumpOutputSingleNode(const CNodePtr & node,const std::string & dump_path,const Debugger * debugger)81 void E2eDump::DumpOutputSingleNode(const CNodePtr &node, const std::string &dump_path, const Debugger *debugger) {
82 auto &dump_json_parser = DumpJsonParser::GetInstance();
83 if (!dump_json_parser.OutputNeedDump()) {
84 return;
85 }
86 bool trans_flag = dump_json_parser.trans_flag();
87 MS_EXCEPTION_IF_NULL(node);
88 std::string kernel_name = GetKernelNodeName(node);
89 if (!dump_json_parser.NeedDump(kernel_name)) {
90 return;
91 }
92 DumpJsonParser::GetInstance().MatchKernel(kernel_name);
93 DumpOutputImpl(node, trans_flag, dump_path, &kernel_name, debugger);
94 }
95
DumpOutputImpl(const CNodePtr & node,bool trans_flag,const std::string & dump_path,std::string * kernel_name,const Debugger * debugger)96 void E2eDump::DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
97 std::string *kernel_name, const Debugger *debugger) {
98 MS_EXCEPTION_IF_NULL(node);
99 GetFileKernelName(NOT_NULL(kernel_name));
100 auto output_size = AnfAlgo::GetOutputTensorNum(node);
101 for (size_t j = 0; j < output_size; ++j) {
102 if (!AnfAlgo::OutputAddrExist(node, j)) {
103 continue;
104 }
105 auto addr = AnfAlgo::GetOutputAddr(node, j);
106 MS_EXCEPTION_IF_NULL(addr);
107 ShapeVector int_shapes;
108 GetDumpIntShape(node, j, NOT_NULL(&int_shapes), trans_flag);
109 auto type = AnfAlgo::GetOutputInferDataType(node, j);
110 auto device_type = AnfAlgo::GetOutputDeviceDataType(node, j);
111 std::string op_type = AnfAlgo::GetCNodeName(node);
112 std::string op_name = GetOpNameWithoutScope(*kernel_name);
113 uint32_t task_id = 0;
114 uint32_t stream_id = 0;
115 uint64_t timestamp = GetTimeStamp();
116 std::string file_path = dump_path + '/' + op_type + '.' + op_name + '.' + std::to_string(task_id) + '.' +
117 std::to_string(stream_id) + '.' + std::to_string(timestamp) + ".output." +
118 std::to_string(j);
119 if (IsDeviceTargetGPU()) {
120 DumpGPUMemToFile(file_path, GetKernelNodeName(node), *addr, int_shapes, type, device_type, trans_flag, j,
121 debugger);
122 } else {
123 DumpMemToFile(file_path, *addr, int_shapes, type, trans_flag);
124 }
125 }
126 }
127
DumpInput(const session::KernelGraph * graph,const std::string & dump_path,const Debugger * debugger)128 void E2eDump::DumpInput(const session::KernelGraph *graph, const std::string &dump_path, const Debugger *debugger) {
129 MS_EXCEPTION_IF_NULL(graph);
130 auto &dump_json_parser = DumpJsonParser::GetInstance();
131 if (!dump_json_parser.InputNeedDump()) {
132 return;
133 }
134 MS_LOG(INFO) << "Start e2e dump input";
135 bool trans_flag = dump_json_parser.trans_flag();
136 const auto &apply_kernels = graph->execution_order();
137 for (const auto &node : apply_kernels) {
138 MS_EXCEPTION_IF_NULL(node);
139 std::string kernel_name = GetKernelNodeName(node);
140 if (!dump_json_parser.NeedDump(kernel_name)) {
141 continue;
142 }
143 DumpJsonParser::GetInstance().MatchKernel(kernel_name);
144 DumpInputImpl(node, trans_flag, dump_path, &kernel_name, debugger);
145 }
146 }
147
DumpInputSingleNode(const CNodePtr & node,const std::string & dump_path,const Debugger * debugger)148 void E2eDump::DumpInputSingleNode(const CNodePtr &node, const std::string &dump_path, const Debugger *debugger) {
149 auto &dump_json_parser = DumpJsonParser::GetInstance();
150 if (!dump_json_parser.InputNeedDump()) {
151 return;
152 }
153 bool trans_flag = dump_json_parser.trans_flag();
154 MS_EXCEPTION_IF_NULL(node);
155 std::string kernel_name = GetKernelNodeName(node);
156 if (!dump_json_parser.NeedDump(kernel_name)) {
157 return;
158 }
159 DumpJsonParser::GetInstance().MatchKernel(kernel_name);
160 DumpInputImpl(node, trans_flag, dump_path, &kernel_name, debugger);
161 }
162
DumpInputImpl(const CNodePtr & node,bool trans_flag,const std::string & dump_path,std::string * kernel_name,const Debugger * debugger)163 void E2eDump::DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
164 std::string *kernel_name, const Debugger *debugger) {
165 MS_EXCEPTION_IF_NULL(node);
166 GetFileKernelName(NOT_NULL(kernel_name));
167 auto input_size = AnfAlgo::GetInputTensorNum(node);
168 for (size_t j = 0; j < input_size; ++j) {
169 auto kernel_with_index = AnfAlgo::GetPrevNodeOutput(node, j);
170 auto input = kernel_with_index.first;
171 auto index = kernel_with_index.second;
172 if (!AnfAlgo::OutputAddrExist(input, index)) {
173 continue;
174 }
175 auto addr = AnfAlgo::GetOutputAddr(input, index);
176 MS_EXCEPTION_IF_NULL(addr);
177
178 std::string tensor_name = GetKernelNodeName(node);
179 size_t slot = j;
180 if (IsDeviceTargetGPU()) {
181 auto input_kernel = node->input(j + 1);
182 std::string input_kernel_name = GetKernelNodeName(input_kernel);
183 tensor_name = input_kernel_name;
184 slot = 0;
185 }
186 ShapeVector int_shapes;
187 GetDumpIntShape(input, index, NOT_NULL(&int_shapes), trans_flag);
188 auto type = AnfAlgo::GetOutputInferDataType(input, index);
189 auto device_type = AnfAlgo::GetOutputDeviceDataType(input, index);
190 std::string op_type = AnfAlgo::GetCNodeName(node);
191 std::string op_name = GetOpNameWithoutScope(*kernel_name);
192 uint64_t timestamp = GetTimeStamp();
193 uint32_t task_id = 0;
194 uint32_t stream_id = 0;
195 std::string file_path = dump_path + '/' + op_type + '.' + op_name + '.' + std::to_string(task_id) + '.' +
196 std::to_string(stream_id) + '.' + std::to_string(timestamp) + ".input." + std::to_string(j);
197 MS_EXCEPTION_IF_NULL(addr);
198 if (IsDeviceTargetGPU()) {
199 DumpGPUMemToFile(file_path, tensor_name, *addr, int_shapes, type, device_type, trans_flag, slot, debugger);
200 } else {
201 DumpMemToFile(file_path, *addr, int_shapes, type, trans_flag);
202 }
203 }
204 }
205
// Dumps the tensor at `output_index` of one Parameter or ValueNode (constant)
// to `dump_path`. Anything that is neither a Parameter nor a ValueNode is
// skipped, as are StringImm value nodes. Constants are dumped under the
// "cst<N>" alias recorded in `const_map` (see GetConstantId); parameters keep
// their node name. File name layout:
//   Parameter.<dump_name>.<task>.<stream>.<timestamp>.output.0
void E2eDump::DumpSingleAnfNode(const AnfNodePtr &anf_node, const size_t output_index, const std::string &dump_path,
                                bool trans_flag, std::map<std::string, size_t> *const_map, const Debugger *debugger) {
  MS_EXCEPTION_IF_NULL(anf_node);
  auto &dump_json_parser = DumpJsonParser::GetInstance();
  if ((!anf_node->isa<Parameter>() && !anf_node->isa<ValueNode>()) || IsValueNode<StringImm>(anf_node)) {
    return;
  }
  std::string node_name = GetKernelNodeName(anf_node);
  // `dump_name` is what appears in the file name; for constants it is replaced
  // by the "cst<N>" alias below, while filtering still uses `node_name`.
  std::string dump_name = node_name;
  if (anf_node->isa<ValueNode>()) {
    MS_EXCEPTION_IF_NULL(const_map);
    auto iter = const_map->find(node_name);
    if (iter == const_map->end()) {
      // Constant was not registered by GetConstantId; nothing to dump.
      return;
    }
    dump_name = std::string("cst") + std::to_string(iter->second);
  }

  if (!dump_json_parser.NeedDump(node_name)) {
    return;
  }
  DumpJsonParser::GetInstance().MatchKernel(node_name);
  GetFileKernelName(NOT_NULL(&node_name));
  // check if output address exists, if not, return;
  if (!AnfAlgo::OutputAddrExist(anf_node, output_index)) {
    return;
  }
  auto addr = AnfAlgo::GetOutputAddr(anf_node, output_index);
  MS_EXCEPTION_IF_NULL(addr);
  ShapeVector int_shapes;
  GetDumpIntShape(anf_node, output_index, NOT_NULL(&int_shapes), trans_flag);
  auto type = AnfAlgo::GetOutputInferDataType(anf_node, output_index);
  auto device_type = AnfAlgo::GetOutputDeviceDataType(anf_node, output_index);
  uint64_t timestamp = GetTimeStamp();
  // Task/stream ids are not tracked on this dump path; they stay fixed at 0.
  uint32_t task_id = 0;
  uint32_t stream_id = 0;
  std::string file_path = dump_path + "/Parameter." + dump_name + '.' + std::to_string(task_id) + '.' +
                          std::to_string(stream_id) + '.' + std::to_string(timestamp) + ".output.0";
  if (IsDeviceTargetGPU()) {
    DumpGPUMemToFile(file_path, node_name, *addr, int_shapes, type, device_type, trans_flag, 0, debugger);
  } else {
    DumpMemToFile(file_path, *addr, int_shapes, type, trans_flag);
  }
}
250
DumpParametersAndConst(const session::KernelGraph * graph,const std::string & dump_path,const Debugger * debugger)251 void E2eDump::DumpParametersAndConst(const session::KernelGraph *graph, const std::string &dump_path,
252 const Debugger *debugger) {
253 MS_EXCEPTION_IF_NULL(graph);
254 auto &dump_json_parser = DumpJsonParser::GetInstance();
255 if (!dump_json_parser.OutputNeedDump()) {
256 return;
257 }
258 MS_LOG(INFO) << "Start e2e dump parameters and Const values";
259 bool trans_flag = dump_json_parser.trans_flag();
260 std::map<std::string, size_t> const_map;
261 GetConstantId(graph, &const_map);
262
263 // dump parameters
264 const auto ¶meters = graph->inputs();
265 for (auto &item : parameters) {
266 DumpSingleAnfNode(item, PARAMETER_OUTPUT_INDEX, dump_path, trans_flag, &const_map, debugger);
267 }
268 // dump const values
269 auto value_nodes = graph->graph_value_nodes();
270 for (const auto &value_node : value_nodes) {
271 DumpSingleAnfNode(value_node, VALUE_NODE_OUTPUT_INDEX, dump_path, trans_flag, &const_map, debugger);
272 }
273 }
274
UpdateIterDumpSetup(const session::KernelGraph * graph,bool sink_mode)275 void E2eDump::UpdateIterDumpSetup(const session::KernelGraph *graph, bool sink_mode) {
276 uint32_t graph_id = graph->graph_id();
277 auto &dump_json_parser = DumpJsonParser::GetInstance();
278 if (IsDeviceTargetGPU()) {
279 if (starting_graph_id == INT32_MAX) {
280 starting_graph_id = graph_id;
281 } else if (starting_graph_id == graph_id && !MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
282 // Update dump iter for mindrt runtime is done using UpdateIterGPUDump().
283 // Update dump iter for GPU old runtime.
284 dump_json_parser.UpdateDumpIter();
285 }
286 return;
287 }
288 // If device target is Ascend
289 if (sink_mode && graph->IsDatasetGraph()) {
290 MS_LOG(INFO) << "No need to update iteration for dataset graph.";
291 return;
292 }
293 if (starting_graph_id == INT32_MAX) {
294 // Identify the first graph id and not increasing dump iter for the first iteration (initial dump iter = 0).
295 starting_graph_id = graph_id;
296 } else {
297 // In multi network scripts, dump iter is equal to the number of networks that have been run so far.
298 dump_json_parser.UpdateDumpIter();
299 }
300 }
301
DumpSetup(const session::KernelGraph * graph)302 void E2eDump::DumpSetup(const session::KernelGraph *graph) {
303 auto &dump_json_parser = DumpJsonParser::GetInstance();
304 bool sink_mode = (ConfigManager::GetInstance().dataset_mode() || E2eDump::isDatasetGraph(graph));
305
306 if (dump_json_parser.async_dump_enabled() || dump_json_parser.e2e_dump_enabled()) {
307 UpdateIterDumpSetup(graph, sink_mode);
308 }
309 }
310
UpdateIterGPUDump()311 void E2eDump::UpdateIterGPUDump() {
312 if (starting_graph_id != INT32_MAX) {
313 DumpJsonParser::GetInstance().UpdateDumpIter();
314 }
315 }
316
DumpData(const session::KernelGraph * graph,uint32_t rank_id,const Debugger * debugger)317 void E2eDump::DumpData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger) {
318 MS_EXCEPTION_IF_NULL(graph);
319 bool success = false;
320 auto &dump_json_parser = DumpJsonParser::GetInstance();
321 uint32_t graph_id = graph->graph_id();
322
323 if (dump_json_parser.GetIterDumpFlag()) {
324 MS_LOG(INFO) << "Start e2e dump. Current iteration is " << dump_json_parser.cur_dump_iter();
325 MS_LOG(INFO) << "Current graph id is " << graph_id;
326 std::string dump_path = GenerateDumpPath(graph_id, rank_id);
327
328 DumpInput(graph, dump_path, debugger);
329 DumpOutput(graph, dump_path, debugger);
330 DumpParametersAndConst(graph, dump_path, debugger);
331 success = true;
332 }
333
334 if (success) {
335 MS_LOG(DEBUG) << "E2eDump Dump Data completed!";
336 } else {
337 MS_LOG(DEBUG) << "E2eDump Dump has not occurred!";
338 }
339 }
340
DumpSingleNodeData(const CNodePtr & node,uint32_t graph_id,uint32_t rank_id,const Debugger * debugger)341 bool E2eDump::DumpSingleNodeData(const CNodePtr &node, uint32_t graph_id, uint32_t rank_id, const Debugger *debugger) {
342 bool success = false;
343 auto &dump_json_parser = DumpJsonParser::GetInstance();
344 if (dump_json_parser.GetIterDumpFlag()) {
345 std::string dump_path = GenerateDumpPath(graph_id, rank_id);
346 DumpInputSingleNode(node, dump_path, debugger);
347 DumpOutputSingleNode(node, dump_path, debugger);
348 success = true;
349 }
350 return success;
351 }
352
DumpParametersAndConstData(const session::KernelGraph * graph,uint32_t rank_id,const Debugger * debugger)353 bool E2eDump::DumpParametersAndConstData(const session::KernelGraph *graph, uint32_t rank_id,
354 const Debugger *debugger) {
355 bool success = false;
356 uint32_t graph_id = graph->graph_id();
357 auto &dump_json_parser = DumpJsonParser::GetInstance();
358 if (dump_json_parser.GetIterDumpFlag()) {
359 MS_LOG(INFO) << "DumpParametersAndConst. Current iteration is " << dump_json_parser.cur_dump_iter();
360 MS_LOG(INFO) << "Current graph id is " << graph_id;
361 std::string dump_path = GenerateDumpPath(graph_id, rank_id);
362 DumpParametersAndConst(graph, dump_path, debugger);
363 success = true;
364 }
365 return success;
366 }
isDatasetGraph(const session::KernelGraph * graph)367 bool E2eDump::isDatasetGraph(const session::KernelGraph *graph) {
368 // check if there is GetNext or InitDataSetQueue node
369 const auto &nodes = graph->execution_order();
370 for (const auto &node : nodes) {
371 auto node_name = AnfAlgo::GetCNodeName(node);
372 if (node_name == prim::kPrimGetNext->name() || node_name == prim::kPrimInitDataSetQueue->name()) {
373 return true;
374 }
375 }
376 return false;
377 }
378
DumpDirExists(const std::string & dump_path)379 bool E2eDump::DumpDirExists(const std::string &dump_path) {
380 DIR *dir = opendir(dump_path.c_str());
381 if (dir != nullptr) {
382 MS_LOG(INFO) << "Dump dir " << dump_path << " exists";
383 if (closedir(dir) == -1) {
384 MS_LOG(WARNING) << "Dump dir " << dump_path << " close failed!";
385 }
386 return true;
387 }
388 return false;
389 }
390 } // namespace mindspore
391