/**
 * Copyright 2021-2024 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "runtime/graph_scheduler/actor/debug_actor.h"
#include <algorithm>
#include <vector>
#include <memory>
#include <string>
#include "runtime/graph_scheduler/actor/debug_aware_actor.h"
#include "mindrt/include/async/async.h"
#include "utils/log_adapter.h"
#ifndef ENABLE_SECURITY
#include "debug/data_dump/cpu_e2e_dump.h"
#include "include/backend/debug/data_dump/e2e_dump.h"
#include "utils/ms_context.h"
#endif
#ifdef ENABLE_DEBUGGER
#include "include/backend/debug/debugger/debugger.h"
#include "debug/debugger/debugger_utils.h"
#endif
#include "debug/data_dump/data_dumper.h"
#include "include/common/debug/common.h"
#include "utils/file_utils.h"
#include "include/backend/debug/profiler/profiling.h"
#include "ops/nn_op_name.h"

namespace mindspore {
namespace runtime {
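/*
 * Feature group: Dump.
 * Target device group: Ascend.
 * Runtime category: MindRT.
 * Description: Enables ACL dump for the current step. Collects the names of all kernels in the given graphs,
 * creates the per-step dump directory when needed, and initializes the registered Ascend data dumper.
 */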
void DebugActor::ACLDump(uint32_t device_id, const std::vector<KernelGraphPtr> &graphs, bool is_kbyk) {
  std::string env_enable_str = common::GetEnv("MS_ACL_DUMP_CFG_PATH");
  std::string dump_enable_str = common::GetEnv("MINDSPORE_DUMP_CONFIG");

  std::vector<std::string> all_kernel_names;
  for (const auto &graph : graphs) {
    auto all_kernels = graph->execution_order();
    std::for_each(all_kernels.begin(), all_kernels.end(),
                  [&](const auto &k) { all_kernel_names.push_back(k->fullname_with_scope()); });
  }

  auto step_count_num = 0;
  step_count_num = step_count;
  if (step_count == 1 && is_dataset_sink == 1) {
    step_count_num = 0;
  }
  if (!graphs.empty()) {
    auto graph = graphs[0];
    is_dataset_sink = graph->IsDatasetGraph();
  }
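  // Enable dump when async dump is configured and either (a) kernel-by-kernel mode is on and the current step is in
  // the configured dump range, or (b) the ACL dump config path matches the MindSpore dump config (whole-graph path).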
  if (DumpJsonParser::GetInstance().async_dump_enabled() &&
      ((DumpJsonParser::GetInstance().IsDumpIter(step_count_num) && is_kbyk) ||
       (env_enable_str == dump_enable_str && !is_kbyk))) {
    bool is_init = false;
    if ((env_enable_str == dump_enable_str) && !(DumpJsonParser::GetInstance().IsDumpIter(step_count_num))) {
      is_init = true;
    } else {
      std::string dump_path = DumpJsonParser::GetInstance().path();
      std::string dump_path_step = dump_path + "/" + std::to_string(step_count_num);
      auto real_path = FileUtils::CreateNotExistDirs(dump_path_step, false);
      if (!real_path.has_value()) {
        MS_LOG(WARNING) << "Failed to create acl dump dir " << dump_path_step;
        return;
      }
    }
    dump_flag = true;
    auto registered_dumper = datadump::DataDumperRegister::Instance().GetDumperForBackend(device::DeviceType::kAscend);
    if (registered_dumper != nullptr) {
      registered_dumper->Initialize();
      registered_dumper->EnableDump(device_id, step_count_num, is_init, all_kernel_names);
    }
  }
}

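// Validates the debug actor inputs before a kernel is launched; currently it only performs null checks.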
void DebugActor::DebugPreLaunch(const AnfNodePtr &node, const std::vector<DeviceTensor *> &input_device_tensors,
                                const std::vector<DeviceTensor *> &output_device_tensors,
                                const DeviceContext *device_context, OpContext<DeviceTensor> *const op_context,
                                const AID *) {
  MS_EXCEPTION_IF_NULL(node);
  MS_EXCEPTION_IF_NULL(device_context);
  MS_EXCEPTION_IF_NULL(op_context);
}

/*
 * Feature group: Dump, Online debugger.
 * Target device group: GPU.
 * Runtime category: MindRT.
 * Description: Load and read data for the given node if needed. Dump the node if dump is enabled and free the loaded
 * memory after the dump (for GPU and Ascend kernel-by-kernel).
 */
void DebugActor::DebugPostLaunch(const AnfNodePtr &node, const std::vector<DeviceTensor *> &input_device_tensors,
                                 const std::vector<DeviceTensor *> &output_device_tensors,
                                 const DeviceContext *device_context, OpContext<DeviceTensor> *const op_context,
                                 const AID *) {
  MS_EXCEPTION_IF_NULL(node);
  MS_EXCEPTION_IF_NULL(device_context);
  MS_EXCEPTION_IF_NULL(op_context);
  std::lock_guard<std::mutex> locker(debug_mutex_);

  if (!node->isa<CNode>()) {
    return;
  }
  const auto &cnode = node->cast<CNodePtr>();
  MS_EXCEPTION_IF_NULL(cnode);
  MS_LOG(DEBUG) << "kernel by kernel debug for node: " << cnode->fullname_with_scope() << ".";
  if (device_context->GetDeviceType() == device::DeviceType::kAscend) {
#ifdef ENABLE_DEBUGGER
    AscendKbkDump(cnode, input_device_tensors, output_device_tensors, device_context);
#endif
  } else if (device_context->GetDeviceType() == device::DeviceType::kCPU) {
#ifndef ENABLE_SECURITY
    if (DumpJsonParser::GetInstance().op_debug_mode() == DumpJsonParser::DUMP_LITE_EXCEPTION) {
      MS_LOG(WARNING) << "Abnormal dump is not supported on CPU backend.";
      return;
    }
    if (DumpJsonParser::GetInstance().GetIterDumpFlag()) {
      auto kernel_graph = std::dynamic_pointer_cast<session::KernelGraph>(cnode->func_graph());
      MS_EXCEPTION_IF_NULL(kernel_graph);
      CPUE2eDump::DumpCNodeData(cnode, kernel_graph->graph_id());
      CPUE2eDump::DumpRunIter(kernel_graph);
    }
#endif
  } else if (device_context->GetDeviceType() == device::DeviceType::kGPU) {
#ifdef ENABLE_DEBUGGER
    if (DumpJsonParser::GetInstance().op_debug_mode() == DumpJsonParser::DUMP_LITE_EXCEPTION) {
      MS_LOG(WARNING) << "Abnormal dump is not supported on GPU backend.";
      return;
    }
    auto debugger = Debugger::GetInstance();
    if (debugger != nullptr) {
      auto kernel_graph = std::dynamic_pointer_cast<session::KernelGraph>(cnode->func_graph());
      debugger->InsertExecutedGraph(kernel_graph);
      std::string kernel_name = cnode->fullname_with_scope();
      debugger->SetCurNode(kernel_name);
      bool read_data = CheckReadData(cnode);
      if (read_data) {
        ReadDataAndDump(cnode, input_device_tensors, output_device_tensors, exec_order_, device_context);
      }
    }
    exec_order_ += 1;
#endif
  }
}

/*
 * Feature group: Dump, Ascend.
 * Target device group: Ascend.
 * Runtime category: MindRT.
 * Description: Dump data for the given node if needed. It can be a normal dump, an overflow dump, or an exception
 * dump (Ascend kernel-by-kernel e2e dump).
 */
#ifdef ENABLE_DEBUGGER
void DebugActor::AscendKbkDump(const CNodePtr &cnode, const std::vector<DeviceTensor *> &input_device_tensors,
                               const std::vector<DeviceTensor *> &output_device_tensors,
                               const DeviceContext *device_context) {
  auto debugger = Debugger::GetInstance();
  if (debugger != nullptr) {
    auto kernel_graph = std::dynamic_pointer_cast<session::KernelGraph>(cnode->func_graph());
    MS_EXCEPTION_IF_NULL(kernel_graph);
    debugger->InsertExecutedGraph(kernel_graph);
    debugger->SetAscendKernelByKernelFlag(true);
    auto &dump_json_parser = DumpJsonParser::GetInstance();
    bool e2e_dump_enabled = dump_json_parser.e2e_dump_enabled();
    uint32_t op_debug_mode = dump_json_parser.op_debug_mode();
    bool abnormal_dump = false;
    bool sync_ok = true;
    bool read_data = false;
    if (!e2e_dump_enabled) {
      exec_order_ += 1;
      return;
    }
    if (op_debug_mode == DumpJsonParser::DUMP_LITE_EXCEPTION) {
      abnormal_dump = true;
      sync_ok = device_ctx_->device_res_manager_->SyncAllStreams();
      if (!sync_ok) {
        MS_LOG(ERROR) << "Sync stream error! The node inputs will be dumped.";
      }
    } else if (op_debug_mode == DumpJsonParser::DUMP_BOTH_OVERFLOW && dump_json_parser.DumpEnabledForIter()) {
      auto is_overflow = CheckOverflow(device_context, output_device_tensors);
      if (is_overflow) {
        read_data = CheckReadData(cnode);
      }
    } else {
      read_data = CheckReadData(cnode);
    }
    if ((read_data && e2e_dump_enabled) || !sync_ok) {
      ReadDataAndDump(cnode, input_device_tensors, output_device_tensors, exec_order_, device_context, abnormal_dump);
      if (!sync_ok) {
        MS_LOG(EXCEPTION) << "Sync stream error!";
      }
    }
  }
  exec_order_ += 1;
}
#endif
/*
 * Feature group: Dump, Online debugger.
 * Target device group: Ascend, GPU.
 * Runtime category: MindRT.
 * Description: Checks dataset_sink_mode, generates the related error if one exists, and calls
 * PreExecuteGraphDebugger.
 */
void DebugActor::DebugOnStepBegin(const std::vector<KernelGraphPtr> &graphs,
                                  const std::vector<AnfNodePtr> &origin_parameters_order,
                                  std::vector<DeviceContext *> device_contexts,
                                  OpContext<DeviceTensor> *const op_context, const AID *) {
  MS_LOG(INFO) << "Debug on step begin.";
  auto context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context);
  auto is_kbyk = context->IsKByKExecutorMode();
  std::string backend = context->backend_policy();
  device_ctx_ = device_contexts[0];
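  // When profiling is not active on the Ascend backend and the ACL dump config path matches the MindSpore dump
  // config, enable ACL dump for this step.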
  auto profiler = profiler::Profiler::GetInstance(kAscendDevice);
  if ((profiler == nullptr || !profiler->IsInitialized()) &&
      device_ctx_->GetDeviceType() == device::DeviceType::kAscend) {
    auto device_id = context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
    if (common::GetEnv("MS_ACL_DUMP_CFG_PATH") == common::GetEnv("MINDSPORE_DUMP_CONFIG")) {
      ACLDump(device_id, graphs, is_kbyk);
    }
  }
#ifndef ENABLE_SECURITY
  if (DumpJsonParser::GetInstance().e2e_dump_enabled() && !graphs.empty()) {
    // First graph is the dataset graph when dataset_sink_mode = True
    auto graph = graphs[0];
    bool is_dataset_sink = graph->IsDatasetGraph();
    uint32_t cur_step = DumpJsonParser::GetInstance().cur_dump_iter();
    if (cur_step == 1 && DumpJsonParser::GetInstance().GetDatasetSink()) {
      uint32_t init_step = 0;
      DumpJsonParser::GetInstance().UpdateDumpIter(init_step);
      MS_LOG(INFO) << "In dataset sink mode, reset step to init_step: " << init_step;
    }
    DumpJsonParser::GetInstance().SetDatasetSink(is_dataset_sink);
  }
#endif
  if (backend == "ge") {
    return;
  }
  MS_EXCEPTION_IF_NULL(op_context);
  std::lock_guard<std::mutex> locker(debug_mutex_);
#ifdef ENABLE_DEBUGGER
  if (!graphs.empty()) {
    // First graph is the dataset graph when dataset_sink_mode = True
    auto graph = graphs[0];
    std::string error_info = CheckDatasetSinkMode(graph);
    if (!error_info.empty()) {
      SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*op_context), error_info);
    }
  }
  auto debugger = Debugger::GetInstance();
  if (debugger != nullptr && debugger->DebuggerBackendEnabled()) {
    debugger->PreExecuteGraphDebugger(graphs, origin_parameters_order);
  }
#endif
#ifndef ENABLE_SECURITY
  if (DumpJsonParser::GetInstance().e2e_dump_enabled()) {
    DumpJsonParser::GetInstance().ClearGraph();
    if (graphs.size() != device_contexts.size()) {
      SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*op_context), "Graph num:" + std::to_string(graphs.size()) +
                                                         " is not equal to device context size:" +
                                                         std::to_string(device_contexts.size()) + " for debug actor.");
    }
    for (size_t i = 0; i < graphs.size(); ++i) {
      MS_EXCEPTION_IF_NULL(graphs[i]);
      MS_EXCEPTION_IF_NULL(device_contexts[i]);
      if (device_contexts[i]->GetDeviceType() == device::DeviceType::kCPU) {
        DumpJsonParser::GetInstance().SaveGraph(graphs[i].get());
      }
    }
  }
#endif
}

/*
 * Feature group: Dump, Online debugger.
 * Target device group: Ascend, GPU and CPU.
 * Runtime category: MindRT.
 * Description: Dump parameters and constants and update the dump iteration for CPU. Call PostExecuteGraphDebugger
 * for GPU and Ascend and update the step number of the online debugger on GPU.
 */
void DebugActor::DebugOnStepEnd(OpContext<DeviceTensor> *const, const AID *, int total_running_count_) {
  MS_LOG(INFO) << "Debug on step end. total_running_count is: " << total_running_count_;
  auto context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context);
  std::string backend = context->backend_policy();
  step_count = total_running_count_;
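  // If ACL dump was enabled in this step, synchronize all streams and finalize the registered Ascend data dumper.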
  if (dump_flag) {
    auto registered_dumper = datadump::DataDumperRegister::Instance().GetDumperForBackend(device::DeviceType::kAscend);
    if (registered_dumper != nullptr) {
      device_ctx_->device_res_manager_->SyncAllStreams();
      registered_dumper->Finalize();
    }
    dump_flag = false;
  }
  device_ctx_->device_res_manager_->SyncAllStreams();
  std::lock_guard<std::mutex> locker(debug_mutex_);

#ifndef ENABLE_SECURITY
  if (DumpJsonParser::GetInstance().GetIterDumpFlag()) {
    CPUE2eDump::DumpParametersData();
    CPUE2eDump::DumpConstantsData();
  }
#endif

#ifdef ENABLE_DEBUGGER
  auto debugger = Debugger::GetInstance();
  if (debugger != nullptr) {
    if (backend == "ge" && !debugger->GetAscendKernelByKernelFlag()) {
      MS_LOG(INFO) << "Not kernel mode, skip post actions.";
      return;
    }
    // Reset exec_order for the next step
    exec_order_ = 0;
    debugger->Debugger::PostExecuteGraphDebugger();
    debugger->Debugger::UpdateStepNumGPU();
  }
#ifndef ENABLE_SECURITY
  DumpJsonParser::GetInstance().UpdateDumpIter(step_count);
  MS_LOG(INFO) << "UpdateDumpIter: " << step_count;
#endif
#endif
}

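/*
 * Feature group: Dump.
 * Target device group: Ascend.
 * Runtime category: MindRT.
 * Description: Checks whether any floating-point input of the node overflows by launching the AllFinite kernel on
 * the float16/float32/bfloat16 inputs and reading back its boolean output.
 */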
bool DebugActor::CheckOverflow(const DeviceContext *device_context, const std::vector<DeviceTensor *> &inputs) {
  std::vector<KernelTensor *> check_kernel_tensors;
  for (size_t i = 0; i < inputs.size(); i++) {
    auto input = inputs[i]->kernel_tensor().get();
    auto type = input->dtype_id();
    if (type == mindspore::kNumberTypeFloat16 || type == mindspore::kNumberTypeFloat32 ||
        type == mindspore::kNumberTypeBFloat16) {
      check_kernel_tensors.emplace_back(input);
    }
  }
  if (check_kernel_tensors.empty()) {
    return false;
  }
  MS_EXCEPTION_IF_NULL(device_context);
  MS_EXCEPTION_IF_NULL(device_context->device_res_manager_);

  // 1. Get AllFinite kernel mod.
  const auto &kernel_mod_iter = finite_kernel_mods_.find(device_context);
  kernel::KernelModPtr finite_kernel_mod = nullptr;
  if (kernel_mod_iter == finite_kernel_mods_.end()) {
    const auto &new_finite_kernel_mod = device_context->GetKernelExecutor(false)->CreateKernelMod(kAllFiniteOpName);
    MS_EXCEPTION_IF_NULL(new_finite_kernel_mod);
    finite_kernel_mods_.emplace(device_context, new_finite_kernel_mod);
    finite_kernel_mod = new_finite_kernel_mod;
  } else {
    finite_kernel_mod = kernel_mod_iter->second;
  }
  MS_EXCEPTION_IF_NULL(finite_kernel_mod);

  // 2. Get output kernel tensor for AllFinite kernel.
  MS_EXCEPTION_IF_NULL(check_kernel_tensors[0]);
  const auto &stream_id =
    check_kernel_tensors[0]->managed_by_somas() ? kDefaultStreamIndex : check_kernel_tensors[0]->stream_id();
  auto &stream_id_to_output_device_address = finite_output_device_addresses_[device_context];
  if (stream_id_to_output_device_address.find(stream_id) == stream_id_to_output_device_address.end()) {
    auto finite_output_addr = device_context->device_res_manager_->AllocateMemory(1, stream_id);
    MS_EXCEPTION_IF_NULL(finite_output_addr);

    ShapeVector shape_vec = {1};
    auto kernel_tensor = std::make_shared<kernel::KernelTensor>(
      finite_output_addr, 1, Format::DEFAULT_FORMAT, kNumberTypeBool, shape_vec,
      device_context->device_context_key().device_name_, device_context->device_context_key().device_id_);
    kernel_tensor->set_stream_id(stream_id);
    kernel_tensor->SetType(std::make_shared<TensorType>(kBool));
    kernel_tensor->SetShape(std::make_shared<abstract::TensorShape>(shape_vec));
    auto device_address = device_context->device_res_manager_->CreateDeviceAddress(kernel_tensor);
    MS_EXCEPTION_IF_NULL(device_address);
    stream_id_to_output_device_address.emplace(stream_id, device_address);
  }
  auto &output_device_address = stream_id_to_output_device_address[stream_id];
  MS_EXCEPTION_IF_NULL(output_device_address);
  const auto &output_kernel_tensor = output_device_address->kernel_tensor();
  MS_EXCEPTION_IF_NULL(output_kernel_tensor);

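  // 3. Launch the AllFinite kernel on the target stream and read back the boolean overflow flag.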
  void *stream_ptr = device_context->device_res_manager_->GetStream(stream_id);
  MS_EXCEPTION_IF_NULL(stream_ptr);
  bool ret = finite_kernel_mod->Launch(check_kernel_tensors, {}, {output_kernel_tensor.get()}, stream_ptr);
  if (!ret) {
    MS_LOG(EXCEPTION) << "Launch AllFinite kernel failed.";
  }
  return output_kernel_tensor->GetValueWithCheck<bool>();
}

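// Prints the unused kernels recorded by the dump config parser and frees the device memory that was allocated for
// the AllFinite outputs in CheckOverflow.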
void DebugActor::Finalize() {
  DumpJsonParser::GetInstance().PrintUnusedKernel();
  for (const auto &item : finite_output_device_addresses_) {
    auto &stream_id_to_output_device_address_map = item.second;
    auto *device_context = item.first;
    for (const auto &device_address_item : stream_id_to_output_device_address_map) {
      const auto &device_address = device_address_item.second;
      if (device_address && device_context) {
        device_context->device_res_manager_->FreeMemory(device_address->GetMutablePtr());
      }
    }
  }
}
}  // namespace runtime
}  // namespace mindspore