1 /**
2 * Copyright 2019-2021 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "backend/session/ascend_session.h"
17 #include <algorithm>
18 #include <map>
19 #include <tuple>
20 #include <set>
21 #include <unordered_set>
22 #include <string>
23 #include <list>
24
25 #include "base/core_ops.h"
26 #include "base/base_ref_utils.h"
27 #include "ir/tensor.h"
28 #include "ir/anf.h"
29 #include "common/trans.h"
30 #include "runtime/device/kernel_runtime.h"
31 #include "runtime/device/ascend/kernel_select_ascend.h"
32 #include "runtime/device/ascend/kernel_build_ascend.h"
33 #include "runtime/device/ascend/ascend_kernel_runtime.h"
34 #include "runtime/device/ascend/profiling/profiling_manager.h"
35 #include "backend/optimizer/ascend/ascend_backend_optimization.h"
36 #include "backend/optimizer/common/common_backend_optimization.h"
37 #include "backend/optimizer/ascend/mindir/space_batch_nd_attr_update.h"
38 #include "backend/optimizer/ascend/mindir/dropout_unify_mindir.h"
39 #include "backend/optimizer/ascend/mindir/maxpool_to_maxpool_with_argmax.h"
40 #include "backend/optimizer/ascend/mindir/maxpool_with_argmax_unify_mindir.h"
41 #include "backend/optimizer/ascend/mindir/conv2d_unify_mindir.h"
42 #include "backend/optimizer/ascend/mindir/optimizer_unify_output.h"
43 #include "backend/optimizer/ascend/mindir/fake_learned_scale_quant_grad_unify_mindir.h"
44 #include "backend/optimizer/ascend/mindir/sparse_softmax_cross_entropy_with_logits_unify_mindir.h"
45 #include "backend/optimizer/ascend/mindir/slice_grad_unify_mindir.h"
46 #include "backend/optimizer/ascend/mindir/avg_pool_grad_unify_mindir.h"
47 #include "backend/optimizer/ascend/mindir/bn_grad_unify_mindir.h"
48 #include "backend/optimizer/ascend/mindir/all_to_all_unify_mindir.h"
49 #include "runtime/device/kernel_adjust.h"
50 #include "runtime/device/ascend/ascend_stream_assign.h"
51 #include "backend/session/anf_runtime_algorithm.h"
52 #include "utils/ms_utils.h"
53 #include "utils/utils.h"
54 #include "utils/context/graph_kernel_flags.h"
55 #include "backend/optimizer/common/helper.h"
56 #include "runtime/device/kernel_runtime_manager.h"
57 #include "utils/config_manager.h"
58 #ifndef ENABLE_SECURITY
59 #include "debug/data_dump/dump_json_parser.h"
60 #include "debug/data_dump/e2e_dump.h"
61 #endif
62 #include "debug/anf_ir_utils.h"
63 #include "backend/optimizer/graph_kernel/graph_kernel_optimization.h"
64 #include "backend/session/ascend_auto_monad.h"
65 #include "debug/anf_ir_dump.h"
66 #include "debug/dump_proto.h"
67 #include "abstract/utils.h"
68 #ifdef ENABLE_DEBUGGER
69 #include "debug/tensor_load.h"
70 #include "debug/debugger/proto_exporter.h"
71 #else
72 #include "debug/debugger/proto_exporter_stub.h"
73 #endif
74 #include "common/util/error_manager/error_manager.h"
75 #include "toolchain/adx_datadump_server.h"
76 #ifdef ENABLE_DUMP_IR
77 #include "debug/rdr/running_data_recorder.h"
78 #include "debug/rdr/recorder_manager.h"
79 #include "debug/rdr/graph_recorder.h"
80 #endif
81 #if ENABLE_CPU && ENABLE_D
82 #include "ps/util.h"
83 #include "ps/ps_cache/ps_cache_manager.h"
84 #endif
85 #include "runtime/device/ascend/ascend_bucket.h"
86 #ifndef ENABLE_SECURITY
87 #include "profiler/device/ascend/memory_profiling.h"
88
89 using mindspore::device::ascend::ProfilingManager;
90 using mindspore::profiler::ascend::MemoryProfiling;
91 #endif
92
93 namespace mindspore {
94 namespace session {
95 const size_t kInvalidIndex = SIZE_MAX;
96 const size_t kLoopSinkTensorNum = 3;
97 const size_t kLoopSinkCurLoopIndex = 0;
98 const size_t kLoopSinkNextLoopIndex = 1;
99 const size_t kLoopSinkEpochIndex = 2;
100 const size_t kLabelNumsThreshold = 1023;
101 constexpr char SR_TAG[] = "sr_tag";
102 constexpr char BACKWARD[] = "backward";
103 constexpr auto kUnknowErrorString = "Unknown error occurred";
104 namespace {
105 #ifndef ENABLE_SECURITY
106 void DumpGraphExeOrder(const std::vector<CNodePtr> &execution_order, const std::string &tag = "") {
107 MS_LOG(INFO) << "Dump execution_order size " << execution_order.size();
108 MS_LOG(INFO) << "[index][stream_label][graph_id][node string]";
109 int i = 0;
110 for (auto &cnode : execution_order) {
111 MS_EXCEPTION_IF_NULL(cnode);
112 MS_LOG(INFO) << "[ " << i << "]"
113 << "[" << AnfAlgo::GetStreamDistinctionLabel(cnode.get()) << "]"
114 << "[" << AnfAlgo::GetGraphId(cnode.get()) << "]"
115 << "[" << cnode->DebugString() << "]";
116 i++;
117 }
118
119 std::stringstream buf;
120 buf << "================== execution order ==================\n";
121 if (!tag.empty()) {
122 buf << tag << "\n";
123 }
124 buf << "execution_order size: " << execution_order.size() << "\n";
125 i = 0;
126 for (auto &cnode : execution_order) {
127 MS_EXCEPTION_IF_NULL(cnode);
128 buf << i << ":\n";
129 buf << "\t" << cnode->DebugString() << "\n";
130 buf << "\t" << AnfAlgo::GetStreamDistinctionLabel(cnode.get()) << "\n";
131 buf << "\t" << AnfAlgo::GetGraphId(cnode.get()) << "\n";
132 i++;
133 }
134 buf << "================== execution order ==================\n";
135 }
136 #endif
137
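// Graph task sink applies only in graph mode with task sink enabled and multi-graph sink disabled.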
138 bool IsVMGraphTaskSink() {
139 auto ms_context = MsContext::GetInstance();
140 MS_EXCEPTION_IF_NULL(ms_context);
141 if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) != kGraphMode) {
142 return false;
143 }
144 if (ms_context->get_param<bool>(MS_CTX_ENABLE_TASK_SINK) == false) {
145 return false;
146 }
147 if (ms_context->get_param<bool>(MS_CTX_IS_MULTI_GRAPH_SINK) == true) {
148 return false;
149 }
150 return true;
151 }
152
153 // Handle control flow by auto-monad.
154 void HandleControlFlow(NotNull<KernelGraphPtr> graph) {
155 AscendAutoMonad auto_monad(graph);
156 auto_monad.Run();
157 }
158
159 void SetStreamDistinctionLabel(const KernelGraphPtr &graph, uint32_t label, bool is_override) {
160 MS_EXCEPTION_IF_NULL(graph);
161 if (is_override || graph->stream_distinction_label() == kInvalidDistincLabel) {
162 graph->set_stream_distinction_label(label);
163 }
164 }
165
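// Look up the cached stub tensor for a CNode output and report whether that output is a weight.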
166 TensorPtr GetCNodeOutputStubTensor(const KernelWithIndex &kernel_with_index,
167 const std::map<KernelWithIndex, OutputTensorInfo> &node_output_info,
168 bool *output_is_weight) {
169 MS_EXCEPTION_IF_NULL(output_is_weight);
170 const auto &iter = node_output_info.find(kernel_with_index);
171 if (iter == node_output_info.end()) {
172 MS_LOG(EXCEPTION) << "Can not find output stub tensor of cnode " << kernel_with_index.first->DebugString();
173 }
174 *output_is_weight = iter->second.is_weight;
175 return iter->second.output_stub_tensor;
176 }
177
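// Create data-free stub tensors (with device info and an empty device address) for each referenced
// output of the single-op graph, keyed by the kernel and output index.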
178 void GenOpOutputStubTensor(const KernelGraphPtr &single_op_graph, const CNodePtr &kernel,
179 const std::map<KernelWithIndex, size_t> &cnode_refcount,
180 std::map<KernelWithIndex, OutputTensorInfo> *op_output_info) {
181 MS_EXCEPTION_IF_NULL(single_op_graph);
182 MS_EXCEPTION_IF_NULL(kernel);
183 MS_EXCEPTION_IF_NULL(op_output_info);
184 OutputTensorInfo output_tensor_info;
185 size_t out_idx = 0;
186 for (const auto &output : single_op_graph->outputs()) {
187 KernelWithIndex kernel_with_index = std::make_pair(kernel, out_idx++);
188 if (cnode_refcount.find(kernel_with_index) == cnode_refcount.end()) {
189 continue;
190 }
191 const auto &output_kernel_with_index = AnfAlgo::VisitKernel(output, 0);
192 const auto &output_node = output_kernel_with_index.first;
193 const auto &output_index = output_kernel_with_index.second;
194 auto out_abstract = output_node->abstract();
195 MS_EXCEPTION_IF_NULL(out_abstract);
196 if (out_abstract->isa<abstract::AbstractTuple>()) {
197 out_abstract = out_abstract->cast<abstract::AbstractTuplePtr>()->elements()[output_index];
198 MS_EXCEPTION_IF_NULL(out_abstract);
199 }
200 abstract::AbstractTensorPtr tensor_abstract = out_abstract->cast<abstract::AbstractTensorPtr>();
201 MS_EXCEPTION_IF_NULL(tensor_abstract);
202 const auto &infer_type = AnfAlgo::GetOutputInferDataType(output_node, output_index);
203 tensor::TensorPtr stub_output_tensor =
204 std::make_shared<tensor::Tensor>(infer_type, tensor_abstract->shape()->shape(), nullptr);
205 const auto &output_type = AnfAlgo::GetOutputDeviceDataType(output_node, output_index);
206 const auto &output_format = AnfAlgo::GetOutputFormat(output_node, output_index);
207 tensor::DeviceInfo device_info;
208 device_info.format_ = output_format;
209 device_info.data_type_ = TypeIdToType(output_type);
210 stub_output_tensor->set_device_info(device_info);
211 device::DeviceAddressPtr device_address =
212 std::make_shared<device::ascend::AscendDeviceAddress>(nullptr, 0, output_format, output_type);
213 stub_output_tensor->set_device_address(device_address);
214 output_tensor_info.output_stub_tensor = stub_output_tensor;
215 auto kernel_info = dynamic_cast<const device::KernelInfo *>(output_node->kernel_info());
216 MS_EXCEPTION_IF_NULL(kernel_info);
217 output_tensor_info.is_weight = !(kernel_info->is_feature_map());
218 (*op_output_info)[kernel_with_index] = output_tensor_info;
219 }
220 }
221
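// Reset the loop-sink control tensors (current loop, next loop) to 0 and set the epoch tensor.
// The tensors are either appended to inputs or synced to device directly; returns the number of control tensors.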
222 size_t LoadCtrlInputTensor(const std::shared_ptr<KernelGraph> &graph, std::vector<tensor::TensorPtr> *inputs) {
223 MS_EXCEPTION_IF_NULL(graph);
224 MS_LOG(DEBUG) << "Load kInputCtrlTensors";
225 auto inputs_params = graph->input_ctrl_tensors();
226 if (inputs_params == nullptr) {
227 return 0;
228 }
229 if (inputs_params->size() < kLoopSinkTensorNum) {
230 MS_LOG(EXCEPTION) << "Illegal inputs_params size";
231 }
232 // update current loop tensor to 0 per iteration
233 auto cur_loop_tensor = (*inputs_params)[kLoopSinkCurLoopIndex];
234 MS_EXCEPTION_IF_NULL(cur_loop_tensor);
235 auto *cur_val = static_cast<int32_t *>(cur_loop_tensor->data_c());
236 MS_EXCEPTION_IF_NULL(cur_val);
237 *cur_val = 0;
238 cur_loop_tensor->set_sync_status(kNeedSyncHostToDevice);
239 // set loop_count to zero
240 if (inputs != nullptr) {
241 inputs->push_back(cur_loop_tensor);
242 } else {
243 auto device_address = cur_loop_tensor->device_address();
244 if (!device_address->SyncHostToDevice(cur_loop_tensor->shape(), LongToSize(cur_loop_tensor->data().nbytes()),
245 cur_loop_tensor->data_type(), cur_loop_tensor->data_c(),
246 cur_loop_tensor->device_info().host_format_)) {
247 MS_LOG(EXCEPTION) << "SyncHostToDevice failed for cur_loop_tensor needed for async dump.";
248 }
249 }
250
251 // update next loop tensor to 0 per iteration
252 auto next_loop_tensor = (*inputs_params)[kLoopSinkNextLoopIndex];
253 MS_EXCEPTION_IF_NULL(next_loop_tensor);
254 auto *next_val = static_cast<int32_t *>(next_loop_tensor->data_c());
255 MS_EXCEPTION_IF_NULL(next_val);
256 *next_val = 0;
257 next_loop_tensor->set_sync_status(kNeedSyncHostToDevice);
258 // set loop_count to zero
259 if (inputs != nullptr) {
260 inputs->push_back(next_loop_tensor);
261 } else {
262 auto device_address = next_loop_tensor->device_address();
263 if (!device_address->SyncHostToDevice(next_loop_tensor->shape(), LongToSize(next_loop_tensor->data().nbytes()),
264 next_loop_tensor->data_type(), next_loop_tensor->data_c(),
265 next_loop_tensor->device_info().host_format_)) {
266 MS_LOG(EXCEPTION) << "SyncHostToDevice failed for next_loop_tensor needed for async dump.";
267 }
268 }
269
270 auto epoch_tensor = (*inputs_params)[kLoopSinkEpochIndex];
271 MS_EXCEPTION_IF_NULL(epoch_tensor);
272 auto *epoch_val = static_cast<int32_t *>(epoch_tensor->data_c());
273 MS_EXCEPTION_IF_NULL(epoch_val);
274 *epoch_val = SizeToInt(graph->current_epoch());
275 epoch_tensor->set_sync_status(kNeedSyncHostToDevice);
276 if (inputs != nullptr) {
277 inputs->push_back(epoch_tensor);
278 } else {
279 auto device_address = epoch_tensor->device_address();
280 if (!device_address->SyncHostToDevice(epoch_tensor->shape(), LongToSize(epoch_tensor->data().nbytes()),
281 epoch_tensor->data_type(), epoch_tensor->data_c(),
282 epoch_tensor->device_info().host_format_)) {
283 MS_LOG(EXCEPTION) << "SyncHostToDevice failed for epoch_tensor needed for async dump.";
284 }
285 }
286 MS_LOG(DEBUG) << "Load epoch_val:" << *epoch_val;
287 graph->set_current_epoch(graph->current_epoch() + 1);
288 return inputs_params->size();
289 }
290
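// Refresh the loop-control tensors. In sink mode (or when async dump is disabled) they are appended to
// inputs; otherwise they are synced to device immediately.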
291 void UpdateCtrlInputTensor(const std::shared_ptr<KernelGraph> &graph, std::vector<tensor::TensorPtr> *inputs,
292 size_t *input_ctrl_size) {
293 if (graph->input_ctrl_tensors()) {
294 auto &dump_json_parser = DumpJsonParser::GetInstance();
295 bool sink_mode = (ConfigManager::GetInstance().dataset_mode() == DS_SINK_MODE || graph->IsDatasetGraph());
296 if (sink_mode || !dump_json_parser.async_dump_enabled()) {
297 *input_ctrl_size = LoadCtrlInputTensor(graph, inputs);
298 } else {
299 LoadCtrlInputTensor(graph, nullptr);
300 }
301 }
302 }
303
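// A device-to-device copy is possible only when source and destination addresses share the same
// device type, format and type id.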
304 bool NeedMemcpyInDevice(const device::DeviceAddressPtr &src_device_addr,
305 const device::DeviceAddressPtr &dst_device_addr) {
306 MS_EXCEPTION_IF_NULL(dst_device_addr);
307 if (src_device_addr.get() == nullptr) {
308 return false;
309 }
310 if (src_device_addr->DeviceType() == dst_device_addr->DeviceType() &&
311 src_device_addr->format() == dst_device_addr->format() &&
312 src_device_addr->type_id() == dst_device_addr->type_id()) {
313 return true;
314 }
315 return false;
316 }
317
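// Decide whether a host tensor still needs a host-to-device sync for the given parameter; when the layouts
// already match, a device-to-device copy is performed instead and no host sync is required.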
318 bool TensorNeedSync(const std::shared_ptr<KernelGraph> &kernel_graph, const AnfNodePtr &parameter,
319 const tensor::TensorPtr &tensor, uint32_t *memcpy_nums) {
320 MS_EXCEPTION_IF_NULL(tensor);
321 if (tensor->NeedSyncHostToDevice()) {
322 return true;
323 }
324 auto ms_context = MsContext::GetInstance();
325 MS_EXCEPTION_IF_NULL(ms_context);
326 auto device_address = AnfAlgo::GetMutableOutputAddr(parameter, 0);
327 if (ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER)) {
328 return tensor->device_address().get() == nullptr || tensor->device_address() != device_address;
329 }
330 auto tensor_address = std::dynamic_pointer_cast<device::DeviceAddress>(tensor->device_address());
331 if (tensor_address != device_address) {
332 if (!kernel_graph->is_dynamic_shape() && IsVMGraphTaskSink() &&
333 NeedMemcpyInDevice(tensor_address, device_address)) {
334 auto status = device_address->SyncDeviceToDevice(trans::GetRuntimePaddingShape(parameter, 0),
335 tensor_address->GetSize(), tensor_address->type_id(),
336 tensor_address->GetPtr(), tensor_address->format());
337 if (status == false) {
338 MS_LOG(EXCEPTION) << "SyncDeviceToDevice failed.";
339 }
340 MS_EXCEPTION_IF_NULL(memcpy_nums);
341 (*memcpy_nums)++;
342 #if ((defined ENABLE_CPU) && (!defined _WIN32))
343 const std::string &param_name = parameter->fullname_with_scope();
344 if (ps::ps_cache_instance.IsHashTable(param_name)) {
345 return false;
346 }
347 #endif
348 auto input_param = parameter->cast<ParameterPtr>();
349 MS_EXCEPTION_IF_NULL(input_param);
350 if (AnfAlgo::IsParameterWeight(input_param) || kernel_graph->IsUpdatedParameter(input_param)) {
351 tensor->set_device_address(device_address);
352 }
353 if (kernel_graph->IsUpdatedParameter(input_param)) {
354 tensor->SetIsUpdateByDevice();
355 }
356 return false;
357 } else {
358 tensor->data_sync(false);
359 return true;
360 }
361 }
362 return false;
363 }
364
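// Recursively register the graph and its child graphs with the func graph manager, using memo to avoid
// visiting a graph twice.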
365 void AddGraphToManager(const NotNull<KernelGraphPtr> graph, NotNull<FuncGraphManagerPtr> manager,
366 NotNull<std::set<KernelGraphPtr> *> memo) {
367 if (memo->find(graph) != memo->end()) {
368 return;
369 }
370 memo->insert(graph.get());
371 manager->AddFuncGraph(graph.get(), false);
372
373 for (auto &child_graph : graph->child_graph_order()) {
374 AddGraphToManager(NOT_NULL(child_graph.lock()), manager, memo);
375 }
376 }
377 } // namespace
378
379 void AscendSession::Init(uint32_t device_id) { InitExecutor(kAscendDevice, device_id); }
380
381 void AscendSession::UnifyMindIR(const KernelGraphPtr &graph) {
382 SessionBasic::UnifyMindIR(graph);
383 auto context_ptr = MsContext::GetInstance();
384 MS_EXCEPTION_IF_NULL(context_ptr);
385 #ifdef ENABLE_DUMP_IR
386 bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
387 if (save_graphs) {
388 std::string file_name = "hwopt_d_before_unify_mindir_graph_" + std::to_string(graph->graph_id()) + ".ir";
389 DumpIR(file_name, graph);
390 DumpIRProto(graph, "before_unify_mindir_hwopt_" + std::to_string(graph->graph_id()));
391 }
392 #endif
393 auto optimizer = std::make_shared<opt::GraphOptimizer>();
394 auto unify_mindir_pm = std::make_shared<opt::PassManager>("unify_mindir_pm");
395 unify_mindir_pm->AddPass(std::make_shared<opt::SpaceToBatchNDAttrUpdate>());
396 unify_mindir_pm->AddPass(std::make_shared<opt::BatchToSpaceNDAttrUpdate>());
397 unify_mindir_pm->AddPass(std::make_shared<opt::MaxPool2MaxPoolWithArgmax>());
398 unify_mindir_pm->AddPass(std::make_shared<opt::MaxPoolWithArgmaxUnifyMindIR>());
399 unify_mindir_pm->AddPass(std::make_shared<opt::MaxPoolGradWithArgmaxUnifyMindIR>());
400 unify_mindir_pm->AddPass(std::make_shared<opt::Conv2DUnifyMindIR>());
401 unify_mindir_pm->AddPass(std::make_shared<opt::Conv2DBackpropInputUnifyMindIR>());
402 unify_mindir_pm->AddPass(std::make_shared<opt::Conv2DBackpropFilterUnifyMindIR>());
403 unify_mindir_pm->AddPass(std::make_shared<opt::SliceGradUnifyMindIR>());
404 unify_mindir_pm->AddPass(std::make_shared<opt::AvgPoolGradUnifyMindIR>());
405 unify_mindir_pm->AddPass(std::make_shared<opt::FtrlUnifyOutput>());
406 unify_mindir_pm->AddPass(std::make_shared<opt::MomentumUnifyOutput>());
407 unify_mindir_pm->AddPass(std::make_shared<opt::RMSPropUnifyOutput>());
408 unify_mindir_pm->AddPass(std::make_shared<opt::CenteredRMSPropUnifyOutput>());
409 unify_mindir_pm->AddPass(std::make_shared<opt::FakeLearnedScaleQuantPerLayerGradUnifyMindIR>());
410 unify_mindir_pm->AddPass(std::make_shared<opt::FakeLearnedScaleQuantPerChannelGradUnifyMindIR>());
411 auto ms_context = MsContext::GetInstance();
412 MS_EXCEPTION_IF_NULL(ms_context);
413 if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kGraphMode) {
414 unify_mindir_pm->AddPass(std::make_shared<opt::DropoutAndDropoutGradUnifyMindIR>());
415 unify_mindir_pm->AddPass(std::make_shared<opt::DropoutUnifyMindIR0>());
416 unify_mindir_pm->AddPass(std::make_shared<opt::GradSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
417 unify_mindir_pm->AddPass(std::make_shared<opt::GradSparseSoftmaxCrossEntropyWithLogitsUnifyMindIRV2>());
418 unify_mindir_pm->AddPass(std::make_shared<opt::SparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
419 } else {
420 // Add PynativeGradSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR pass first to avoid the backward loss function
421 // from the python frontend matching the pattern defined in PynativeSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR.
422 unify_mindir_pm->AddPass(std::make_shared<opt::PynativeGradSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
423 unify_mindir_pm->AddPass(std::make_shared<opt::PynativeSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
424 }
425 unify_mindir_pm->AddPass(std::make_shared<opt::DropoutUnifyMindIR1>());
426 unify_mindir_pm->AddPass(std::make_shared<opt::DropoutGradUnifyMindIR>());
427 unify_mindir_pm->AddPass(std::make_shared<opt::BatchNormGradUnifyMindIR>());
428 unify_mindir_pm->AddPass(std::make_shared<opt::NeighborExchangeUnifyMindIR>());
429 unify_mindir_pm->AddPass(std::make_shared<opt::AllToAllUnifyMindIR>());
430
431 optimizer->AddPassManager(unify_mindir_pm);
432 (void)optimizer->Optimize(graph);
433 graph->SetExecOrderByDefault();
434 #ifdef ENABLE_DUMP_IR
435 if (save_graphs) {
436 std::string file_name = "hwopt_d_after_unify_mindir_graph_" + std::to_string(graph->graph_id()) + ".ir";
437 DumpIR(file_name, graph);
438 }
439 #endif
440 }
441
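// Copy host input tensors to device memory for the graph, handling loop-control tensors, dynamic shapes
// and parameters that only need a device-to-device copy.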
442 void AscendSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
443 const std::vector<tensor::TensorPtr> &inputs_const) const {
444 std::vector<tensor::TensorPtr> inputs(inputs_const);
445 size_t input_ctrl_size = kLoopSinkTensorNum;
446 uint32_t device_memcpy_nums = 0;
447 MS_EXCEPTION_IF_NULL(kernel_graph);
448 UpdateCtrlInputTensor(kernel_graph, &inputs, &input_ctrl_size);
449 auto &input_nodes = kernel_graph->input_nodes();
450 if ((inputs.size() + input_ctrl_size) - kLoopSinkTensorNum != input_nodes.size()) {
451 MS_LOG(EXCEPTION) << "Tensor input:" << inputs.size() << " is not equal graph inputs:" << input_nodes.size()
452 << ", input_ctrl_size:" << input_ctrl_size;
453 }
454 auto ms_context = MsContext::GetInstance();
455 MS_EXCEPTION_IF_NULL(ms_context);
456 auto enable_mem_scheduler = ms_context->get_param<bool>(MS_CTX_ENABLE_MEM_SCHEDULER);
457 if (enable_mem_scheduler) {
458 kernel_graph->SetInputTensors(inputs);
459 return;
460 }
461 for (auto item : tensor_device_addr_map_) {
462 auto output_tensor = item.first;
463 output_tensor->set_device_address(item.second);
464 }
465 SyncStream();
466 for (size_t i = 0; i < inputs.size(); ++i) {
467 auto tensor = inputs[i];
468 MS_EXCEPTION_IF_NULL(tensor);
469 auto input_node = input_nodes[i];
470 MS_EXCEPTION_IF_NULL(input_node);
471 auto size = LongToSize(tensor->data().nbytes());
472 if (!input_node->isa<Parameter>()) {
473 continue;
474 }
475 auto input_param = input_node->cast<ParameterPtr>();
476 MS_EXCEPTION_IF_NULL(input_param);
477 if (!input_param->IsUsedByRealKernelInGraph(kernel_graph->graph_id())) {
478 tensor->set_sync_status(kNoNeedSync);
479 continue;
480 } else if (input_param->has_dynamic_shape()) {
481 auto tensor_shape = tensor->shape();
482 std::vector<size_t> shape_tmp;
483 (void)std::transform(tensor_shape.begin(), tensor_shape.end(), std::back_inserter(shape_tmp), IntToSize);
484 AnfAlgo::SetOutputInferTypeAndShape({AnfAlgo::GetOutputInferDataType(input_node, 0)}, {shape_tmp},
485 input_node.get());
486 size = abstract::ShapeSize(shape_tmp) * abstract::TypeIdSize(tensor->data_type());
487 }
488 if (AnfAlgo::OutputAddrExist(input_node, 0) &&
489 TensorNeedSync(kernel_graph, input_node, tensor, &device_memcpy_nums)) {
490 #if ((defined ENABLE_CPU) && (!defined _WIN32))
491 const std::string &param_name = input_node->fullname_with_scope();
492 if (ps::ps_cache_instance.IsHashTable(param_name)) {
493 continue;
494 }
495 #endif
496 auto device_address = AnfAlgo::GetMutableOutputAddr(input_node, 0);
497 MS_EXCEPTION_IF_NULL(device_address);
498 if (size != 0 &&
499 !device_address->SyncHostToDevice(trans::GetRuntimePaddingShape(input_node, 0), size, tensor->data_type(),
500 tensor->data_c(), tensor->device_info().host_format_)) {
501 MS_LOG(EXCEPTION) << "SyncHostToDevice failed.";
502 }
503 if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode ||
504 AnfAlgo::IsParameterWeight(input_param) || kernel_graph->IsUpdatedParameter(input_param)) {
505 tensor->set_device_address(device_address);
506 }
507 if (kernel_graph->IsUpdatedParameter(input_param)) {
508 tensor->SetIsUpdateByDevice();
509 }
510 }
511 tensor->set_sync_status(kNoNeedSync);
512 }
513 if (device_memcpy_nums > 0) {
514 auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
515 MS_EXCEPTION_IF_NULL(runtime_instance);
516 auto compute_stream = runtime_instance->compute_stream();
517 auto model_stream = runtime_instance->GetModelStream(kernel_graph->graph_id());
518 auto memcpy_event = runtime_instance->CreateDeviceEvent();
519 memcpy_event->set_wait_stream(model_stream);
520 memcpy_event->set_record_stream(compute_stream);
521 memcpy_event->RecordEvent();
522 memcpy_event->WaitEvent();
523 }
524 }
525
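// Compile a graph built from a node list: construct the kernel graph, initialize all-reduce buckets and
// return the graph id.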
526 GraphId AscendSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) {
527 MS_LOG(INFO) << "Start";
528 // construct the graph; on success, graph_sum_ is increased by 1
529 auto graph = ConstructKernelGraph(lst, outputs);
530 auto graph_id = graph->graph_id();
531 InitAllBucket(graph);
532 MS_LOG(INFO) << "Compile graph " << graph_id << " success";
533 return graph_id;
534 }
535
536 GraphId AscendSession::CompileGraphImpl(NotNull<FuncGraphPtr> func_graph) {
537 MS_LOG(INFO) << "Start";
538 std::vector<KernelGraphPtr> all_graphs;
539 auto root_graph = ConstructKernelGraph(func_graph, &all_graphs);
540 for (const auto &graph : all_graphs) {
541 graph->set_root_graph_id(root_graph->graph_id());
542 }
543 UnifyMindIR(root_graph);
544 // Update Graph Dynamic Shape Attr
545 UpdateAllGraphDynamicShapeAttr(all_graphs);
546 opt::BackendCommonOptimization(root_graph);
547 // an empty graph does not enter the backend
548 if (root_graph->execution_order().empty()) {
549 MS_LOG(INFO) << root_graph->ToString() << " is empty graph.";
550 AnfAlgo::InsertMakeTupleForOutput(NOT_NULL(root_graph));
551 root_graph->set_executable(false);
552 InitRuntimeResource();
553 return root_graph->graph_id();
554 }
555
556 // Handle control flow by auto-monad.
557 HandleControlFlow(NOT_NULL(root_graph));
558
559 std::set<KernelGraphPtr> memo;
560 // add all graphs to the manager first, so that the following passes don't have to create a new manager.
561 auto manager = Manage(root_graph, true);
562 AddGraphToManager(NOT_NULL(root_graph), NOT_NULL(manager), NOT_NULL(&memo));
563 memo.clear();
564
565 // resource initialize
566 InitRuntimeResource();
567
568 IrFusionPass(NOT_NULL(root_graph), NOT_NULL(&memo));
569 memo.clear();
570 SelectKernel(NOT_NULL(root_graph));
571 memo.clear();
572
573 HardwareOptimize(NOT_NULL(root_graph), NOT_NULL(&memo));
574 memo.clear();
575 #ifdef ENABLE_DEBUGGER
576 // load graphs to debugger.
577 if (debugger_ && debugger_->DebuggerBackendEnabled()) {
578 LoadGraphsToDbg(NOT_NULL(root_graph), NOT_NULL(&memo));
579 }
580 #endif
581 memo.clear();
582 UpdateRefOutputMap(NOT_NULL(root_graph), NOT_NULL(&memo));
583 memo.clear();
584 // add make_tuple to the output graph
585 AnfAlgo::InsertMakeTupleForOutput(NOT_NULL(root_graph));
586 // validate the root graph, including generating the execution order and so on
587 RootGraphExecutorValidate(NOT_NULL(root_graph), all_graphs);
588 #ifdef ENABLE_DUMP_IR
589 // dump graph before remove nop nodes
590 auto context_ptr = MsContext::GetInstance();
591 MS_EXCEPTION_IF_NULL(context_ptr);
592 bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
593 if (save_graphs) {
594 DumpIRProto(root_graph, "before_removeNop_" + std::to_string(graph_sum_));
595 }
596 #endif
597
598 // adjust kernel
599 AdjustKernel(root_graph);
600 #if ENABLE_CPU && ENABLE_D
601 InitPsWorker(root_graph);
602 #endif
603 // assign stream
604 AssignStream(NOT_NULL(root_graph));
605 #ifndef ENABLE_SECURITY
606 // insert profiling point
607 device::KernelAdjust::GetInstance().Profiling(NOT_NULL(root_graph.get()));
608 #endif
609 device::KernelAdjust::GetInstance().InsertOverflowCheckOperations(NOT_NULL(root_graph));
610 // build kernel
611 BuildKernel(root_graph);
612 #ifndef ENABLE_SECURITY
613 SetSummaryNodes(root_graph.get());
614 #endif
615 // Alloc memory for child graph's inputs
616 AssignStaticMemory(NOT_NULL(root_graph), NOT_NULL(&memo));
617 memo.clear();
618 // Alloc memory for root graph's inputs and node's outputs, workspace
619 MemoryAlloc(root_graph.get());
620 // generate and load task into device
621 Load(root_graph);
622 root_graph->SetInputNodes();
623 root_graph->SetOptimizerFlag();
624 DumpAllGraphs(all_graphs);
625 // Save memory profiling data to proto file
626 #ifndef ENABLE_SECURITY
627 auto profiling_instance = MemoryProfiling::GetInstance();
628 if (profiling_instance.IsMemoryProfilingEnable()) {
629 auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
630 MS_EXCEPTION_IF_NULL(runtime_instance);
631 uint64_t mem_size = runtime_instance->GetAvailableMemMaxSize();
632 profiling_instance.SetDeviceMemSize(mem_size);
633 profiling_instance.SaveMemoryProfiling();
634 }
635 #endif
636 // return the root_graph id to backend
637 auto graph_id = root_graph->graph_id();
638 return graph_id;
639 }
640
641 #ifndef ENABLE_SECURITY
642 void AscendSession::SetFinalGraphSummaryFlag(const std::shared_ptr<KernelGraph> &kernel_graph) {
643 MS_EXCEPTION_IF_NULL(kernel_graph);
644 auto graph_order = GetGraphOrder(kernel_graph->graph_id());
645 for (auto graph_id : graph_order) {
646 auto child_graph = GetGraph(graph_id);
647 if (child_graph == nullptr) {
648 continue;
649 }
650 if (child_graph->summary_node_exist()) {
651 kernel_graph->set_summary_node_exist(true);
652 return;
653 }
654 }
655 kernel_graph->set_summary_node_exist(false);
656 }
657 #endif
658
659 void AscendSession::BuildGraphImpl(GraphId graph_id) {
660 MS_LOG(INFO) << "Start";
661 auto graph = GetGraph(graph_id);
662 MS_EXCEPTION_IF_NULL(graph);
663 // resource initialize
664 InitRuntimeResource();
665 // multiple graph handle
666 if (graph_id == final_graph_id_) {
667 MS_LOG(EXCEPTION) << "Unexpected graph id:" << graph_id << ", final_graph_id_:" << final_graph_id_;
668 }
669 auto single_graph = GetGraph(graph_id);
670 MS_EXCEPTION_IF_NULL(single_graph);
671 CompileChildGraph(single_graph);
672 // set the distinction label of single graph
673 single_graph->set_stream_distinction_label(graph_id);
674 single_graph->UpdateExecuteKernelStreamLabel();
675 // adjust execution order because of merged child graphs and other special operations
676 AdjustKernel(graph);
677 #if ENABLE_CPU && ENABLE_D
678 InitPsWorker(graph);
679 #endif
680 // Assign streams for control sink and hccl and so on
681 AssignStream(NOT_NULL(graph));
682 #ifndef ENABLE_SECURITY
683 device::KernelAdjust::GetInstance().Profiling(NOT_NULL(graph.get()));
684 #endif
685 device::KernelAdjust::GetInstance().InsertOverflowCheckOperations(NOT_NULL(graph));
686 // build kernel if node is cnode
687 BuildKernel(graph);
688 auto ms_context = MsContext::GetInstance();
689 MS_EXCEPTION_IF_NULL(ms_context);
690 #ifdef ENABLE_DEBUGGER
691 if (debugger_ && debugger_->partial_memory()) {
692 debugger_->PreExecute(graph);
693 }
694 #endif
695 if (ms_context->get_param<bool>(MS_CTX_PRECOMPILE_ONLY)) {
696 MS_LOG(INFO) << "Precompile only, stop in build kernel step";
697 } else {
698 // alloc memory, including static memory and dynamic memory
699 MemoryAlloc(graph.get());
700 auto enable_mem_scheduler = ms_context->get_param<bool>(MS_CTX_ENABLE_MEM_SCHEDULER);
701 if (!enable_mem_scheduler) {
702 AnfAlgo::CacheAddrForGraph(graph);
703 }
704 // generate and load task info to device if it is sink mode
705 Load(graph);
706 }
707 // sync the initial const tensor to device
708 SyncInitialTenosrToDevice();
709 DumpAllGraphs({graph});
710 MS_LOG(INFO) << "End";
711 }
712
713 void AscendSession::CompileChildGraph(const KernelGraphPtr &child_graph) {
714 MS_EXCEPTION_IF_NULL(child_graph);
715 MS_LOG(INFO) << "CompileChildGraph " << child_graph->ToString();
716 opt::AscendBackendIRFusionOptimization(child_graph);
717 child_graph->SetExecOrderByDefault();
718 #ifdef ENABLE_DUMP_IR
719 auto context_ptr = MsContext::GetInstance();
720 MS_EXCEPTION_IF_NULL(context_ptr);
721 bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
722 if (save_graphs) {
723 std::string file_name = "select_kernel_before_graph_" + std::to_string(child_graph->graph_id()) + ".ir";
724 DumpIR(file_name, child_graph);
725 }
726 #endif
727 // select kernel build info
728 SelectKernel(*child_graph);
729 #ifdef ENABLE_DUMP_IR
730 if (save_graphs) {
731 std::string file_name = "select_kernel_after_graph_" + std::to_string(child_graph->graph_id()) + ".ir";
732 DumpIR(file_name, child_graph);
733 }
734 #endif
735 // optimize graph
736 HardwareOptimize(child_graph);
737 // assign static memory of parameters
738 auto enable_mem_scheduler = context_ptr->get_param<bool>(MS_CTX_ENABLE_MEM_SCHEDULER);
739 if (!enable_mem_scheduler) {
740 auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
741 MS_EXCEPTION_IF_NULL(runtime_instance);
742 runtime_instance->AssignStaticMemoryInput(*child_graph);
743 runtime_instance->AssignStaticMemoryValueNode(*child_graph);
744 }
745 }
746
747 bool AscendSession::IsSupportSummary() { return !device::KernelAdjust::NeedInsertSwitch(); }
748
749 void AscendSession::PreExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph,
750 const std::vector<tensor::TensorPtr> &inputs, VectorRef *const) {
751 #ifdef ENABLE_DEBUGGER
752 if (debugger_) {
753 debugger_->PreExecute(kernel_graph);
754 }
755 #endif
756 #if ENABLE_CPU && ENABLE_D
757 // Initialize parameter server
758 InitPSParamAndOptim(kernel_graph, inputs);
759 std::string channel_name;
760 if (ps::PsDataPrefetch::GetInstance().cache_enable() && IsGetNextGraph(kernel_graph, &channel_name)) {
761 ps::ps_cache_instance.IncreaseGraphStep(channel_name);
762 }
763 #endif
764 }
765
766 void AscendSession::PostExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph,
767 const std::vector<tensor::TensorPtr> &, VectorRef *const) {
768 // summary
769 #ifndef ENABLE_SECURITY
770 Summary(kernel_graph.get());
771 #endif
772 #ifdef ENABLE_DEBUGGER
773 // load tensor from device for debugger
774 if (debugger_ && debugger_->debugger_enabled()) {
775 LoadTensor(kernel_graph);
776 }
777 // debugger post-execution processing
778 if (debugger_) {
779 debugger_->PostExecute();
780 }
781 #endif
782 }
783
784 void AscendSession::ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph) { Execute(kernel_graph, true); }
785
786 void AscendSession::RunOpHardwareOptimize(const std::shared_ptr<session::KernelGraph> &kernel_graph) const {
787 MS_LOG(INFO) << "HardwareOptimize Start";
788 opt::RunOpAscendBackendOptimization(kernel_graph);
789 MS_LOG(INFO) << "HardwareOptimize Finish";
790 }
791
792 KernelGraphPtr AscendSession::BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
793 const std::vector<tensor::TensorPtr> &input_tensors,
794 const std::vector<int64_t> &tensors_mask) {
795 auto it = run_op_graphs_.find(graph_info);
796 if (it != run_op_graphs_.end()) {
797 return it->second;
798 }
799
800 const auto &graph = PreBuildOp(op_run_info, input_tensors, tensors_mask);
801 MS_EXCEPTION_IF_NULL(graph);
802 // init runtime resource
803 InitRuntimeResource();
804 // build kernel
805 RunOpAdjustKernel(graph);
806 BuildKernel(graph);
807 auto enable_op_graph_cache = MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_OP_GRAPH_CACHE);
808 if (enable_op_graph_cache) {
809 run_op_graphs_[graph_info] = graph;
810 }
811 return graph;
812 }
813
814 void AscendSession::BindAddressToTensor(
815 const std::map<tensor::TensorPtr, session::KernelWithIndex> &tensor_to_node) const {
816 auto ms_context = MsContext::GetInstance();
817 MS_EXCEPTION_IF_NULL(ms_context);
818 for (const auto &item : tensor_to_node) {
819 auto &tensor = item.first;
820 auto &node = item.second.first;
821 auto &output_index = item.second.second;
822 DeviceAddressPtr address = nullptr;
823 if (ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER)) {
824 address = AnfAlgo::GetMutableOutputAddr(node, output_index, false);
825 } else {
826 address = AnfAlgo::GetMutableOutputAddr(node, output_index);
827 }
828 MS_EXCEPTION_IF_NULL(tensor);
829 tensor->set_device_address(address);
830 }
831 }
832
833 void AscendSession::LaunchFunc(const KernelGraphPtr &graph,
834 const std::map<tensor::TensorPtr, session::KernelWithIndex> &tensor_to_node,
835 bool is_dynamic_shape, const std::vector<tensor::TensorPtr> &input_tensors) {
836 MS_EXCEPTION_IF_NULL(graph);
837 // Wait for AllReduce
838 for (auto &tensor : input_tensors) {
839 if (tensor->NeedWaitDevice()) {
840 tensor->WaitDevice();
841 }
842 }
843
844 RunOpRemoveNopNode(graph);
845 RunOpMemoryAllocNew(input_tensors, tensor_to_node, *graph);
846 AnfAlgo::CacheAddrForGraph(graph);
847 // Bind Device Ptr to DeviceAddress of Tensor
848 BindAddressToTensor(tensor_to_node);
849 RunOpGenKernelEvent(graph.get());
850
851 if (is_dynamic_shape) {
852 BuildDynamicKernel(graph);
853 }
854
855 LoadInputData(graph, input_tensors);
856 Execute(graph, false);
857 RunOpMemoryClear(graph.get());
858 }
859
860 void AscendSession::BatchBuildKernel(const std::vector<std::shared_ptr<SessionTask>> &build_tasks) {
861 std::vector<CNodePtr> node_to_build;
862 std::vector<KernelGraphPtr> graphs;
863
864 // Hide Nop Node && Collect nodes to build.
865 for (const auto &task : build_tasks) {
866 MS_EXCEPTION_IF_NULL(task);
867 const auto &context = task->context();
868 MS_EXCEPTION_IF_NULL(context);
869 const auto &graph = context->graph();
870 MS_EXCEPTION_IF_NULL(graph);
871
872 RunOpHideNopNode(graph);
873
874 const auto &nodes = graph->execution_order();
875 std::copy(nodes.begin(), nodes.end(), std::back_inserter(node_to_build));
876 graphs.push_back(graph);
877 }
878
879 // Build first time.
880 BuildKernel(node_to_build);
881
882 std::vector<CNodePtr> atomic_node_to_build;
883 for (auto &graph : graphs) {
884 device::ascend::KernelBuildPreprocess(graph.get());
885 const auto &nodes = graph->execution_order();
886 std::copy(nodes.begin(), nodes.end(), std::back_inserter(atomic_node_to_build));
887 }
888 // Build AtomicClean.
889 BuildKernel(atomic_node_to_build);
890 }
891
892 void AscendSession::PrepareForOutputTensor(const KernelGraphPtr &graph,
893 const std::vector<tensor::TensorPtr> &input_tensors,
894 std::map<tensor::TensorPtr, session::KernelWithIndex> *tensor_to_node,
895 VectorRef *outputs) const {
896 // Create DeviceAddress for output tensors (containing shape, format, dtype)
897 auto runtime_instance = device::KernelRuntimeManager::Instance().GetCurrentKernelRuntime();
898 runtime_instance->RunOpMallocPre(*graph, input_tensors);
899 runtime_instance->UpdateRefNodeOutputMem(*graph);
900 // CREATE OUTPUT TENSOR ADDRESS
901 UpdateOutputs(graph, outputs, input_tensors, tensor_to_node);
902 }
903
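// Clone each CNode's primitive so that later attribute modifications do not affect the cached graph.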
904 void StoreCNodePrimitive(const KernelGraphPtr &graph) {
905 const auto &nodes = graph->execution_order();
906 for (auto &node : nodes) {
907 auto primitive = AnfAlgo::GetCNodePrimitive(node);
908 MS_EXCEPTION_IF_NULL(primitive);
909 auto new_primitive = std::make_shared<Primitive>(*primitive);
910 node->set_input(kAnfPrimitiveIndex, NewValueNode(new_primitive));
911 }
912 }
913
914 KernelGraphPtr AscendSession::CreateKernelGraph(const GraphInfo &graph_info, OpRunInfo *op_run_info,
915 std::vector<tensor::TensorPtr> *input_tensors,
916 const std::vector<int64_t> &tensors_mask, bool cache_miss) {
917 auto &task_manager = PynativeTaskManager::GetInstance();
918 KernelGraphPtr graph = nullptr;
919 if (cache_miss) {
920 graph = PreBuildOp(*op_run_info, *input_tensors, tensors_mask);
921 MS_EXCEPTION_IF_NULL(graph);
922 InitRuntimeResource();
923 run_op_graphs_[graph_info] = graph;
924 } else {
925 if (!task_manager.QueueEmpty()) {
926 graph = PreBuildOp(*op_run_info, *input_tensors, tensors_mask);
927 InitRuntimeResource();
928 } else {
929 graph = run_op_graphs_[graph_info];
930 }
931 }
932 return graph;
933 }
934
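// Lazy build is disabled for non-lazy ops, graph mode, dynamic-shape ops, or when pynative synchronization
// is enabled.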
935 bool AscendSession::DisableLazyBuild(const OpRunInfo &op_run_info) {
936 auto ms_context = MsContext::GetInstance();
937 MS_EXCEPTION_IF_NULL(ms_context);
938 return !op_run_info.lazy_build || ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kGraphMode ||
939 op_run_info.is_dynamic_shape || ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_SYNCHRONIZE);
940 }
941
942 void AscendSession::RunOpImpl(const GraphInfo &graph_info, OpRunInfo *op_run_info,
943 std::vector<tensor::TensorPtr> *input_tensors, VectorRef *outputs,
944 const std::vector<int64_t> &tensors_mask) {
945 MS_EXCEPTION_IF_NULL(op_run_info);
946 if (DisableLazyBuild(*op_run_info)) {
947 session::PynativeTaskManager::GetInstance().ExecuteRemainingTasks();
948 RunOpImplOrigin(graph_info, op_run_info, input_tensors, outputs, tensors_mask);
949 return;
950 }
951
952 MS_EXCEPTION_IF_NULL(input_tensors);
953 bool cache_miss = run_op_graphs_.find(graph_info) == run_op_graphs_.end();
954 auto graph = CreateKernelGraph(graph_info, op_run_info, input_tensors, tensors_mask, cache_miss);
955 EraseValueNodeTensor(tensors_mask, input_tensors);
956 MS_EXCEPTION_IF_NULL(graph);
957 std::map<tensor::TensorPtr, session::KernelWithIndex> tensor_to_node;
958 PrepareForOutputTensor(graph, *input_tensors, &tensor_to_node, outputs);
959
960 auto &task_manager = PynativeTaskManager::GetInstance();
961 if (!cache_miss && task_manager.QueueEmpty()) {
962 // Cache match and there are no task in Queue. Just Launch immediately.
963 LaunchFunc(graph, tensor_to_node, op_run_info->is_dynamic_shape, *input_tensors);
964 } else {
965 auto run_op_context = std::make_shared<RunOpContext>(graph_info, op_run_info->is_dynamic_shape, graph, tensors_mask,
966 *input_tensors, tensor_to_node);
967 task_manager.PushLaunchTask(std::make_shared<LaunchTask>(run_op_context));
968
969 if (cache_miss || !task_manager.QueueEmpty()) {
970 // Copy Primitive. The attributes of Primitive will be modified.
971 StoreCNodePrimitive(graph);
972 task_manager.PushBuildTask(std::make_shared<BuildTask>(run_op_context));
973 }
974 }
975
976 if (!task_manager.inited()) {
977 task_manager.Init([this]() { ExecuteAllTaskInQueue(); });
978 }
979
980 if (task_manager.QueueFull()) {
981 task_manager.ExecuteRemainingTasks();
982 }
983 }
984
985 void AscendSession::RunOpImplOrigin(const GraphInfo &graph_info, OpRunInfo *op_run_info,
986 std::vector<tensor::TensorPtr> *input_tensors, VectorRef *outputs,
987 const std::vector<int64_t> &tensors_mask) {
988 MS_EXCEPTION_IF_NULL(input_tensors);
989 MS_EXCEPTION_IF_NULL(op_run_info);
990 const auto &graph = BuildOpImpl(*op_run_info, graph_info, *input_tensors, tensors_mask);
991
992 EraseValueNodeTensor(tensors_mask, input_tensors);
993
994 // wait for allreduce
995 for (auto &tensor : *input_tensors) {
996 if (tensor->NeedWaitDevice()) {
997 tensor->WaitDevice();
998 }
999 }
1000 // malloc mem
1001 RunOpRemoveNopNode(graph);
1002 RunOpMemoryAlloc(*input_tensors, graph.get());
1003 RunOpGenKernelEvent(graph.get());
1004 AnfAlgo::CacheAddrForGraph(graph);
1005 // Build dynamic kernel
1006 if (op_run_info->is_dynamic_shape) {
1007 BuildDynamicKernel(graph);
1008 }
1009 // load input data to device
1010 LoadInputData(graph, *input_tensors);
1011 // run op
1012 Execute(graph, false);
1013 // get output
1014 std::map<tensor::TensorPtr, session::KernelWithIndex> tensor_to_node;
1015 UpdateOutputs(graph, outputs, *input_tensors, &tensor_to_node);
1016 // update output abstract of dynamic op to op_run_info
1017 if (op_run_info->is_dynamic_shape) {
1018 UpdateOutputAbstract(graph, op_run_info);
1019 }
1020 RunOpMemoryClear(graph.get());
1021 }
1022
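// Build a single-op kernel graph: run IR fusion, select kernels, apply backend optimization and cache
// output info.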
1023 KernelGraphPtr AscendSession::PreBuildOp(const OpRunInfo &op_run_info,
1024 const std::vector<tensor::TensorPtr> &input_tensors,
1025 const std::vector<int64_t> &tensors_mask) {
1026 // Construct graph include one op
1027 auto graph = ConstructSingleOpGraph(op_run_info, input_tensors, tensors_mask, true);
1028 MS_EXCEPTION_IF_NULL(graph);
1029 opt::RunOpAscendBackendIRFusionOptimization(graph);
1030 SelectKernel(*graph);
1031 RunOpHardwareOptimize(graph);
1032 CacheCNodeOutputInfo(*graph);
1033 return graph;
1034 }
1035
1036 void AscendSession::CacheCNodeOutputInfo(const KernelGraph &graph) const {
1037 auto &nodes = graph.execution_order();
1038 for (auto const &node : nodes) {
1039 std::vector<std::string> formats;
1040 std::vector<TypeId> types;
1041 std::vector<size_t> tensor_sizes;
1042 auto output_num = AnfAlgo::GetOutputTensorNum(node);
1043 for (size_t i = 0; i < output_num; ++i) {
1044 std::string output_format = AnfAlgo::GetOutputFormat(node, i);
1045 auto output_type = AnfAlgo::GetOutputDeviceDataType(node, i);
1046 auto tensor_size = AnfAlgo::GetOutputTensorMemSize(node, i);
1047 formats.emplace_back(output_format);
1048 types.emplace_back(output_type);
1049 tensor_sizes.emplace_back(tensor_size);
1050 }
1051 MS_EXCEPTION_IF_NULL(node);
1052 node->set_user_data<OpRuntimeInfo>(std::make_shared<OpRuntimeInfo>(formats, types, tensor_sizes));
1053 }
1054
1055 auto &inputs = graph.inputs();
1056 for (const auto &input : inputs) {
1057 MS_EXCEPTION_IF_NULL(input);
1058 if (!input->isa<Parameter>()) {
1059 continue;
1060 }
1061 std::vector<std::string> formats;
1062 std::vector<TypeId> types;
1063 std::vector<size_t> tensor_sizes;
1064 auto output_size = AnfAlgo::GetOutputTensorNum(input);
1065 for (size_t index = 0; index < output_size; index++) {
1066 auto format = AnfAlgo::GetOutputFormat(input, index);
1067 auto type_id = AnfAlgo::GetOutputDeviceDataType(input, index);
1068 if (type_id == kTypeUnknown) {
1069 type_id = AnfAlgo::GetOutputInferDataType(input, index);
1070 }
1071 auto tensor_size = AnfAlgo::GetOutputTensorMemSize(input, index);
1072 formats.emplace_back(format);
1073 types.emplace_back(type_id);
1074 tensor_sizes.emplace_back(tensor_size);
1075 }
1076 input->set_user_data<OpRuntimeInfo>(std::make_shared<OpRuntimeInfo>(formats, types, tensor_sizes));
1077 }
1078 }
1079
1080 void AscendSession::GetOpInputStubTensors(const CNodePtr &cnode, const std::map<AnfNodePtr, size_t> &parameter_index,
1081 const std::vector<tensor::TensorPtr> &graph_inputs,
1082 const std::map<KernelWithIndex, OutputTensorInfo> &node_output_info,
1083 InputTensorInfo *input_tensor_info) {
1084 MS_EXCEPTION_IF_NULL(cnode);
1085 MS_EXCEPTION_IF_NULL(input_tensor_info);
1086 const auto input_tensor_num = AnfAlgo::GetInputTensorNum(cnode);
1087 for (size_t i = 1; i <= input_tensor_num; i += 1) {
1088 const auto &input = cnode->input(i);
1089 auto kernel_with_index = AnfAlgo::VisitKernel(input, 0);
1090 auto real_input = kernel_with_index.first;
1091 MS_EXCEPTION_IF_NULL(real_input);
1092 tensor::TensorPtr tensor = nullptr;
1093 if (real_input->isa<ValueNode>()) {
1094 tensor = GetValueNodeOutputTensor(real_input, kernel_with_index.second);
1095 input_tensor_info->input_tensors_mask.emplace_back(kParameterDataTensorMask);
1096 } else if (real_input->isa<Parameter>()) {
1097 tensor = GetParameterOutputTensor(real_input, parameter_index, graph_inputs);
1098 auto parameter = real_input->cast<ParameterPtr>();
1099 MS_EXCEPTION_IF_NULL(parameter);
1100 input_tensor_info->input_tensors_mask.emplace_back(parameter->has_default() ? kParameterWeightTensorMask
1101 : kParameterDataTensorMask);
1102 } else if (real_input->isa<CNode>()) {
1103 bool output_is_weight = false;
1104 tensor = GetCNodeOutputStubTensor(kernel_with_index, node_output_info, &output_is_weight);
1105 input_tensor_info->input_tensors_mask.emplace_back(output_is_weight ? kParameterWeightTensorMask
1106 : kParameterDataTensorMask);
1107 } else {
1108 MS_LOG(EXCEPTION) << "Invalid input node, node = " << real_input->DebugString();
1109 }
1110 MS_EXCEPTION_IF_NULL(tensor);
1111 MS_LOG(DEBUG) << "Get" << i << "th input tensor of " << cnode->fullname_with_scope() << " from "
1112 << real_input->fullname_with_scope() << "-" << kernel_with_index.second;
1113 input_tensor_info->input_tensors.emplace_back(tensor);
1114 }
1115 }
1116
1117 void AscendSession::BuildOpsInGraph(const GraphId &graph_id, const std::map<AnfNodePtr, size_t> &parameter_index,
1118 const std::vector<tensor::TensorPtr> &graph_inputs,
1119 const std::map<KernelWithIndex, size_t> &cnode_refcount) {
1120 if (built_graph_id_.find(graph_id) != built_graph_id_.end()) {
1121 return;
1122 }
1123 auto graph = GetGraph(graph_id);
1124 MS_EXCEPTION_IF_NULL(graph);
1125 std::map<KernelWithIndex, OutputTensorInfo> op_output_info;
1126 std::vector<CNodePtr> kernels;
1127 std::unordered_map<KernelGraphPtr, GraphInfo> single_op_graphs;
1128 // Collect kernels need to be built in single op graphs
1129 for (const auto &kernel : graph->execution_order()) {
1130 // Generate fake input tensors, tensor masks and input kernel with index
1131 InputTensorInfo input_tensor_info;
1132 GetOpInputStubTensors(kernel, parameter_index, graph_inputs, op_output_info, &input_tensor_info);
1133 // Get OpRunInfo and GraphInfo
1134 OpRunInfo op_run_info;
1135 GetSingleOpRunInfo(kernel, &op_run_info);
1136 if (op_run_info.is_dynamic_shape) {
1137 MS_LOG(INFO) << "BuildOpsInGraph stop, op " << op_run_info.op_name << " is dynamic shape.";
1138 break;
1139 }
1140 const GraphInfo &graph_info = GetSingleOpGraphInfo(kernel, input_tensor_info.input_tensors);
1141 const auto &single_op_graph_iter = run_op_graphs_.find(graph_info);
1142 if (single_op_graph_iter != run_op_graphs_.end()) {
1143 // if graph of same single op exists, the output tensor of current op should be generated
1144 GenOpOutputStubTensor(single_op_graph_iter->second, kernel, cnode_refcount, &op_output_info);
1145 continue;
1146 }
1147 const auto &single_op_graph =
1148 PreBuildOp(op_run_info, input_tensor_info.input_tensors, input_tensor_info.input_tensors_mask);
1149 MS_EXCEPTION_IF_NULL(single_op_graph);
1150 GenOpOutputStubTensor(single_op_graph, kernel, cnode_refcount, &op_output_info);
1151 opt::HideNopNode(single_op_graph.get());
1152 // The graph info could have been changed in PreBuildOp
1153 const GraphInfo &new_graph_info = GetSingleOpGraphInfo(kernel, input_tensor_info.input_tensors);
1154 single_op_graphs.emplace(single_op_graph, new_graph_info);
1155 const auto &execution_order = single_op_graph->execution_order();
1156 std::copy(execution_order.begin(), execution_order.end(), std::back_inserter(kernels));
1157 }
1158 InitRuntimeResource();
1159 // Compile all kernels parallel
1160 BuildKernel(kernels);
1161 // Some new kernel may be added after KernelBuildPreprocess, so collect and build kernels again
1162 kernels.clear();
1163 for (const auto &graph_item : single_op_graphs) {
1164 device::ascend::KernelBuildPreprocess(graph_item.first.get());
1165 const auto &execution_order = graph_item.first->execution_order();
1166 std::copy(execution_order.begin(), execution_order.end(), std::back_inserter(kernels));
1167 }
1168 BuildKernel(kernels);
1169 // Record single op graphs in run_op_graphs_ so that these graphs can be reused in BuildOpImpl
1170 for (const auto &graph_item : single_op_graphs) {
1171 RunOpMemoryClear(graph_item.first.get());
1172 auto enable_op_graph_cache = MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_OP_GRAPH_CACHE);
1173 if (enable_op_graph_cache) {
1174 run_op_graphs_[graph_item.second] = graph_item.first;
1175 }
1176 MS_LOG(DEBUG) << "Pre build op finished, graph info: " << graph_item.second;
1177 }
1178 built_graph_id_.insert(graph_id);
1179 }
1180
1181 // compile graph steps
1182 void AscendSession::SelectKernel(const KernelGraph &kernel_graph) const {
1183 MS_LOG(INFO) << "Start!";
1184 size_t raise_precision_count = 0;
1185 size_t reduce_precision_count = 0;
1186 for (const auto &cnode : kernel_graph.execution_order()) {
1187 auto status = device::ascend::SelectKernelInfo(cnode);
1188 AnfAlgo::EraseNodeAttr(kAttrPynativeNextOpName, cnode);
1189 AnfAlgo::EraseNodeAttr(kAttrPynativeNextIndex, cnode);
1190 if (status == device::ascend::kStatusRaisePrecision) {
1191 raise_precision_count++;
1192 } else if (status == device::ascend::kStatusReducePrecision) {
1193 reduce_precision_count++;
1194 }
1195 MS_LOG(INFO) << "Select ApplyKernel: " << cnode->DebugString();
1196 }
1197 auto ms_context = MsContext::GetInstance();
1198 MS_EXCEPTION_IF_NULL(ms_context);
1199 if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kGraphMode) {
1200 if (raise_precision_count > 0) {
1201 MS_LOG(WARNING) << "There has " << raise_precision_count
1202 << " node/nodes used raise precision to selected the kernel!";
1203 }
1204 if (reduce_precision_count > 0) {
1205 MS_LOG(WARNING) << "There has " << reduce_precision_count
1206 << " node/nodes used reduce precision to selected the kernel!";
1207 }
1208 }
1209 MS_LOG(INFO) << "Finish!";
1210 }
1211
1212 #ifndef ENABLE_SECURITY
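// Parse the dump json configuration, copy config files to the dump directory and start the Adx dump
// server when async dump is enabled.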
1213 void DumpInit(uint32_t device_id) {
1214 auto &json_parser = DumpJsonParser::GetInstance();
1215 json_parser.Parse();
1216 json_parser.CopyDumpJsonToDir(device_id);
1217 json_parser.CopyHcclJsonToDir(device_id);
1218 json_parser.CopyMSCfgJsonToDir(device_id);
1219 if (json_parser.async_dump_enabled()) {
1220 if (AdxDataDumpServerInit() != 0) {
1221 MS_LOG(EXCEPTION) << "Adx data dump server init failed";
1222 }
1223 }
1224 }
1225 #endif
1226
1227 void AscendSession::InitRuntimeResource() {
1228 MS_LOG(INFO) << "Start!";
1229 auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
1230 MS_EXCEPTION_IF_NULL(runtime_instance);
1231 if (!runtime_instance->Init()) {
1232 MS_LOG(EXCEPTION) << "Kernel runtime init error.";
1233 }
1234 auto ms_context = MsContext::GetInstance();
1235 MS_EXCEPTION_IF_NULL(ms_context);
1236 auto env_rank_id = common::GetEnv("RANK_ID");
1237 if (ms_context->get_param<bool>(MS_CTX_ENABLE_HCCL) && !env_rank_id.empty()) {
1238 // Get the actual rank id in the distributed training case.
1239 rank_id_ = GetRankId();
1240 }
1241 #ifndef ENABLE_SECURITY
1242 DumpInit(rank_id_);
1243 #endif
1244 MS_LOG(INFO) << "Finish!";
1245 }
1246
1247 void AscendSession::HardwareOptimize(const std::shared_ptr<KernelGraph> &kernel_graph) const {
1248 MS_LOG(INFO) << "HardwareOptimize start!";
1249 opt::AscendBackendOptimization(kernel_graph);
1250 FinalOptimize(kernel_graph);
1251 GraphKernelOptimize(kernel_graph);
1252 MS_EXCEPTION_IF_NULL(kernel_graph);
1253 kernel_graph->SetExecOrderByDefault();
1254 MS_LOG(INFO) << "HardwareOptimize Finish!";
1255 }
1256
1257 void AscendSession::GraphKernelOptimize(const std::shared_ptr<KernelGraph> &kernel_graph) const {
1258 if (!context::GraphKernelFlags::GetInstance().IsEnableGraphKernel()) {
1259 return;
1260 }
1261 opt::GraphKernelOptimize(kernel_graph);
1262 kernel_graph->SetExecOrderByDefault();
1263 }
1264
1265 void AscendSession::AdjustKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const {
1266 MS_LOG(INFO) << "Start!";
1267 opt::HideNopNode(kernel_graph.get());
1268 auto execution_order = kernel_graph->execution_order();
1269 AnfAlgo::ReorderExecList(NOT_NULL(&execution_order));
1270 kernel_graph->set_execution_order(execution_order);
1271 // Insert ClearZero op
1272 // Prepare for the next step: get atomic info from json
1273 BuildKernel(kernel_graph);
1274 device::ascend::KernelBuildPreprocess(kernel_graph.get());
1275 device::KernelAdjust::GetInstance().InsertSwitchLoop(kernel_graph);
1276 #ifdef ENABLE_DUMP_IR
1277 auto context_ptr = MsContext::GetInstance();
1278 MS_EXCEPTION_IF_NULL(context_ptr);
1279 bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
1280 if (save_graphs) {
1281 DumpIR("after_adjust_kernel.ir", kernel_graph);
1282 }
1283 #endif
1284 MS_LOG(INFO) << "Finish!";
1285 }
1286
1287 void AscendSession::RunOpAdjustKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const {
1288 MS_LOG(INFO) << "Start!";
1289 RunOpHideNopNode(kernel_graph);
1290 // Insert ClearZero op
1291 // Prepare for the next step: get atomic info from json
1292 BuildKernel(kernel_graph);
1293 device::ascend::KernelBuildPreprocess(kernel_graph.get());
1294 MS_LOG(INFO) << "Finish!";
1295 }
1296
1297 void AscendSession::AssignStream(NotNull<KernelGraphPtr> kernel_graph) const {
1298 MS_LOG(INFO) << "Start!";
1299 device::ascend::AscendStreamAssign::GetInstance().AssignStream(kernel_graph);
1300 MS_LOG(INFO) << "Finish!";
1301 }
1302
1303 void AscendSession::BuildKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const {
1304 BuildKernel(kernel_graph->execution_order());
1305 }
1306
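// Builds every kernel in the given execution list via device::ascend::KernelBuild and
// logs the wall-clock build time (measured with gettimeofday) in microseconds.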
1307 void AscendSession::BuildKernel(const std::vector<CNodePtr> &kernels) {
1308 MS_LOG(INFO) << "Start!";
1309 struct timeval start_time, end_time;
1310 (void)gettimeofday(&start_time, nullptr);
1311 auto ret = device::ascend::KernelBuild(kernels);
1312 if (!ret) {
1313 MS_LOG(EXCEPTION) << "Kernel build error.";
1314 }
1315 (void)gettimeofday(&end_time, nullptr);
1316 const uint64_t kUSecondInSecond = 1000000;
1317 uint64_t cost = kUSecondInSecond * static_cast<uint64_t>(end_time.tv_sec - start_time.tv_sec);
1318 cost += static_cast<uint64_t>(end_time.tv_usec - start_time.tv_usec);
1319 MS_LOG(INFO) << "KernelBuild run in " << cost << " us.";
1320 MS_LOG(INFO) << "Finish!";
1321 }
1322
1323 void AscendSession::BuildDynamicKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const {
1324 MS_LOG(DEBUG) << "Start!";
1325 MS_EXCEPTION_IF_NULL(kernel_graph);
1326 const auto &kernels = kernel_graph->execution_order();
1327 auto iter = std::find_if(kernels.begin(), kernels.end(), [](const CNodePtr &kernel) {
1328 return AnfAlgo::GetBooleanAttr(kernel, kAttrOutputIsDynamicShape);
1329 });
1330 if (iter == kernels.end()) {
1331 return;
1332 }
1333 auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
1334 MS_EXCEPTION_IF_NULL(runtime_instance);
1335 if (!runtime_instance->GenDynamicKernel(*kernel_graph)) {
1336 MS_LOG(DEBUG) << "Graph:" << kernel_graph->graph_id() << " failed to generate dynamic kernel!";
1337 }
1338 MS_LOG(DEBUG) << "Finish!";
1339 }
1340
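// Returns the LabelSet node that immediately follows kernel_nodes[index]; the caller is
// expected to pass the index of a LabelGoto/LabelSwitch, and a missing or non-LabelSet
// successor raises an exception.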
1341 static CNodePtr GetNextLabelSet(const std::vector<CNodePtr> &kernel_nodes, uint32_t index) {
1342 size_t node_sizes = kernel_nodes.size();
1343 if (index >= node_sizes - 1) {
1344 MS_LOG(EXCEPTION) << "There is no node after this node: " << kernel_nodes[index]->DebugString();
1345 }
1346 auto kernel = kernel_nodes[index + 1];
1347 if (AnfAlgo::GetCNodeName(kernel) != kLabelSetOpName) {
1348 MS_LOG(EXCEPTION) << "The node following labelgoto/labelswitch is not a labelset, node: "
1349 << kernel_nodes[index]->DebugString();
1350 }
1351 return kernel;
1352 }
1353
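// Walks the execution order starting at *index and splits a recursive region into a
// "front" part (returned) and a "back" part (appended to *back): nodes after a
// kAttrRecursive marker go to the back part, a nested recursive call that does not
// target back_label is unfolded by a recursive invocation, and the walk stops at the
// node carrying kAttrRecursiveEnd.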
1354 static std::vector<CNodePtr> HandleRecursiveCall(const std::vector<CNodePtr> &kernel_cnodes, const uint32_t &back_label,
1355 uint32_t *index, std::vector<CNodePtr> *back) {
1356 MS_EXCEPTION_IF_NULL(index);
1357 MS_EXCEPTION_IF_NULL(back);
1358 std::vector<CNodePtr> front;
1359 std::vector<CNodePtr> back_temp;
1360 bool back_flag = false;
1361 uint32_t i = *index;
1362 while (i < kernel_cnodes.size()) {
1363 if (!back_flag) {
1364 front.emplace_back(kernel_cnodes[i]);
1365 } else {
1366 back->emplace_back(kernel_cnodes[i]);
1367 }
1368 if (AnfAlgo::HasNodeAttr(kAttrRecursiveEnd, kernel_cnodes[i])) {
1369 *index = i;
1370 back->insert(back->end(), back_temp.begin(), back_temp.end());
1371 return front;
1372 }
1373 if (AnfAlgo::HasNodeAttr(kAttrRecursive, kernel_cnodes[i])) {
1374 back_flag = true;
1375 if (!AnfAlgo::IsLabelIndexInNode(kernel_cnodes[i], back_label)) {
1376 auto temp = HandleRecursiveCall(kernel_cnodes, back_label, &(++i), &back_temp);
1377 front.insert(front.end(), temp.begin(), temp.end());
1378 }
1379 }
1380 i++;
1381 }
1382 return front;
1383 }
1384
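// Rewrites the graph's mem-reuse execution order so that every recursive region
// (opened by kAttrRecursiveStart) is laid out as its front nodes followed by its back
// nodes, giving the memory-reuse pass a linear order to work on.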
1385 static void UnfoldRecursiveExecOrder(KernelGraph *kernel_graph) {
1386 MS_EXCEPTION_IF_NULL(kernel_graph);
1387 if (!kernel_graph->recursive_call()) {
1388 return;
1389 }
1390 auto kernel_cnodes = kernel_graph->mem_reuse_exec_order();
1391 std::vector<CNodePtr> mem_reuse_order;
1392 mem_reuse_order.reserve(kernel_cnodes.size());
1393 for (uint32_t i = 0; i < kernel_cnodes.size(); i++) {
1394 if (!AnfAlgo::HasNodeAttr(kAttrRecursiveStart, kernel_cnodes[i])) {
1395 mem_reuse_order.emplace_back(kernel_cnodes[i]);
1396 continue;
1397 }
1398 auto label_id = AnfAlgo::GetNodeAttr<uint32_t>(kernel_cnodes[i], kAttrLabelIndex);
1399 std::vector<CNodePtr> back;
1400 auto front = HandleRecursiveCall(kernel_cnodes, label_id, &i, &back);
1401 mem_reuse_order.insert(mem_reuse_order.end(), front.begin(), front.end());
1402 mem_reuse_order.insert(mem_reuse_order.end(), back.begin(), back.end());
1403 }
1404 kernel_graph->set_mem_reuse_exec_order(mem_reuse_order);
1405 }
1406
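// Appends to *mem_reuse_order the nodes from position `index` up to and including the
// first node that carries the label of back_node, i.e. the body of the subgraph that
// the preceding label jump targets.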
1407 static void GetSubGraphExecOrder(const KernelGraph *kernel_graph, uint32_t index, const CNodePtr &back_node,
1408 std::vector<CNodePtr> *mem_reuse_order) {
1409 MS_EXCEPTION_IF_NULL(kernel_graph);
1410 MS_EXCEPTION_IF_NULL(mem_reuse_order);
1411 auto label_id = AnfAlgo::GetNodeAttr<uint32_t>(back_node, kAttrLabelIndex);
1412 auto kernel_cnodes = kernel_graph->execution_order();
1413 for (auto i = index; i < kernel_cnodes.size(); i++) {
1414 mem_reuse_order->emplace_back(kernel_cnodes[i]);
1415 if (AnfAlgo::IsLabelIndexInNode(kernel_cnodes[i], label_id)) {
1416 return;
1417 }
1418 }
1419 }
1420
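// Builds the mem-reuse execution order for graphs whose subgraphs are called multiple
// times: when a non-recursive, non-return LabelSwitch/LabelGoto jumps to a label that
// was already defined earlier, the body of that subgraph is appended again after the
// jump; recursive regions are unfolded afterwards.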
1421 void InitMemReuseExecOrder(KernelGraph *kernel_graph) {
1422 MS_EXCEPTION_IF_NULL(kernel_graph);
1423 if (!kernel_graph->subgraph_multi_call()) {
1424 return;
1425 }
1426 std::unordered_map<uint32_t, uint32_t> label_id_index_map;
1427 auto kernel_cnodes = kernel_graph->execution_order();
1428 std::vector<CNodePtr> mem_reuse_order;
1429 for (uint32_t i = 0; i < kernel_cnodes.size(); i++) {
1430 mem_reuse_order.emplace_back(kernel_cnodes[i]);
1431 if (AnfAlgo::CheckPrimitiveType(kernel_cnodes[i], prim::kPrimLabelSwitch) &&
1432 !AnfAlgo::HasNodeAttr(kAttrRecursive, kernel_cnodes[i]) &&
1433 !AnfAlgo::HasNodeAttr(kAttrReturn, kernel_cnodes[i])) {
1434 auto label_list = AnfAlgo::GetNodeAttr<std::vector<uint32_t>>(kernel_cnodes[i], kAttrLabelSwitchList);
1435 for (auto label_id : label_list) {
1436 if (label_id_index_map.find(label_id) == label_id_index_map.end()) {
1437 continue;
1438 }
1439 auto back_node = GetNextLabelSet(kernel_cnodes, i);
1440 GetSubGraphExecOrder(kernel_graph, label_id_index_map[label_id], back_node, &mem_reuse_order);
1441 }
1442 continue;
1443 }
1444 if (AnfAlgo::CheckPrimitiveType(kernel_cnodes[i], prim::kPrimLabelGoto) &&
1445 !AnfAlgo::HasNodeAttr(kAttrRecursive, kernel_cnodes[i]) &&
1446 !AnfAlgo::HasNodeAttr(kAttrReturn, kernel_cnodes[i])) {
1447 auto label_id = AnfAlgo::GetNodeAttr<uint32_t>(kernel_cnodes[i], kAttrLabelIndex);
1448 if (label_id_index_map.find(label_id) == label_id_index_map.end()) {
1449 continue;
1450 }
1451 auto back_node = GetNextLabelSet(kernel_cnodes, i);
1452 GetSubGraphExecOrder(kernel_graph, label_id_index_map[label_id], back_node, &mem_reuse_order);
1453 continue;
1454 }
1455 if (AnfAlgo::CheckPrimitiveType(kernel_cnodes[i], prim::kPrimLabelSet) &&
1456 !AnfAlgo::HasNodeAttr(kAttrRecursive, kernel_cnodes[i])) {
1457 auto label_id = AnfAlgo::GetNodeAttr<uint32_t>(kernel_cnodes[i], kAttrLabelIndex);
1458 if (label_id_index_map.find(label_id) != label_id_index_map.end()) {
1459 MS_LOG(EXCEPTION) << "Two labelsets with same label id.";
1460 }
1461 label_id_index_map[label_id] = i;
1462 continue;
1463 }
1464 }
1465 kernel_graph->set_mem_reuse_exec_order(mem_reuse_order);
1466 UnfoldRecursiveExecOrder(kernel_graph);
1467 }
1468
1469 void AscendSession::MemoryAlloc(KernelGraph *kernel_graph) const {
1470 MS_LOG(INFO) << "Start!";
1471 MS_EXCEPTION_IF_NULL(kernel_graph);
1472 InitMemReuseExecOrder(kernel_graph);
1473 auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
1474 MS_EXCEPTION_IF_NULL(runtime_instance);
1475 runtime_instance->AssignMemory(*kernel_graph);
1476 MS_LOG(INFO) << "Finish!";
1477 }
1478
1479 void AscendSession::RunOpMemoryAlloc(const std::vector<tensor::TensorPtr> &input_tensors,
1480 KernelGraph *kernel_graph) const {
1481 MS_EXCEPTION_IF_NULL(kernel_graph);
1482 auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
1483 MS_EXCEPTION_IF_NULL(runtime_instance);
1484 runtime_instance->RunOpAssignMemory(input_tensors, *kernel_graph);
1485 }
1486
1487 void AscendSession::RunOpMemoryAllocNew(const std::vector<tensor::TensorPtr> &input_tensors,
1488 const std::map<tensor::TensorPtr, session::KernelWithIndex> &tensor_to_node,
1489 const KernelGraph &kernel_graph) const {
1490 auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
1491 MS_EXCEPTION_IF_NULL(runtime_instance);
1492 runtime_instance->RunOpAssignMemory(input_tensors, kernel_graph, tensor_to_node);
1493 }
1494
1495 void AscendSession::RunOpGenKernelEvent(const KernelGraph *graph) const {
1496 MS_EXCEPTION_IF_NULL(graph);
1497 auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
1498 MS_EXCEPTION_IF_NULL(runtime_instance);
1499 runtime_instance->GenKernelEvents(*graph);
1500 }
1501
1502 void AscendSession::RunOpMemoryClear(const KernelGraph *kernel_graph) const {
1503 MS_EXCEPTION_IF_NULL(kernel_graph);
1504 auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
1505 MS_EXCEPTION_IF_NULL(runtime_instance);
1506 runtime_instance->RunOpClearMemory(*kernel_graph);
1507 }
1508
1509 void AscendSession::Load(const std::shared_ptr<KernelGraph> &kernel_graph) const {
1510 MS_LOG(INFO) << "Start!";
1511 auto context_ptr = MsContext::GetInstance();
1512 MS_EXCEPTION_IF_NULL(context_ptr);
1513 bool is_task_sink = context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK);
1514 (void)device::KernelAdjust::GetInstance().StepLoadCtrlInputs(kernel_graph);
1515 auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
1516 MS_EXCEPTION_IF_NULL(runtime_instance);
1517 bool ret_ok = runtime_instance->Load(*kernel_graph, is_task_sink);
1518 if (!ret_ok) {
1519 MS_LOG(EXCEPTION) << "Load task error!";
1520 }
1521 MS_LOG(INFO) << "Finish!";
1522 }
1523
1524 void AscendSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bool is_task) const {
1525 MS_LOG(DEBUG) << "Start!";
1526 bool is_task_sink = false;
1527 if (is_task) {
1528 auto context_ptr = MsContext::GetInstance();
1529 MS_EXCEPTION_IF_NULL(context_ptr);
1530 is_task_sink = context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK);
1531 }
1532 auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
1533 MS_EXCEPTION_IF_NULL(runtime_instance);
1534 if (is_task && is_task_sink) {
1535 #ifndef ENABLE_SECURITY
1536 DumpSetup(kernel_graph);
1537 #endif
1538 }
1539 bool ret_ok = runtime_instance->Run(*kernel_graph, is_task_sink);
1540 if (is_task && is_task_sink) {
1541 #ifndef ENABLE_SECURITY
1542 Dump(kernel_graph);
1543 #endif
1544 }
1545 if (!ret_ok) {
1546 #ifdef ENABLE_DUMP_IR
1547 mindspore::RDR::TriggerAll();
1548 #endif
1549 MS_LOG(EXCEPTION) << "Run task error!";
1550 }
1551 MS_LOG(DEBUG) << "Finish!";
1552 }
1553
1554 #ifndef ENABLE_SECURITY
1555 void AscendSession::DumpSetup(const std::shared_ptr<KernelGraph> &kernel_graph) const {
1556 MS_LOG(DEBUG) << "Start!";
1557 MS_EXCEPTION_IF_NULL(kernel_graph);
1558 E2eDump::DumpSetup(kernel_graph.get());
1559 MS_LOG(DEBUG) << "Finish!";
1560 }
1561
1562 void AscendSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
1563 MS_LOG(DEBUG) << "Start!";
1564 MS_EXCEPTION_IF_NULL(kernel_graph);
1565 E2eDump::DumpData(kernel_graph.get(), rank_id_);
1566 MS_LOG(DEBUG) << "Finish!";
1567 }
1568 #endif
1569
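// Dumps all compiled graphs when IR dumping is enabled: records each graph to RDR,
// writes .ir/.pb files when save_graphs is set, and, when e2e or async dump is
// configured, writes the trace_code_graph IR and the execution-order CSV under the
// dump path of the current rank.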
1570 void AscendSession::DumpAllGraphs(const std::vector<KernelGraphPtr> &all_graphs) {
1571 #ifdef ENABLE_DUMP_IR
1572 auto context_ptr = MsContext::GetInstance();
1573 MS_EXCEPTION_IF_NULL(context_ptr);
1574 bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
1575 auto &json_parser = DumpJsonParser::GetInstance();
1576 json_parser.Parse();
1577 if (!save_graphs && !json_parser.e2e_dump_enabled() && !json_parser.async_dump_enabled() &&
1578 !mindspore::RecorderManager::Instance().RdrEnable()) {
1579 return;
1580 }
1581 auto kernel_runtime = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
1582 MS_EXCEPTION_IF_NULL(kernel_runtime);
1583 for (auto &graph : all_graphs) {
1584 MS_EXCEPTION_IF_NULL(graph);
1585 std::string name = "graph_build." + std::to_string(graph->graph_id());
1586 DumpGraphParams dump_params = {true, static_cast<int>(kWholeStack)};
1587 (void)mindspore::RDR::RecordAnfGraph(SUBMODULE_ID, name, graph, dump_params, ".ir;.pb");
1588 if (save_graphs) {
1589 std::string file_name = "graph_build_" + std::to_string(graph->graph_id()) + ".ir";
1590 DumpIR(file_name, graph, true, kWholeStack);
1591 DumpIRProto(graph, "vm_build_" + std::to_string(graph->graph_id()));
1592 DumpIR("trace_code_graph", graph, true, kWholeStack);
1593 }
1594 std::string final_graph = "trace_code_graph_" + std::to_string(graph->graph_id());
1595 if (json_parser.e2e_dump_enabled() || json_parser.async_dump_enabled()) {
1596 std::string root_dir = json_parser.path() + "/rank_" + std::to_string(rank_id_);
1597 std::string target_dir = root_dir + "/graphs";
1598 std::string ir_file_path = target_dir + "/" + "ms_output_" + final_graph + ".ir";
1599 DumpIRProtoWithSrcInfo(graph, final_graph, target_dir, kDebugWholeStack);
1600 DumpIR("trace_code_graph", graph, true, kWholeStack, ir_file_path);
1601 DumpGraphExeOrder("ms_execution_order_graph_" + std::to_string(graph->graph_id()) + ".csv", root_dir,
1602 graph->execution_order());
1603 }
1604 }
1605 #endif
1606 }
1607
1608 void AscendSession::LoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const {
1609 MS_LOG(INFO) << "Start!";
1610 MS_EXCEPTION_IF_NULL(kernel_graph);
1611 auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
1612 MS_EXCEPTION_IF_NULL(runtime_instance);
1613 (void)runtime_instance->LoadData(*kernel_graph);
1614 MS_LOG(INFO) << "Finish!";
1615 }
1616
1617 #ifndef ENABLE_SECURITY
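// Recursively collects summary nodes: a graph without child graphs contributes its own
// summary nodes, otherwise the summaries of every child graph in the recorded graph
// execution order are merged into *summary and written back to the graph.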
1618 void AscendSession::RecurseSetSummaryNodes(KernelGraph *graph,
1619 std::map<std::string, std::pair<AnfNodePtr, int>> *summary) {
1620 MS_EXCEPTION_IF_NULL(graph);
1621 MS_EXCEPTION_IF_NULL(summary);
1622 // If the final graph has no child graph
1623 auto graph_order_iter = graph_execute_orders_.find(graph->graph_id());
1624 if (graph_order_iter == graph_execute_orders_.end()) {
1625 SessionBasic::SetSummaryNodes(graph);
1626 auto summary_nodes = graph->summary_nodes();
1627 summary->insert(summary_nodes.begin(), summary_nodes.end());
1628 return;
1629 }
1630 // for every child graph, find summary nodes
1631 auto graph_order = GetGraphOrder(graph->graph_id());
1632 for (size_t i = 0; i < graph_order.size(); i++) {
1633 auto child_graph = GetGraph(graph_order[i]);
1634 if (child_graph == nullptr) {
1635 continue;
1636 }
1637 SessionBasic::SetSummaryNodes(child_graph.get());
1638 auto child_graph_summary = child_graph->summary_nodes();
1639 summary->insert(child_graph_summary.begin(), child_graph_summary.end());
1640 RecurseSetSummaryNodes(child_graph.get(), summary);
1641 }
1642 graph->set_summary_nodes(*summary);
1643 }
1644
1645 void AscendSession::SetSummaryNodes(KernelGraph *graph) {
1646 MS_LOG(DEBUG) << "Update summary Start";
1647 MS_EXCEPTION_IF_NULL(graph);
1648 auto summary_nodes = graph->summary_nodes();
1649 std::map<std::string, std::pair<AnfNodePtr, int>> summary;
1650 summary.insert(summary_nodes.begin(), summary_nodes.end());
1651 RecurseSetSummaryNodes(graph, &summary);
1652 graph->set_summary_nodes(summary);
1653 MS_LOG(DEBUG) << "Update summary end size: " << summary.size();
1654 }
1655 #endif
1656
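// Merges the execution orders of all child graphs into the final graph: each child's
// nodes are tagged with that child's stream distinction label, its value nodes are
// added to the final graph, and its ref map is copied (a duplicated ref pair is
// treated as an error).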
1657 void AscendSession::MergeGraphExecOrder() {
1658 MS_LOG(INFO) << "Start!";
1659 // merge graph order
1660 auto &graph_order = GetGraphOrder(final_graph_id_);
1661 auto &graph_type = GetGraphOrderType(final_graph_id_);
1662 auto final_graph = GetGraph(final_graph_id_);
1663 MS_EXCEPTION_IF_NULL(final_graph);
1664 if (graph_order.empty()) {
1665 MS_LOG(WARNING) << "Graph output is a lonely variable not linked to any op!";
1666 return;
1667 }
1668 if (graph_order.size() > 1) {
1669 auto context_ptr = MsContext::GetInstance();
1670 MS_EXCEPTION_IF_NULL(context_ptr);
1671 if (!context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK)) {
1672 MS_LOG(EXCEPTION) << "Control sink network should run with task-sink mode!";
1673 }
1674 }
1675 // If the first graph is common and the final graph has no label, set the final graph's stream to that of the first graph
1676 SetStreamDistinctionLabel(final_graph, graph_order[0], false);
1677 std::vector<CNodePtr> final_exec_order = final_graph->execution_order();
1678 KernelGraphPtr last_graph = nullptr;
1679 for (size_t i = 0; i < graph_order.size(); i++) {
1680 auto graph_id = graph_order[i];
1681 if (graph_type[i] == BRANCH_END || graph_type[i] == BRANCH_START) {
1682 continue;
1683 }
1684 auto child_graph = GetGraph(graph_id);
1685 last_graph = child_graph;
1686 MS_EXCEPTION_IF_NULL(child_graph);
1687 auto exec_order = child_graph->execution_order();
1688 MS_LOG(INFO) << "Merge graph, graph_id " << graph_id;
1689 (void)std::transform(exec_order.begin(), exec_order.end(), std::back_inserter(final_exec_order),
1690 [&](CNodePtr node) -> CNodePtr {
1691 AnfAlgo::SetStreamDistinctionLabel(child_graph->stream_distinction_label(), node.get());
1692 return node;
1693 });
1694 // add all value nodes of child graphs to final graph
1695 for (auto &value_node : child_graph->graph_value_nodes()) {
1696 final_graph->AddValueNodeToGraph(value_node);
1697 }
1698 // copy ref map to final graph
1699 auto child_ref_map = child_graph->GetRefMap();
1700 for (auto &item : child_ref_map) {
1701 if (final_graph->IsInRefOutputMap(item.first)) {
1702 MS_LOG(EXCEPTION) << "The ref pair is already in final graph!";
1703 }
1704 final_graph->AddRefCorrespondPairs(item.first, item.second);
1705 }
1706 }
1707 // set final_exec_order into final graph
1708 MS_EXCEPTION_IF_NULL(final_graph);
1709 #ifndef ENABLE_SECURITY
1710 DumpGraphExeOrder(final_exec_order);
1711 #endif
1712 final_graph->set_execution_order(final_exec_order);
1713 }
1714
1715 const std::vector<GraphId> &AscendSession::GetGraphOrder(GraphId final_graph_id) const {
1716 auto graph_order_iter = graph_execute_orders_.find(final_graph_id);
1717 if (graph_order_iter == graph_execute_orders_.end()) {
1718 MS_LOG(EXCEPTION) << "Final graph " << final_graph_id << " has no child graph";
1719 }
1720 return graph_order_iter->second;
1721 }
1722
1723 const std::vector<GraphType> &AscendSession::GetGraphOrderType(GraphId final_graph_id) const {
1724 auto graph_type_iter = graph_order_types_.find(final_graph_id);
1725 if (graph_type_iter == graph_order_types_.end()) {
1726 MS_LOG(EXCEPTION) << "Final graph " << final_graph_id << " has no graph_order_types_";
1727 }
1728 return graph_type_iter->second;
1729 }
1730
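// Copies every recorded initial tensor from host memory to the device address of the
// corresponding input parameter of its target graph; an out-of-range input index or a
// failed SyncHostToDevice raises an exception.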
1731 void AscendSession::SyncInitialTenosrToDevice() {
1732 for (auto &item : initial_tenosrs_) {
1733 auto to_graph_id = item.first.first;
1734 auto input_idx = item.first.second;
1735 auto front_tensor = item.second;
1736 auto to_graph = GetGraph(to_graph_id);
1737 MS_EXCEPTION_IF_NULL(to_graph);
1738 std::vector<AnfNodePtr> graph_inputs = to_graph->inputs();
1739 if (input_idx >= graph_inputs.size()) {
1740 MS_LOG(EXCEPTION) << "Input_index " << input_idx << " out of range size " << graph_inputs.size();
1741 }
1742 auto backend_parameter = graph_inputs[input_idx];
1743 // sync data from host to device
1744 MS_EXCEPTION_IF_NULL(front_tensor);
1745 size_t tensor_size = LongToSize(front_tensor->data().nbytes());
1746 auto addr = AnfAlgo::GetOutputAddr(backend_parameter, 0);
1747 MS_EXCEPTION_IF_NULL(addr);
1748 if (!addr->SyncHostToDevice(trans::GetRuntimePaddingShape(backend_parameter, 0), tensor_size,
1749 front_tensor->data_type(), front_tensor->data_c(),
1750 front_tensor->device_info().host_format_)) {
1751 MS_LOG(EXCEPTION) << "Tensor SyncHostToDevice failed!";
1752 }
1753 }
1754 }
1755
1756 void AscendSession::RootGraphExecutorValidate(NotNull<KernelGraphPtr> graph,
1757 const std::vector<KernelGraphPtr> &all_graphs) {
1758 AscendAutoMonad auto_monad(graph);
1759 auto_monad.GenerateExecuteOrder();
1760 if (graph->label_num() > kLabelNumsThreshold) {
1761 MS_LOG(EXCEPTION) << "This model with " << all_graphs.size() << " graphs needs " << graph->label_num()
1762 << " labels, which out of range of [0, 1024).\n1. Check if front-end composition is correct.\n"
1763 << "2. Optimize model expression and reduce the number of graphs and labels.";
1764 }
1765 }
1766
1767 void AscendSession::IrFusionPass(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) {
1768 if (memo->find(graph) != memo->end()) {
1769 return;
1770 }
1771 memo->insert(graph.get());
1772 opt::AscendBackendIRFusionOptimization(graph);
1773 graph->SetExecOrderByDefault();
1774 #ifdef ENABLE_DUMP_IR
1775 auto context_ptr = MsContext::GetInstance();
1776 MS_EXCEPTION_IF_NULL(context_ptr);
1777 bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
1778 if (save_graphs) {
1779 std::string file_name = "select_kernel_before_graph_" + std::to_string(graph->graph_id()) + ".ir";
1780 DumpIR(file_name, graph.get());
1781 }
1782 #endif
1783
1784 for (auto &child_graph : graph->child_graph_order()) {
1785 IrFusionPass(NOT_NULL(child_graph.lock()), memo);
1786 }
1787 }
1788
1789 void AscendSession::SelectKernel(NotNull<KernelGraphPtr> root_graph) {
1790 MS_LOG(INFO) << "Start select kernel.";
1791 size_t raise_precision_count = 0;
1792 size_t reduce_precision_count = 0;
1793
1794 std::set<KernelGraphPtr> memo;
1795 RecurseSelectKernelInfo(root_graph, NOT_NULL(&memo), &raise_precision_count, &reduce_precision_count);
1796 memo.clear();
1797
1798 auto ms_context = MsContext::GetInstance();
1799 MS_EXCEPTION_IF_NULL(ms_context);
1800 if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kGraphMode) {
1801 if (raise_precision_count > 0) {
1802 MS_LOG(WARNING) << "There are " << raise_precision_count
1803 << " node(s) that used raise precision to select the kernel!";
1804 }
1805 if (reduce_precision_count > 0) {
1806 MS_LOG(WARNING) << "There are " << reduce_precision_count
1807 << " node(s) that used reduce precision to select the kernel!";
1808 }
1809 }
1810 MS_LOG(INFO) << "Finish!";
1811 }
1812
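// Selects kernel info for every cnode of the graph, recursing first into the child
// graphs attached to conditional control kernels via kAttrChildGraph, and counts how
// many selections raised or reduced precision.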
1813 void AscendSession::RecurseSelectKernelInfo(NotNull<KernelGraphPtr> graph,
1814 NotNull<std::set<KernelGraphPtr> *> const memo,
1815 size_t *const raise_precision_count,
1816 size_t *const reduce_precision_count) const {
1817 if (memo->find(graph) != memo->end()) {
1818 return;
1819 }
1820 memo->insert(graph.get());
1821 MS_LOG(INFO) << "Start to select kernel info in graph: " << graph->graph_id();
1822
1823 for (const auto &cnode : graph->execution_order()) {
1824 if (AnfAlgo::IsCondControlKernel(cnode)) {
1825 std::vector<KernelGraphPtr> child_graphs;
1826 if (AnfAlgo::HasNodeAttr(kAttrChildGraph, cnode)) {
1827 child_graphs = AnfAlgo::GetNodeAttr<std::vector<KernelGraphPtr>>(cnode, kAttrChildGraph);
1828 }
1829 for (auto &child_graph : child_graphs) {
1830 RecurseSelectKernelInfo(NOT_NULL(child_graph), memo, raise_precision_count, reduce_precision_count);
1831 }
1832 }
1833
1834 auto status = device::ascend::SelectKernelInfo(cnode);
1835 if (status == device::ascend::kStatusRaisePrecision) {
1836 (*raise_precision_count)++;
1837 } else if (status == device::ascend::kStatusReducePrecision) {
1838 (*reduce_precision_count)++;
1839 }
1840 }
1841 #ifdef ENABLE_DUMP_IR
1842 auto context_ptr = MsContext::GetInstance();
1843 MS_EXCEPTION_IF_NULL(context_ptr);
1844 bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
1845 if (save_graphs) {
1846 std::string file_name = "select_kernel_after_graph_" + std::to_string(graph->graph_id()) + ".ir";
1847 DumpIR(file_name, graph.get());
1848 }
1849 #endif
1850 MS_LOG(INFO) << "Finish selecting kernel info in graph: " << graph->graph_id();
1851 }
1852
1853 void AscendSession::HardwareOptimize(NotNull<KernelGraphPtr> graph,
1854 NotNull<std::set<KernelGraphPtr> *> const memo) const {
1855 if (memo->find(graph) != memo->end()) {
1856 return;
1857 }
1858 memo->insert(graph.get());
1859
1860 MS_LOG(INFO) << "Start to do HardwareOptimize in graph: " << graph->graph_id();
1861
1862 HardwareOptimize(graph.get());
1863 for (auto &child_graph : graph->child_graph_order()) {
1864 HardwareOptimize(NOT_NULL(child_graph.lock()), memo);
1865 }
1866 MS_LOG(INFO) << "Finish doing HardwareOptimize in graph: " << graph->graph_id();
1867 }
1868
1869 #ifdef ENABLE_DEBUGGER
1870 void AscendSession::LoadGraphsToDbg(NotNull<KernelGraphPtr> graph,
1871 NotNull<std::set<KernelGraphPtr> *> const memo) const {
1872 if (memo->find(graph) != memo->end()) {
1873 return;
1874 }
1875 memo->insert(graph.get());
1876
1877 MS_LOG(INFO) << "Start to do LoadGraphsToDbg in graph: " << graph->graph_id();
1878
1879 MS_EXCEPTION_IF_NULL(debugger_);
1880 debugger_->LoadGraphs(graph);
1881 MS_LOG(INFO) << "graph_sum_: " << graph_sum_;
1882 for (auto &child_graph : graph->child_graph_order()) {
1883 LoadGraphsToDbg(NOT_NULL(child_graph.lock()), memo);
1884 }
1885 MS_LOG(INFO) << "Finish doing LoadGraphsToDbg in graph: " << graph->graph_id();
1886 }
1887 #endif
1888
1889 void AscendSession::AssignStaticMemory(NotNull<KernelGraphPtr> graph,
1890 NotNull<std::set<KernelGraphPtr> *> const memo) const {
1891 if (memo->find(graph) != memo->end()) {
1892 return;
1893 }
1894 memo->insert(graph.get());
1895
1896 MS_LOG(INFO) << "Start to assign static memory for parameter in graph: " << graph->graph_id();
1897 // assign static memory for parameters
1898 auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
1899 MS_EXCEPTION_IF_NULL(runtime_instance);
1900 runtime_instance->ClearGlobalIdleMem();
1901 runtime_instance->AssignStaticMemoryInput(*graph.get());
1902 runtime_instance->AssignStaticMemoryValueNode(*graph.get());
1903 for (auto &child_graph : graph->child_graph_order()) {
1904 AssignStaticMemory(NOT_NULL(child_graph.lock()), memo);
1905 }
1906 MS_LOG(INFO) << "Finish assigning static memory for parameter in graph: " << graph->graph_id();
1907 }
1908
1909 void AscendSession::UpdateRefOutputMap(NotNull<KernelGraphPtr> graph,
1910 NotNull<std::set<KernelGraphPtr> *> const memo) const {
1911 if (memo->find(graph) != memo->end()) {
1912 return;
1913 }
1914 memo->insert(graph.get());
1915
1916 for (auto &child_graph : graph->child_graph_order()) {
1917 std::shared_ptr<KernelGraph> child_graph_ptr = child_graph.lock();
1918 MS_EXCEPTION_IF_NULL(child_graph_ptr);
1919 UpdateRefOutputMap(NOT_NULL(child_graph_ptr), memo);
1920 // copy ref map to final graph
1921 auto child_ref_map = child_graph_ptr->GetRefMap();
1922 for (auto &item : child_ref_map) {
1923 if (graph->IsInRefOutputMap(item.first)) {
1924 MS_LOG(WARNING) << "The ref pair <" << item.first.first->DebugString() << ", " << item.first.second
1925 << "> is already in " << graph->ToString();
1926 continue;
1927 }
1928 graph->AddRefCorrespondPairs(item.first, item.second);
1929 }
1930 }
1931 }
1932
1933 void AscendSession::SyncStream() const {
1934 auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
1935 MS_EXCEPTION_IF_NULL(runtime_instance);
1936 auto ret = runtime_instance->SyncStream();
1937 if (!ret) {
1938 MS_LOG(EXCEPTION) << "Sync stream error!";
1939 }
1940 }
1941
1942 std::shared_ptr<device::Bucket> AscendSession::CreateBucket(uint32_t bucket_id, uint32_t bucket_size) {
1943 auto bucket = std::make_shared<device::ascend::AscendBucket>(bucket_id, bucket_size);
1944
1945 auto kernel_runtime = device::KernelRuntimeManager::Instance().GetCurrentKernelRuntime();
1946 MS_EXCEPTION_IF_NULL(kernel_runtime);
1947 auto compute_stream = kernel_runtime->compute_stream();
1948 auto communication_stream = kernel_runtime->communication_stream();
1949 MS_EXCEPTION_IF_NULL(compute_stream);
1950 MS_EXCEPTION_IF_NULL(communication_stream);
1951
1952 MS_EXCEPTION_IF_NULL(bucket);
1953 bucket->Init({compute_stream}, {communication_stream});
1954 return bucket;
1955 }
1956
1957 void AscendSession::ReportWarningMessage() {
1958 const string &warning_message = ErrorManager::GetInstance().GetWarningMessage();
1959 if (!warning_message.empty()) {
1960 MS_LOG(WARNING) << "Ascend warning message:\n" << warning_message;
1961 }
1962 }
1963
1964 void AscendSession::ReportErrorMessage() {
1965 const string &error_message = ErrorManager::GetInstance().GetErrorMessage();
1966 if (!error_message.empty() && error_message.find(kUnknowErrorString) == string::npos) {
1967 MS_LOG(ERROR) << "Ascend error occurred, error message:\n" << error_message;
1968 }
1969 }
1970
1971 void AscendSession::SetThreadContext() { ErrorManager::GetInstance().GenWorkStreamIdDefault(); }
1972
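// Drains the PyNative task queue: all pending build tasks are compiled in one batch,
// then the queued launch tasks are executed one by one. MS_CTX_ENABLE_PYNATIVE_INFER
// is temporarily set to true and restored afterwards; any failure resets the task
// manager before rethrowing.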
1973 void AscendSession::ExecuteAllTaskInQueue() {
1974 // Execute All Task
1975 auto &task_manager = PynativeTaskManager::GetInstance();
1976 if (task_manager.QueueEmpty()) {
1977 return;
1978 }
1979
1980 try {
1981 MS_LOG(DEBUG) << "Start";
1982 auto ms_context = MsContext::GetInstance();
1983 auto infer_flag = ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER);
1984 ms_context->set_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER, true);
1985
1986 BatchBuildKernel(task_manager.GetAllBuildTasks());
1987 task_manager.ClearAllBuildTasks();
1988
1989 // Launch one by one
1990 const auto &launch_tasks = task_manager.GetAllLaunchTasks();
1991 while (!launch_tasks.empty()) {
1992 auto &launch_task = launch_tasks.front();
1993 const auto &context = launch_task->context();
1994 LaunchFunc(context->graph(), context->tensor_to_node(), context->is_dynamic_shape(), context->input_tensors());
1995 task_manager.PopLaunchTask();
1996 }
1997
1998 ms_context->set_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER, infer_flag);
1999 MS_LOG(DEBUG) << "End";
2000 } catch (const std::exception &ex) {
2001 task_manager.Reset();
2002 throw(std::runtime_error(ex.what()));
2003 } catch (...) {
2004 task_manager.Reset();
2005 std::string exName(abi::__cxa_current_exception_type()->name());
2006 MS_LOG(EXCEPTION) << "Error occurred when execute task in queue. Exception name: " << exName;
2007 }
2008 }
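// Binds each output tensor to the device address of its producing kernel output
// (skipped when the memory scheduler is enabled). Under graph task sink, outputs that
// need an immediate device-to-host sync are first copied into extra static memory,
// and dynamic-shape outputs get their host shape refreshed from the inferred shape.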
2009 void AscendSession::UpdateOutputTensors(const VectorRef *outputs,
2010 const std::map<tensor::TensorPtr, session::KernelWithIndex> &tensor_to_node,
2011 std::map<DeviceAddressPtr, DeviceAddressPtr> *) {
2012 auto context_ptr = MsContext::GetInstance();
2013 MS_EXCEPTION_IF_NULL(context_ptr);
2014 auto enable_mem_scheduler = context_ptr->get_param<bool>(MS_CTX_ENABLE_MEM_SCHEDULER);
2015 if (enable_mem_scheduler) {
2016 return;
2017 }
2018 MS_EXCEPTION_IF_NULL(outputs);
2019 tensor_device_addr_map_.clear();
2020 for (const auto &item : *outputs) {
2021 if (utils::isa<VectorRefPtr>(item)) {
2022 const auto &vector_ref = utils::cast<VectorRef>(item);
2023 std::map<DeviceAddressPtr, DeviceAddressPtr> new_to_old_device_address;
2024 UpdateOutputTensors(&vector_ref, tensor_to_node, &new_to_old_device_address);
2025 } else if (utils::isa<tensor::TensorPtr>(item)) {
2026 const auto &tensor = utils::cast<tensor::TensorPtr>(item);
2027 MS_EXCEPTION_IF_NULL(tensor);
2028 const auto &iter = tensor_to_node.find(tensor);
2029 if (iter != tensor_to_node.end()) {
2030 const auto &node = iter->second.first;
2031 size_t output_index = iter->second.second;
2032 if (!AnfAlgo::OutputAddrExist(node, output_index, true)) {
2033 continue;
2034 }
2035 const auto &address = AnfAlgo::GetMutableOutputAddr(node, output_index);
2036 tensor->set_device_address(address);
2037 if (IsVMGraphTaskSink() && tensor->NeedSyncDeviceToHostImmediately()) {
2038 auto dst_device_address = AssignExtraMemForGraphOutput(tensor, node, output_index);
2039 MS_EXCEPTION_IF_NULL(dst_device_address);
2040 if (!dst_device_address->SyncDeviceToDevice(trans::GetRuntimePaddingShape(node, output_index),
2041 address->GetSize(), address->type_id(), address->GetPtr(),
2042 address->format())) {
2043 MS_LOG(EXCEPTION) << "SyncDeviceToDevice failed!";
2044 }
2045 tensor->set_sync_status(kNoNeedSync);
2046 tensor_device_addr_map_[tensor] = dst_device_address;
2047 }
2048
2049 if (AnfAlgo::IsDynamicShape(node)) {
2050 const auto &updated_shape = AnfAlgo::GetOutputInferShape(node, output_index);
2051 ShapeVector int_shape;
2052 (void)std::transform(updated_shape.begin(), updated_shape.end(), std::back_inserter(int_shape), SizeToInt);
2053 (void)tensor->set_shape(int_shape);
2054 }
2055 }
2056 if (tensor->NeedSyncDeviceToHostImmediately()) {
2057 tensor->data_sync(false);
2058 tensor->set_device_address(nullptr);
2059 tensor->set_sync_status(kNeedSyncHostToDevice);
2060 }
2061 }
2062 }
2063 }
2064 DeviceAddressPtr AscendSession::AssignExtraMemForGraphOutput(const tensor::TensorPtr &tensor, const AnfNodePtr &node,
2065 size_t index) const {
2066 MS_EXCEPTION_IF_NULL(tensor);
2067 MS_EXCEPTION_IF_NULL(node);
2068 auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
2069 MS_EXCEPTION_IF_NULL(runtime_instance);
2070 return runtime_instance->AssignExtraStaticMem(tensor, node, index);
2071 }
2072 } // namespace session
2073 } // namespace mindspore
2074