1 /**
2  * Copyright 2019-2021 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "backend/session/ascend_session.h"
17 #include <algorithm>
18 #include <map>
19 #include <tuple>
20 #include <set>
21 #include <unordered_set>
22 #include <string>
23 #include <list>
24 
25 #include "base/core_ops.h"
26 #include "base/base_ref_utils.h"
27 #include "ir/tensor.h"
28 #include "ir/anf.h"
29 #include "common/trans.h"
30 #include "runtime/device/kernel_runtime.h"
31 #include "runtime/device/ascend/kernel_select_ascend.h"
32 #include "runtime/device/ascend/kernel_build_ascend.h"
33 #include "runtime/device/ascend/ascend_kernel_runtime.h"
34 #include "runtime/device/ascend/profiling/profiling_manager.h"
35 #include "backend/optimizer/ascend/ascend_backend_optimization.h"
36 #include "backend/optimizer/common/common_backend_optimization.h"
37 #include "backend/optimizer/ascend/mindir/space_batch_nd_attr_update.h"
38 #include "backend/optimizer/ascend/mindir/dropout_unify_mindir.h"
39 #include "backend/optimizer/ascend/mindir/maxpool_to_maxpool_with_argmax.h"
40 #include "backend/optimizer/ascend/mindir/maxpool_with_argmax_unify_mindir.h"
41 #include "backend/optimizer/ascend/mindir/conv2d_unify_mindir.h"
42 #include "backend/optimizer/ascend/mindir/optimizer_unify_output.h"
43 #include "backend/optimizer/ascend/mindir/fake_learned_scale_quant_grad_unify_mindir.h"
44 #include "backend/optimizer/ascend/mindir/sparse_softmax_cross_entropy_with_logits_unify_mindir.h"
45 #include "backend/optimizer/ascend/mindir/slice_grad_unify_mindir.h"
46 #include "backend/optimizer/ascend/mindir/avg_pool_grad_unify_mindir.h"
47 #include "backend/optimizer/ascend/mindir/bn_grad_unify_mindir.h"
48 #include "backend/optimizer/ascend/mindir/all_to_all_unify_mindir.h"
49 #include "runtime/device/kernel_adjust.h"
50 #include "runtime/device/ascend/ascend_stream_assign.h"
51 #include "backend/session/anf_runtime_algorithm.h"
52 #include "utils/ms_utils.h"
53 #include "utils/utils.h"
54 #include "utils/context/graph_kernel_flags.h"
55 #include "backend/optimizer/common/helper.h"
56 #include "runtime/device/kernel_runtime_manager.h"
57 #include "utils/config_manager.h"
58 #ifndef ENABLE_SECURITY
59 #include "debug/data_dump/dump_json_parser.h"
60 #include "debug/data_dump/e2e_dump.h"
61 #endif
62 #include "debug/anf_ir_utils.h"
63 #include "backend/optimizer/graph_kernel/graph_kernel_optimization.h"
64 #include "backend/session/ascend_auto_monad.h"
65 #include "debug/anf_ir_dump.h"
66 #include "debug/dump_proto.h"
67 #include "abstract/utils.h"
68 #ifdef ENABLE_DEBUGGER
69 #include "debug/tensor_load.h"
70 #include "debug/debugger/proto_exporter.h"
71 #else
72 #include "debug/debugger/proto_exporter_stub.h"
73 #endif
74 #include "common/util/error_manager/error_manager.h"
75 #include "toolchain/adx_datadump_server.h"
76 #ifdef ENABLE_DUMP_IR
77 #include "debug/rdr/running_data_recorder.h"
78 #include "debug/rdr/recorder_manager.h"
79 #include "debug/rdr/graph_recorder.h"
80 #endif
81 #if ENABLE_CPU && ENABLE_D
82 #include "ps/util.h"
83 #include "ps/ps_cache/ps_cache_manager.h"
84 #endif
85 #include "runtime/device/ascend/ascend_bucket.h"
86 #ifndef ENABLE_SECURITY
87 #include "profiler/device/ascend/memory_profiling.h"
88 
89 using mindspore::device::ascend::ProfilingManager;
90 using mindspore::profiler::ascend::MemoryProfiling;
91 #endif
92 
93 namespace mindspore {
94 namespace session {
95 const size_t kInvalidIndex = SIZE_MAX;
96 const size_t kLoopSinkTensorNum = 3;
97 const size_t kLoopSinkCurLoopIndex = 0;
98 const size_t kLoopSinkNextLoopIndex = 1;
99 const size_t kLoopSinkEpochIndex = 2;
100 const size_t kLabelNumsThreshold = 1023;
101 constexpr char SR_TAG[] = "sr_tag";
102 constexpr char BACKWARD[] = "backward";
103 constexpr auto kUnknowErrorString = "Unknown error occurred";
104 namespace {
105 #ifndef ENABLE_SECURITY
106 void DumpGraphExeOrder(const std::vector<CNodePtr> &execution_order, const std::string &tag = "") {
107   MS_LOG(INFO) << "Dump execution_order size " << execution_order.size();
108   MS_LOG(INFO) << "[index][stream_label][graph_id][node string]";
109   int i = 0;
110   for (auto &cnode : execution_order) {
111     MS_EXCEPTION_IF_NULL(cnode);
112     MS_LOG(INFO) << "[ " << i << "]"
113                  << "[" << AnfAlgo::GetStreamDistinctionLabel(cnode.get()) << "]"
114                  << "[" << AnfAlgo::GetGraphId(cnode.get()) << "]"
115                  << "[" << cnode->DebugString() << "]";
116     i++;
117   }
118 
119   std::stringstream buf;
120   buf << "================== execution order ==================\n";
121   if (!tag.empty()) {
122     buf << tag << "\n";
123   }
124   buf << "execution_order size: " << execution_order.size() << "\n";
125   i = 0;
126   for (auto &cnode : execution_order) {
127     MS_EXCEPTION_IF_NULL(cnode);
128     buf << i << ":\n";
129     buf << "\t" << cnode->DebugString() << "\n";
130     buf << "\t" << AnfAlgo::GetStreamDistinctionLabel(cnode.get()) << "\n";
131     buf << "\t" << AnfAlgo::GetGraphId(cnode.get()) << "\n";
132     i++;
133   }
134   buf << "================== execution order ==================\n";
135 }
136 #endif
137 
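// IsVMGraphTaskSink: true only when the context runs in graph mode, task sink is enabled and multi-graph sink is disabled.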
138 bool IsVMGraphTaskSink() {
139   auto ms_context = MsContext::GetInstance();
140   MS_EXCEPTION_IF_NULL(ms_context);
141   if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) != kGraphMode) {
142     return false;
143   }
144   if (ms_context->get_param<bool>(MS_CTX_ENABLE_TASK_SINK) == false) {
145     return false;
146   }
147   if (ms_context->get_param<bool>(MS_CTX_IS_MULTI_GRAPH_SINK) == true) {
148     return false;
149   }
150   return true;
151 }
152 
153 // Handle control flow by auto-monad.
154 void HandleControlFlow(NotNull<KernelGraphPtr> graph) {
155   AscendAutoMonad auto_monad(graph);
156   auto_monad.Run();
157 }
158 
159 void SetStreamDistinctionLabel(const KernelGraphPtr &graph, uint32_t label, bool is_override) {
160   MS_EXCEPTION_IF_NULL(graph);
161   if (is_override || graph->stream_distinction_label() == kInvalidDistincLabel) {
162     graph->set_stream_distinction_label(label);
163   }
164 }
165 
166 TensorPtr GetCNodeOutputStubTensor(const KernelWithIndex &kernel_with_index,
167                                    const std::map<KernelWithIndex, OutputTensorInfo> &node_output_info,
168                                    bool *output_is_weight) {
169   MS_EXCEPTION_IF_NULL(output_is_weight);
170   const auto &iter = node_output_info.find(kernel_with_index);
171   if (iter == node_output_info.end()) {
172     MS_LOG(EXCEPTION) << "Can not find output stub tensor of cnode " << kernel_with_index.first->DebugString();
173   }
174   *output_is_weight = iter->second.is_weight;
175   return iter->second.output_stub_tensor;
176 }
177 
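// GenOpOutputStubTensor: for every referenced output of the single-op graph, create a host stub tensor that
// carries the inferred shape/dtype plus the device format/type, and record whether the producer is a weight.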
178 void GenOpOutputStubTensor(const KernelGraphPtr &single_op_graph, const CNodePtr &kernel,
179                            const std::map<KernelWithIndex, size_t> &cnode_refcount,
180                            std::map<KernelWithIndex, OutputTensorInfo> *op_output_info) {
181   MS_EXCEPTION_IF_NULL(single_op_graph);
182   MS_EXCEPTION_IF_NULL(kernel);
183   MS_EXCEPTION_IF_NULL(op_output_info);
184   OutputTensorInfo output_tensor_info;
185   size_t out_idx = 0;
186   for (const auto &output : single_op_graph->outputs()) {
187     KernelWithIndex kernel_with_index = std::make_pair(kernel, out_idx++);
188     if (cnode_refcount.find(kernel_with_index) == cnode_refcount.end()) {
189       continue;
190     }
191     const auto &output_kernel_with_index = AnfAlgo::VisitKernel(output, 0);
192     const auto &output_node = output_kernel_with_index.first;
193     const auto &output_index = output_kernel_with_index.second;
194     auto out_abstract = output_node->abstract();
195     MS_EXCEPTION_IF_NULL(out_abstract);
196     if (out_abstract->isa<abstract::AbstractTuple>()) {
197       out_abstract = out_abstract->cast<abstract::AbstractTuplePtr>()->elements()[output_index];
198       MS_EXCEPTION_IF_NULL(out_abstract);
199     }
200     abstract::AbstractTensorPtr tensor_abstract = out_abstract->cast<abstract::AbstractTensorPtr>();
201     MS_EXCEPTION_IF_NULL(tensor_abstract);
202     const auto &infer_type = AnfAlgo::GetOutputInferDataType(output_node, output_index);
203     tensor::TensorPtr stub_output_tensor =
204       std::make_shared<tensor::Tensor>(infer_type, tensor_abstract->shape()->shape(), nullptr);
205     const auto &output_type = AnfAlgo::GetOutputDeviceDataType(output_node, output_index);
206     const auto &output_format = AnfAlgo::GetOutputFormat(output_node, output_index);
207     tensor::DeviceInfo device_info;
208     device_info.format_ = output_format;
209     device_info.data_type_ = TypeIdToType(output_type);
210     stub_output_tensor->set_device_info(device_info);
211     device::DeviceAddressPtr device_address =
212       std::make_shared<device::ascend::AscendDeviceAddress>(nullptr, 0, output_format, output_type);
213     stub_output_tensor->set_device_address(device_address);
214     output_tensor_info.output_stub_tensor = stub_output_tensor;
215     auto kernel_info = dynamic_cast<const device::KernelInfo *>(output_node->kernel_info());
216     MS_EXCEPTION_IF_NULL(kernel_info);
217     output_tensor_info.is_weight = !(kernel_info->is_feature_map());
218     (*op_output_info)[kernel_with_index] = output_tensor_info;
219   }
220 }
221 
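// LoadCtrlInputTensor: reset the loop-sink control tensors (current loop, next loop, epoch) on the host and either
// append them to the graph inputs or sync them straight to the device; returns the number of control tensors.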
222 size_t LoadCtrlInputTensor(const std::shared_ptr<KernelGraph> &graph, std::vector<tensor::TensorPtr> *inputs) {
223   MS_EXCEPTION_IF_NULL(graph);
224   MS_LOG(DEBUG) << "Load kInputCtrlTensors";
225   auto inputs_params = graph->input_ctrl_tensors();
226   if (inputs_params == nullptr) {
227     return 0;
228   }
229   if (inputs_params->size() < kLoopSinkTensorNum) {
230     MS_LOG(EXCEPTION) << "Illegal inputs_params size";
231   }
232   // reset the current loop tensor to 0 for each iteration
233   auto cur_loop_tensor = (*inputs_params)[kLoopSinkCurLoopIndex];
234   MS_EXCEPTION_IF_NULL(cur_loop_tensor);
235   auto *cur_val = static_cast<int32_t *>(cur_loop_tensor->data_c());
236   MS_EXCEPTION_IF_NULL(cur_val);
237   *cur_val = 0;
238   cur_loop_tensor->set_sync_status(kNeedSyncHostToDevice);
239   // set loop_count to zero
240   if (inputs != nullptr) {
241     inputs->push_back(cur_loop_tensor);
242   } else {
243     auto device_address = cur_loop_tensor->device_address();
244     if (!device_address->SyncHostToDevice(cur_loop_tensor->shape(), LongToSize(cur_loop_tensor->data().nbytes()),
245                                           cur_loop_tensor->data_type(), cur_loop_tensor->data_c(),
246                                           cur_loop_tensor->device_info().host_format_)) {
247       MS_LOG(EXCEPTION) << "SyncHostToDevice failed for cur_loop_tensor needed for async dump.";
248     }
249   }
250 
251   // reset the next loop tensor to 0 for each iteration
252   auto next_loop_tensor = (*inputs_params)[kLoopSinkNextLoopIndex];
253   MS_EXCEPTION_IF_NULL(next_loop_tensor);
254   auto *next_val = static_cast<int32_t *>(next_loop_tensor->data_c());
255   MS_EXCEPTION_IF_NULL(next_val);
256   *next_val = 0;
257   next_loop_tensor->set_sync_status(kNeedSyncHostToDevice);
258   // set loop_count to zero
259   if (inputs != nullptr) {
260     inputs->push_back(next_loop_tensor);
261   } else {
262     auto device_address = next_loop_tensor->device_address();
263     if (!device_address->SyncHostToDevice(next_loop_tensor->shape(), LongToSize(next_loop_tensor->data().nbytes()),
264                                           next_loop_tensor->data_type(), next_loop_tensor->data_c(),
265                                           next_loop_tensor->device_info().host_format_)) {
266       MS_LOG(EXCEPTION) << "SyncHostToDevice failed for next_loop_tensor needed for async dump.";
267     }
268   }
269 
270   auto epoch_tensor = (*inputs_params)[kLoopSinkEpochIndex];
271   MS_EXCEPTION_IF_NULL(epoch_tensor);
272   auto *epoch_val = static_cast<int32_t *>(epoch_tensor->data_c());
273   MS_EXCEPTION_IF_NULL(epoch_val);
274   *epoch_val = SizeToInt(graph->current_epoch());
275   epoch_tensor->set_sync_status(kNeedSyncHostToDevice);
276   if (inputs != nullptr) {
277     inputs->push_back(epoch_tensor);
278   } else {
279     auto device_address = epoch_tensor->device_address();
280     if (!device_address->SyncHostToDevice(epoch_tensor->shape(), LongToSize(epoch_tensor->data().nbytes()),
281                                           epoch_tensor->data_type(), epoch_tensor->data_c(),
282                                           epoch_tensor->device_info().host_format_)) {
283       MS_LOG(EXCEPTION) << "SyncHostToDevice failed for epoch_tensor needed for async dump.";
284     }
285   }
286   MS_LOG(DEBUG) << "Load epoch_val:" << *epoch_val;
287   graph->set_current_epoch(graph->current_epoch() + 1);
288   return inputs_params->size();
289 }
290 
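// UpdateCtrlInputTensor: in dataset sink mode, or when async dump is disabled, the control tensors are delivered
// as graph inputs; otherwise they are synced to the device directly so async dump sees the refreshed values.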
291 void UpdateCtrlInputTensor(const std::shared_ptr<KernelGraph> &graph, std::vector<tensor::TensorPtr> *inputs,
292                            size_t *input_ctrl_size) {
293   if (graph->input_ctrl_tensors()) {
294     auto &dump_json_parser = DumpJsonParser::GetInstance();
295     bool sink_mode = (ConfigManager::GetInstance().dataset_mode() == DS_SINK_MODE || graph->IsDatasetGraph());
296     if (sink_mode || !dump_json_parser.async_dump_enabled()) {
297       *input_ctrl_size = LoadCtrlInputTensor(graph, inputs);
298     } else {
299       LoadCtrlInputTensor(graph, nullptr);
300     }
301   }
302 }
303 
304 bool NeedMemcpyInDevice(const device::DeviceAddressPtr &src_device_addr,
305                         const device::DeviceAddressPtr &dst_device_addr) {
306   MS_EXCEPTION_IF_NULL(dst_device_addr);
307   if (src_device_addr.get() == nullptr) {
308     return false;
309   }
310   if (src_device_addr->DeviceType() == dst_device_addr->DeviceType() &&
311       src_device_addr->format() == dst_device_addr->format() &&
312       src_device_addr->type_id() == dst_device_addr->type_id()) {
313     return true;
314   }
315   return false;
316 }
317 
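// TensorNeedSync: decide whether a parameter tensor must be copied from host to device. With graph task sink and a
// compatible device address (same device type, format and dtype), a device-to-device copy is issued instead and the
// host-to-device sync is skipped.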
318 bool TensorNeedSync(const std::shared_ptr<KernelGraph> &kernel_graph, const AnfNodePtr &parameter,
319                     const tensor::TensorPtr &tensor, uint32_t *memcpy_nums) {
320   MS_EXCEPTION_IF_NULL(tensor);
321   if (tensor->NeedSyncHostToDevice()) {
322     return true;
323   }
324   auto ms_context = MsContext::GetInstance();
325   MS_EXCEPTION_IF_NULL(ms_context);
326   auto device_address = AnfAlgo::GetMutableOutputAddr(parameter, 0);
327   if (ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER)) {
328     return tensor->device_address().get() == nullptr || tensor->device_address() != device_address;
329   }
330   auto tensor_address = std::dynamic_pointer_cast<device::DeviceAddress>(tensor->device_address());
331   if (tensor_address != device_address) {
332     if (!kernel_graph->is_dynamic_shape() && IsVMGraphTaskSink() &&
333         NeedMemcpyInDevice(tensor_address, device_address)) {
334       auto status = device_address->SyncDeviceToDevice(trans::GetRuntimePaddingShape(parameter, 0),
335                                                        tensor_address->GetSize(), tensor_address->type_id(),
336                                                        tensor_address->GetPtr(), tensor_address->format());
337       if (status == false) {
338         MS_LOG(EXCEPTION) << "SyncDeviceToDevice failed.";
339       }
340       MS_EXCEPTION_IF_NULL(memcpy_nums);
341       (*memcpy_nums)++;
342 #if ((defined ENABLE_CPU) && (!defined _WIN32))
343       const std::string &param_name = parameter->fullname_with_scope();
344       if (ps::ps_cache_instance.IsHashTable(param_name)) {
345         return false;
346       }
347 #endif
348       auto input_param = parameter->cast<ParameterPtr>();
349       MS_EXCEPTION_IF_NULL(input_param);
350       if (AnfAlgo::IsParameterWeight(input_param) || kernel_graph->IsUpdatedParameter(input_param)) {
351         tensor->set_device_address(device_address);
352       }
353       if (kernel_graph->IsUpdatedParameter(input_param)) {
354         tensor->SetIsUpdateByDevice();
355       }
356       return false;
357     } else {
358       tensor->data_sync(false);
359       return true;
360     }
361   }
362   return false;
363 }
364 
365 void AddGraphToManager(const NotNull<KernelGraphPtr> graph, NotNull<FuncGraphManagerPtr> manager,
366                        NotNull<std::set<KernelGraphPtr> *> memo) {
367   if (memo->find(graph) != memo->end()) {
368     return;
369   }
370   memo->insert(graph.get());
371   manager->AddFuncGraph(graph.get(), false);
372 
373   for (auto &child_graph : graph->child_graph_order()) {
374     AddGraphToManager(NOT_NULL(child_graph.lock()), manager, memo);
375   }
376 }
377 }  // namespace
378 
379 void AscendSession::Init(uint32_t device_id) { InitExecutor(kAscendDevice, device_id); }
380 
381 void AscendSession::UnifyMindIR(const KernelGraphPtr &graph) {
382   SessionBasic::UnifyMindIR(graph);
383   auto context_ptr = MsContext::GetInstance();
384   MS_EXCEPTION_IF_NULL(context_ptr);
385 #ifdef ENABLE_DUMP_IR
386   bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
387   if (save_graphs) {
388     std::string file_name = "hwopt_d_before_unify_mindir_graph_" + std::to_string(graph->graph_id()) + ".ir";
389     DumpIR(file_name, graph);
390     DumpIRProto(graph, "before_unify_mindir_hwopt_" + std::to_string(graph->graph_id()));
391   }
392 #endif
393   auto optimizer = std::make_shared<opt::GraphOptimizer>();
394   auto unify_mindir_pm = std::make_shared<opt::PassManager>("unify_mindir_pm");
395   unify_mindir_pm->AddPass(std::make_shared<opt::SpaceToBatchNDAttrUpdate>());
396   unify_mindir_pm->AddPass(std::make_shared<opt::BatchToSpaceNDAttrUpdate>());
397   unify_mindir_pm->AddPass(std::make_shared<opt::MaxPool2MaxPoolWithArgmax>());
398   unify_mindir_pm->AddPass(std::make_shared<opt::MaxPoolWithArgmaxUnifyMindIR>());
399   unify_mindir_pm->AddPass(std::make_shared<opt::MaxPoolGradWithArgmaxUnifyMindIR>());
400   unify_mindir_pm->AddPass(std::make_shared<opt::Conv2DUnifyMindIR>());
401   unify_mindir_pm->AddPass(std::make_shared<opt::Conv2DBackpropInputUnifyMindIR>());
402   unify_mindir_pm->AddPass(std::make_shared<opt::Conv2DBackpropFilterUnifyMindIR>());
403   unify_mindir_pm->AddPass(std::make_shared<opt::SliceGradUnifyMindIR>());
404   unify_mindir_pm->AddPass(std::make_shared<opt::AvgPoolGradUnifyMindIR>());
405   unify_mindir_pm->AddPass(std::make_shared<opt::FtrlUnifyOutput>());
406   unify_mindir_pm->AddPass(std::make_shared<opt::MomentumUnifyOutput>());
407   unify_mindir_pm->AddPass(std::make_shared<opt::RMSPropUnifyOutput>());
408   unify_mindir_pm->AddPass(std::make_shared<opt::CenteredRMSPropUnifyOutput>());
409   unify_mindir_pm->AddPass(std::make_shared<opt::FakeLearnedScaleQuantPerLayerGradUnifyMindIR>());
410   unify_mindir_pm->AddPass(std::make_shared<opt::FakeLearnedScaleQuantPerChannelGradUnifyMindIR>());
411   auto ms_context = MsContext::GetInstance();
412   MS_EXCEPTION_IF_NULL(ms_context);
413   if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kGraphMode) {
414     unify_mindir_pm->AddPass(std::make_shared<opt::DropoutAndDropoutGradUnifyMindIR>());
415     unify_mindir_pm->AddPass(std::make_shared<opt::DropoutUnifyMindIR0>());
416     unify_mindir_pm->AddPass(std::make_shared<opt::GradSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
417     unify_mindir_pm->AddPass(std::make_shared<opt::GradSparseSoftmaxCrossEntropyWithLogitsUnifyMindIRV2>());
418     unify_mindir_pm->AddPass(std::make_shared<opt::SparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
419   } else {
420     // Add the PynativeGradSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR pass first to prevent the backward loss function
421     // from the python frontend from matching the pattern defined in PynativeSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR.
422     unify_mindir_pm->AddPass(std::make_shared<opt::PynativeGradSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
423     unify_mindir_pm->AddPass(std::make_shared<opt::PynativeSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
424   }
425   unify_mindir_pm->AddPass(std::make_shared<opt::DropoutUnifyMindIR1>());
426   unify_mindir_pm->AddPass(std::make_shared<opt::DropoutGradUnifyMindIR>());
427   unify_mindir_pm->AddPass(std::make_shared<opt::BatchNormGradUnifyMindIR>());
428   unify_mindir_pm->AddPass(std::make_shared<opt::NeighborExchangeUnifyMindIR>());
429   unify_mindir_pm->AddPass(std::make_shared<opt::AllToAllUnifyMindIR>());
430 
431   optimizer->AddPassManager(unify_mindir_pm);
432   (void)optimizer->Optimize(graph);
433   graph->SetExecOrderByDefault();
434 #ifdef ENABLE_DUMP_IR
435   if (save_graphs) {
436     std::string file_name = "hwopt_d_after_unify_mindir_graph_" + std::to_string(graph->graph_id()) + ".ir";
437     DumpIR(file_name, graph);
438   }
439 #endif
440 }
441 
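// LoadInputData: refresh the control tensors, then copy every parameter tensor that still needs a sync to the
// device; when device-to-device copies were issued, a device event orders them against the model stream.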
442 void AscendSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
443                                   const std::vector<tensor::TensorPtr> &inputs_const) const {
444   std::vector<tensor::TensorPtr> inputs(inputs_const);
445   size_t input_ctrl_size = kLoopSinkTensorNum;
446   uint32_t device_memcpy_nums = 0;
447   MS_EXCEPTION_IF_NULL(kernel_graph);
448   UpdateCtrlInputTensor(kernel_graph, &inputs, &input_ctrl_size);
449   auto &input_nodes = kernel_graph->input_nodes();
450   if ((inputs.size() + input_ctrl_size) - kLoopSinkTensorNum != input_nodes.size()) {
451     MS_LOG(EXCEPTION) << "Tensor input:" << inputs.size() << " is not equal graph inputs:" << input_nodes.size()
452                       << ", input_ctrl_size:" << input_ctrl_size;
453   }
454   auto ms_context = MsContext::GetInstance();
455   MS_EXCEPTION_IF_NULL(ms_context);
456   auto enable_mem_scheduler = ms_context->get_param<bool>(MS_CTX_ENABLE_MEM_SCHEDULER);
457   if (enable_mem_scheduler) {
458     kernel_graph->SetInputTensors(inputs);
459     return;
460   }
461   for (auto item : tensor_device_addr_map_) {
462     auto output_tensor = item.first;
463     output_tensor->set_device_address(item.second);
464   }
465   SyncStream();
466   for (size_t i = 0; i < inputs.size(); ++i) {
467     auto tensor = inputs[i];
468     MS_EXCEPTION_IF_NULL(tensor);
469     auto input_node = input_nodes[i];
470     MS_EXCEPTION_IF_NULL(input_node);
471     auto size = LongToSize(tensor->data().nbytes());
472     if (!input_node->isa<Parameter>()) {
473       continue;
474     }
475     auto input_param = input_node->cast<ParameterPtr>();
476     MS_EXCEPTION_IF_NULL(input_param);
477     if (!input_param->IsUsedByRealKernelInGraph(kernel_graph->graph_id())) {
478       tensor->set_sync_status(kNoNeedSync);
479       continue;
480     } else if (input_param->has_dynamic_shape()) {
481       auto tensor_shape = tensor->shape();
482       std::vector<size_t> shape_tmp;
483       (void)std::transform(tensor_shape.begin(), tensor_shape.end(), std::back_inserter(shape_tmp), IntToSize);
484       AnfAlgo::SetOutputInferTypeAndShape({AnfAlgo::GetOutputInferDataType(input_node, 0)}, {shape_tmp},
485                                           input_node.get());
486       size = abstract::ShapeSize(shape_tmp) * abstract::TypeIdSize(tensor->data_type());
487     }
488     if (AnfAlgo::OutputAddrExist(input_node, 0) &&
489         TensorNeedSync(kernel_graph, input_node, tensor, &device_memcpy_nums)) {
490 #if ((defined ENABLE_CPU) && (!defined _WIN32))
491       const std::string &param_name = input_node->fullname_with_scope();
492       if (ps::ps_cache_instance.IsHashTable(param_name)) {
493         continue;
494       }
495 #endif
496       auto device_address = AnfAlgo::GetMutableOutputAddr(input_node, 0);
497       MS_EXCEPTION_IF_NULL(device_address);
498       if (size != 0 &&
499           !device_address->SyncHostToDevice(trans::GetRuntimePaddingShape(input_node, 0), size, tensor->data_type(),
500                                             tensor->data_c(), tensor->device_info().host_format_)) {
501         MS_LOG(EXCEPTION) << "SyncHostToDevice failed.";
502       }
503       if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode ||
504           AnfAlgo::IsParameterWeight(input_param) || kernel_graph->IsUpdatedParameter(input_param)) {
505         tensor->set_device_address(device_address);
506       }
507       if (kernel_graph->IsUpdatedParameter(input_param)) {
508         tensor->SetIsUpdateByDevice();
509       }
510     }
511     tensor->set_sync_status(kNoNeedSync);
512   }
513   if (device_memcpy_nums > 0) {
514     auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
515     MS_EXCEPTION_IF_NULL(runtime_instance);
516     auto compute_stream = runtime_instance->compute_stream();
517     auto model_stream = runtime_instance->GetModelStream(kernel_graph->graph_id());
518     auto memcpy_event = runtime_instance->CreateDeviceEvent();
519     memcpy_event->set_wait_stream(model_stream);
520     memcpy_event->set_record_stream(compute_stream);
521     memcpy_event->RecordEvent();
522     memcpy_event->WaitEvent();
523   }
524 }
525 
526 GraphId AscendSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) {
527   MS_LOG(INFO) << "Start";
528   // construct the graph; graph_sum_ is increased by 1 on success
529   auto graph = ConstructKernelGraph(lst, outputs);
530   auto graph_id = graph->graph_id();
531   InitAllBucket(graph);
532   MS_LOG(INFO) << "Compile graph " << graph_id << " success";
533   return graph_id;
534 }
535 
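// Whole-graph compilation pipeline: construct the kernel graphs, unify MindIR, run backend optimizations, resolve
// control flow with auto-monad, select and build kernels, assign streams and memory, and load the tasks to the
// device before returning the root graph id.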
536 GraphId AscendSession::CompileGraphImpl(NotNull<FuncGraphPtr> func_graph) {
537   MS_LOG(INFO) << "Start";
538   std::vector<KernelGraphPtr> all_graphs;
539   auto root_graph = ConstructKernelGraph(func_graph, &all_graphs);
540   for (const auto &graph : all_graphs) {
541     graph->set_root_graph_id(root_graph->graph_id());
542   }
543   UnifyMindIR(root_graph);
544   // Update Graph Dynamic Shape Attr
545   UpdateAllGraphDynamicShapeAttr(all_graphs);
546   opt::BackendCommonOptimization(root_graph);
547   // an empty graph does not enter the backend
548   if (root_graph->execution_order().empty()) {
549     MS_LOG(INFO) << root_graph->ToString() << " is empty graph.";
550     AnfAlgo::InsertMakeTupleForOutput(NOT_NULL(root_graph));
551     root_graph->set_executable(false);
552     InitRuntimeResource();
553     return root_graph->graph_id();
554   }
555 
556   // Handle control flow by auto-monad.
557   HandleControlFlow(NOT_NULL(root_graph));
558 
559   std::set<KernelGraphPtr> memo;
560   // add all graphs to the manager first, so that the following passes do not have to create a new manager.
561   auto manager = Manage(root_graph, true);
562   AddGraphToManager(NOT_NULL(root_graph), NOT_NULL(manager), NOT_NULL(&memo));
563   memo.clear();
564 
565   // resource initialize
566   InitRuntimeResource();
567 
568   IrFusionPass(NOT_NULL(root_graph), NOT_NULL(&memo));
569   memo.clear();
570   SelectKernel(NOT_NULL(root_graph));
571   memo.clear();
572 
573   HardwareOptimize(NOT_NULL(root_graph), NOT_NULL(&memo));
574   memo.clear();
575 #ifdef ENABLE_DEBUGGER
576   // load graphs to debugger.
577   if (debugger_ && debugger_->DebuggerBackendEnabled()) {
578     LoadGraphsToDbg(NOT_NULL(root_graph), NOT_NULL(&memo));
579   }
580 #endif
581   memo.clear();
582   UpdateRefOutputMap(NOT_NULL(root_graph), NOT_NULL(&memo));
583   memo.clear();
584   // add make_tuple to the output graph
585   AnfAlgo::InsertMakeTupleForOutput(NOT_NULL(root_graph));
586   // validate the root_graph, including generating the execution order and so on
587   RootGraphExecutorValidate(NOT_NULL(root_graph), all_graphs);
588 #ifdef ENABLE_DUMP_IR
589   // dump graph before remove nop nodes
590   auto context_ptr = MsContext::GetInstance();
591   MS_EXCEPTION_IF_NULL(context_ptr);
592   bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
593   if (save_graphs) {
594     DumpIRProto(root_graph, "before_removeNop_" + std::to_string(graph_sum_));
595   }
596 #endif
597 
598   // adjust kernel
599   AdjustKernel(root_graph);
600 #if ENABLE_CPU && ENABLE_D
601   InitPsWorker(root_graph);
602 #endif
603   // assign stream
604   AssignStream(NOT_NULL(root_graph));
605 #ifndef ENABLE_SECURITY
606   // insert profiling point
607   device::KernelAdjust::GetInstance().Profiling(NOT_NULL(root_graph.get()));
608 #endif
609   device::KernelAdjust::GetInstance().InsertOverflowCheckOperations(NOT_NULL(root_graph));
610   // build kernel
611   BuildKernel(root_graph);
612 #ifndef ENABLE_SECURITY
613   SetSummaryNodes(root_graph.get());
614 #endif
615   // Alloc memory for child graph's inputs
616   AssignStaticMemory(NOT_NULL(root_graph), NOT_NULL(&memo));
617   memo.clear();
618   // Alloc memory for root graph's inputs and node's outputs, workspace
619   MemoryAlloc(root_graph.get());
620   // generate and load task into device
621   Load(root_graph);
622   root_graph->SetInputNodes();
623   root_graph->SetOptimizerFlag();
624   DumpAllGraphs(all_graphs);
625   // Save memory profiling data to proto file
626 #ifndef ENABLE_SECURITY
627   auto profiling_instance = MemoryProfiling::GetInstance();
628   if (profiling_instance.IsMemoryProfilingEnable()) {
629     auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
630     MS_EXCEPTION_IF_NULL(runtime_instance);
631     uint64_t mem_size = runtime_instance->GetAvailableMemMaxSize();
632     profiling_instance.SetDeviceMemSize(mem_size);
633     profiling_instance.SaveMemoryProfiling();
634   }
635 #endif
636   // return the root_graph id to backend
637   auto graph_id = root_graph->graph_id();
638   return graph_id;
639 }
640 
641 #ifndef ENABLE_SECURITY
642 void AscendSession::SetFinalGraphSummaryFlag(const std::shared_ptr<KernelGraph> &kernel_graph) {
643   MS_EXCEPTION_IF_NULL(kernel_graph);
644   auto graph_order = GetGraphOrder(kernel_graph->graph_id());
645   for (auto graph_id : graph_order) {
646     auto child_graph = GetGraph(graph_id);
647     if (child_graph == nullptr) {
648       continue;
649     }
650     if (child_graph->summary_node_exist()) {
651       kernel_graph->set_summary_node_exist(true);
652       return;
653     }
654   }
655   kernel_graph->set_summary_node_exist(false);
656 }
657 #endif
658 
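// BuildGraphImpl: compile the child graph, adjust kernels, assign streams, build kernels, then allocate memory and
// load the task to the device (skipped in precompile-only mode) and sync the initial const tensors.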
659 void AscendSession::BuildGraphImpl(GraphId graph_id) {
660   MS_LOG(INFO) << "Start";
661   auto graph = GetGraph(graph_id);
662   MS_EXCEPTION_IF_NULL(graph);
663   // resource initialize
664   InitRuntimeResource();
665   // multiple graph handle
666   if (graph_id == final_graph_id_) {
667     MS_LOG(EXCEPTION) << "Unexpected graph id:" << graph_id << ", final_graph_id_:" << final_graph_id_;
668   }
669   auto single_graph = GetGraph(graph_id);
670   MS_EXCEPTION_IF_NULL(single_graph);
671   CompileChildGraph(single_graph);
672   // set the distinction label of single graph
673   single_graph->set_stream_distinction_label(graph_id);
674   single_graph->UpdateExecuteKernelStreamLabel();
675   // adjust the execution order because of child graph merging and other special operations
676   AdjustKernel(graph);
677 #if ENABLE_CPU && ENABLE_D
678   InitPsWorker(graph);
679 #endif
680   // Assign streams for control sink and hccl and so on
681   AssignStream(NOT_NULL(graph));
682 #ifndef ENABLE_SECURITY
683   device::KernelAdjust::GetInstance().Profiling(NOT_NULL(graph.get()));
684 #endif
685   device::KernelAdjust::GetInstance().InsertOverflowCheckOperations(NOT_NULL(graph));
686   // build kernel if node is cnode
687   BuildKernel(graph);
688   auto ms_context = MsContext::GetInstance();
689   MS_EXCEPTION_IF_NULL(ms_context);
690 #ifdef ENABLE_DEBUGGER
691   if (debugger_ && debugger_->partial_memory()) {
692     debugger_->PreExecute(graph);
693   }
694 #endif
695   if (ms_context->get_param<bool>(MS_CTX_PRECOMPILE_ONLY)) {
696     MS_LOG(INFO) << "Precompile only, stop in build kernel step";
697   } else {
698     // alloc memory, including static memory and dynamic memory
699     MemoryAlloc(graph.get());
700     auto enable_mem_scheduler = ms_context->get_param<bool>(MS_CTX_ENABLE_MEM_SCHEDULER);
701     if (!enable_mem_scheduler) {
702       AnfAlgo::CacheAddrForGraph(graph);
703     }
704     // generate and load task info to device if it is sink mode
705     Load(graph);
706   }
707   // sync the initial const tensor to device
708   SyncInitialTenosrToDevice();
709   DumpAllGraphs({graph});
710   MS_LOG(INFO) << "End";
711 }
712 
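// CompileChildGraph: IR fusion, kernel selection and hardware optimization for a single child graph, plus static
// memory assignment for its inputs and value nodes when the memory scheduler is disabled.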
713 void AscendSession::CompileChildGraph(const KernelGraphPtr &child_graph) {
714   MS_EXCEPTION_IF_NULL(child_graph);
715   MS_LOG(INFO) << "CompileChildGraph " << child_graph->ToString();
716   opt::AscendBackendIRFusionOptimization(child_graph);
717   child_graph->SetExecOrderByDefault();
718 #ifdef ENABLE_DUMP_IR
719   auto context_ptr = MsContext::GetInstance();
720   MS_EXCEPTION_IF_NULL(context_ptr);
721   bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
722   if (save_graphs) {
723     std::string file_name = "select_kernel_before_graph_" + std::to_string(child_graph->graph_id()) + ".ir";
724     DumpIR(file_name, child_graph);
725   }
726 #endif
727   // select kernel build info
728   SelectKernel(*child_graph);
729 #ifdef ENABLE_DUMP_IR
730   if (save_graphs) {
731     std::string file_name = "select_kernel_after_graph_" + std::to_string(child_graph->graph_id()) + ".ir";
732     DumpIR(file_name, child_graph);
733   }
734 #endif
735   // optimize graph
736   HardwareOptimize(child_graph);
737   // assign static memory of parameters
738   auto enable_mem_scheduler = context_ptr->get_param<bool>(MS_CTX_ENABLE_MEM_SCHEDULER);
739   if (!enable_mem_scheduler) {
740     auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
741     MS_EXCEPTION_IF_NULL(runtime_instance);
742     runtime_instance->AssignStaticMemoryInput(*child_graph);
743     runtime_instance->AssignStaticMemoryValueNode(*child_graph);
744   }
745 }
746 
747 bool AscendSession::IsSupportSummary() { return !device::KernelAdjust::NeedInsertSwitch(); }
748 
749 void AscendSession::PreExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph,
750                                     const std::vector<tensor::TensorPtr> &inputs, VectorRef *const) {
751 #ifdef ENABLE_DEBUGGER
752   if (debugger_) {
753     debugger_->PreExecute(kernel_graph);
754   }
755 #endif
756 #if ENABLE_CPU && ENABLE_D
757   // Initialize parameter server
758   InitPSParamAndOptim(kernel_graph, inputs);
759   std::string channel_name;
760   if (ps::PsDataPrefetch::GetInstance().cache_enable() && IsGetNextGraph(kernel_graph, &channel_name)) {
761     ps::ps_cache_instance.IncreaseGraphStep(channel_name);
762   }
763 #endif
764 }
765 
766 void AscendSession::PostExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph,
767                                      const std::vector<tensor::TensorPtr> &, VectorRef *const) {
768   // summary
769 #ifndef ENABLE_SECURITY
770   Summary(kernel_graph.get());
771 #endif
772 #ifdef ENABLE_DEBUGGER
773   // load tensor from device for debugger
774   if (debugger_ && debugger_->debugger_enabled()) {
775     LoadTensor(kernel_graph);
776   }
777   // debugger post-execution processing
778   if (debugger_) {
779     debugger_->PostExecute();
780   }
781 #endif
782 }
783 
784 void AscendSession::ExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph) { Execute(kernel_graph, true); }
785 
786 void AscendSession::RunOpHardwareOptimize(const std::shared_ptr<session::KernelGraph> &kernel_graph) const {
787   MS_LOG(INFO) << "HardwareOptimize Start";
788   opt::RunOpAscendBackendOptimization(kernel_graph);
789   MS_LOG(INFO) << "HardwareOptimize Finish";
790 }
791 
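// BuildOpImpl: build the single-op kernel graph identified by graph_info, or return the cached one; the graph is
// cached only when the PyNative op-graph cache is enabled.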
792 KernelGraphPtr AscendSession::BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
793                                           const std::vector<tensor::TensorPtr> &input_tensors,
794                                           const std::vector<int64_t> &tensors_mask) {
795   auto it = run_op_graphs_.find(graph_info);
796   if (it != run_op_graphs_.end()) {
797     return it->second;
798   }
799 
800   const auto &graph = PreBuildOp(op_run_info, input_tensors, tensors_mask);
801   MS_EXCEPTION_IF_NULL(graph);
802   // init runtime resource
803   InitRuntimeResource();
804   // build kernel
805   RunOpAdjustKernel(graph);
806   BuildKernel(graph);
807   auto enable_op_graph_cache = MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_OP_GRAPH_CACHE);
808   if (enable_op_graph_cache) {
809     run_op_graphs_[graph_info] = graph;
810   }
811   return graph;
812 }
813 
814 void AscendSession::BindAddressToTensor(
815   const std::map<tensor::TensorPtr, session::KernelWithIndex> &tensor_to_node) const {
816   auto ms_context = MsContext::GetInstance();
817   MS_EXCEPTION_IF_NULL(ms_context);
818   for (const auto &item : tensor_to_node) {
819     auto &tensor = item.first;
820     auto &node = item.second.first;
821     auto &output_index = item.second.second;
822     DeviceAddressPtr address = nullptr;
823     if (ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER)) {
824       address = AnfAlgo::GetMutableOutputAddr(node, output_index, false);
825     } else {
826       address = AnfAlgo::GetMutableOutputAddr(node, output_index);
827     }
828     MS_EXCEPTION_IF_NULL(tensor);
829     tensor->set_device_address(address);
830   }
831 }
832 
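// LaunchFunc: run a prebuilt single-op graph. Wait for pending device data, allocate op memory, bind the output
// device addresses to the output tensors, load the inputs and execute, then release the op memory.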
833 void AscendSession::LaunchFunc(const KernelGraphPtr &graph,
834                                const std::map<tensor::TensorPtr, session::KernelWithIndex> &tensor_to_node,
835                                bool is_dynamic_shape, const std::vector<tensor::TensorPtr> &input_tensors) {
836   MS_EXCEPTION_IF_NULL(graph);
837   // Wait for AllReduce
838   for (auto &tensor : input_tensors) {
839     if (tensor->NeedWaitDevice()) {
840       tensor->WaitDevice();
841     }
842   }
843 
844   RunOpRemoveNopNode(graph);
845   RunOpMemoryAllocNew(input_tensors, tensor_to_node, *graph);
846   AnfAlgo::CacheAddrForGraph(graph);
847   // Bind Device Ptr to DeviceAddress of Tensor
848   BindAddressToTensor(tensor_to_node);
849   RunOpGenKernelEvent(graph.get());
850 
851   if (is_dynamic_shape) {
852     BuildDynamicKernel(graph);
853   }
854 
855   LoadInputData(graph, input_tensors);
856   Execute(graph, false);
857   RunOpMemoryClear(graph.get());
858 }
859 
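// BatchBuildKernel: compile the kernels collected from queued build tasks. All kernels are built once, then
// KernelBuildPreprocess runs per graph and the kernels it inserts (e.g. AtomicClean) are built in a second pass.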
860 void AscendSession::BatchBuildKernel(const std::vector<std::shared_ptr<SessionTask>> &build_tasks) {
861   std::vector<CNodePtr> node_to_build;
862   std::vector<KernelGraphPtr> graphs;
863 
864   // Hide Nop Node && Collect nodes to build.
865   for (const auto &task : build_tasks) {
866     MS_EXCEPTION_IF_NULL(task);
867     const auto &context = task->context();
868     MS_EXCEPTION_IF_NULL(context);
869     const auto &graph = context->graph();
870     MS_EXCEPTION_IF_NULL(graph);
871 
872     RunOpHideNopNode(graph);
873 
874     const auto &nodes = graph->execution_order();
875     std::copy(nodes.begin(), nodes.end(), std::back_inserter(node_to_build));
876     graphs.push_back(graph);
877   }
878 
879   // Build first time.
880   BuildKernel(node_to_build);
881 
882   std::vector<CNodePtr> atomic_node_to_build;
883   for (auto &graph : graphs) {
884     device::ascend::KernelBuildPreprocess(graph.get());
885     const auto &nodes = graph->execution_order();
886     std::copy(nodes.begin(), nodes.end(), std::back_inserter(atomic_node_to_build));
887   }
888   // Build AtomicClean.
889   BuildKernel(atomic_node_to_build);
890 }
891 
892 void AscendSession::PrepareForOutputTensor(const KernelGraphPtr &graph,
893                                            const std::vector<tensor::TensorPtr> &input_tensors,
894                                            std::map<tensor::TensorPtr, session::KernelWithIndex> *tensor_to_node,
895                                            VectorRef *outputs) const {
896   // Create DeviceAddress for output tensors (containing shape, format and dtype)
897   auto runtime_instance = device::KernelRuntimeManager::Instance().GetCurrentKernelRuntime();
898   runtime_instance->RunOpMallocPre(*graph, input_tensors);
899   runtime_instance->UpdateRefNodeOutputMem(*graph);
900   // CREATE OUTPUT TENSOR ADDRESS
901   UpdateOutputs(graph, outputs, input_tensors, tensor_to_node);
902 }
903 
904 void StoreCNodePrimitive(const KernelGraphPtr &graph) {
905   const auto &nodes = graph->execution_order();
906   for (auto &node : nodes) {
907     auto primitive = AnfAlgo::GetCNodePrimitive(node);
908     MS_EXCEPTION_IF_NULL(primitive);
909     auto new_primitive = std::make_shared<Primitive>(*primitive);
910     node->set_input(kAnfPrimitiveIndex, NewValueNode(new_primitive));
911   }
912 }
913 
914 KernelGraphPtr AscendSession::CreateKernelGraph(const GraphInfo &graph_info, OpRunInfo *op_run_info,
915                                                 std::vector<tensor::TensorPtr> *input_tensors,
916                                                 const std::vector<int64_t> &tensors_mask, bool cache_miss) {
917   auto &task_manager = PynativeTaskManager::GetInstance();
918   KernelGraphPtr graph = nullptr;
919   if (cache_miss) {
920     graph = PreBuildOp(*op_run_info, *input_tensors, tensors_mask);
921     MS_EXCEPTION_IF_NULL(graph);
922     InitRuntimeResource();
923     run_op_graphs_[graph_info] = graph;
924   } else {
925     if (!task_manager.QueueEmpty()) {
926       graph = PreBuildOp(*op_run_info, *input_tensors, tensors_mask);
927       InitRuntimeResource();
928     } else {
929       graph = run_op_graphs_[graph_info];
930     }
931   }
932   return graph;
933 }
934 
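// Lazy build is disabled when the op is not marked lazy, the context runs in graph mode, the op has a dynamic
// shape, or PyNative synchronous execution is requested.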
935 bool AscendSession::DisableLazyBuild(const OpRunInfo &op_run_info) {
936   auto ms_context = MsContext::GetInstance();
937   MS_EXCEPTION_IF_NULL(ms_context);
938   return !op_run_info.lazy_build || ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kGraphMode ||
939          op_run_info.is_dynamic_shape || ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_SYNCHRONIZE);
940 }
941 
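// RunOpImpl with lazy build: on a cache hit with an empty task queue the graph is launched immediately; otherwise
// build and launch tasks are queued and flushed once the queue is full.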
942 void AscendSession::RunOpImpl(const GraphInfo &graph_info, OpRunInfo *op_run_info,
943                               std::vector<tensor::TensorPtr> *input_tensors, VectorRef *outputs,
944                               const std::vector<int64_t> &tensors_mask) {
945   MS_EXCEPTION_IF_NULL(op_run_info);
946   if (DisableLazyBuild(*op_run_info)) {
947     session::PynativeTaskManager::GetInstance().ExecuteRemainingTasks();
948     RunOpImplOrigin(graph_info, op_run_info, input_tensors, outputs, tensors_mask);
949     return;
950   }
951 
952   MS_EXCEPTION_IF_NULL(input_tensors);
953   bool cache_miss = run_op_graphs_.find(graph_info) == run_op_graphs_.end();
954   auto graph = CreateKernelGraph(graph_info, op_run_info, input_tensors, tensors_mask, cache_miss);
955   EraseValueNodeTensor(tensors_mask, input_tensors);
956   MS_EXCEPTION_IF_NULL(graph);
957   std::map<tensor::TensorPtr, session::KernelWithIndex> tensor_to_node;
958   PrepareForOutputTensor(graph, *input_tensors, &tensor_to_node, outputs);
959 
960   auto &task_manager = PynativeTaskManager::GetInstance();
961   if (!cache_miss && task_manager.QueueEmpty()) {
962     // Cache hit and there are no tasks in the queue. Just launch immediately.
963     LaunchFunc(graph, tensor_to_node, op_run_info->is_dynamic_shape, *input_tensors);
964   } else {
965     auto run_op_context = std::make_shared<RunOpContext>(graph_info, op_run_info->is_dynamic_shape, graph, tensors_mask,
966                                                          *input_tensors, tensor_to_node);
967     task_manager.PushLaunchTask(std::make_shared<LaunchTask>(run_op_context));
968 
969     if (cache_miss || !task_manager.QueueEmpty()) {
970       // Copy Primitive. The attributes of Primitive will be modified.
971       StoreCNodePrimitive(graph);
972       task_manager.PushBuildTask(std::make_shared<BuildTask>(run_op_context));
973     }
974   }
975 
976   if (!task_manager.inited()) {
977     task_manager.Init([this]() { ExecuteAllTaskInQueue(); });
978   }
979 
980   if (task_manager.QueueFull()) {
981     task_manager.ExecuteRemainingTasks();
982   }
983 }
984 
985 void AscendSession::RunOpImplOrigin(const GraphInfo &graph_info, OpRunInfo *op_run_info,
986                                     std::vector<tensor::TensorPtr> *input_tensors, VectorRef *outputs,
987                                     const std::vector<int64_t> &tensors_mask) {
988   MS_EXCEPTION_IF_NULL(input_tensors);
989   MS_EXCEPTION_IF_NULL(op_run_info);
990   const auto &graph = BuildOpImpl(*op_run_info, graph_info, *input_tensors, tensors_mask);
991 
992   EraseValueNodeTensor(tensors_mask, input_tensors);
993 
994   // wait for allreduce
995   for (auto &tensor : *input_tensors) {
996     if (tensor->NeedWaitDevice()) {
997       tensor->WaitDevice();
998     }
999   }
1000   // malloc mem
1001   RunOpRemoveNopNode(graph);
1002   RunOpMemoryAlloc(*input_tensors, graph.get());
1003   RunOpGenKernelEvent(graph.get());
1004   AnfAlgo::CacheAddrForGraph(graph);
1005   // Build dynamic kernel
1006   if (op_run_info->is_dynamic_shape) {
1007     BuildDynamicKernel(graph);
1008   }
1009   // load input data to device
1010   LoadInputData(graph, *input_tensors);
1011   // run op
1012   Execute(graph, false);
1013   // get output
1014   std::map<tensor::TensorPtr, session::KernelWithIndex> tensor_to_node;
1015   UpdateOutputs(graph, outputs, *input_tensors, &tensor_to_node);
1016   // update output abstract of dynamic op to op_run_info
1017   if (op_run_info->is_dynamic_shape) {
1018     UpdateOutputAbstract(graph, op_run_info);
1019   }
1020   RunOpMemoryClear(graph.get());
1021 }
1022 
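// PreBuildOp: construct the single-op graph, run IR fusion and kernel selection, apply hardware optimization and
// cache per-node output format/dtype/size information for later launches.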
1023 KernelGraphPtr AscendSession::PreBuildOp(const OpRunInfo &op_run_info,
1024                                          const std::vector<tensor::TensorPtr> &input_tensors,
1025                                          const std::vector<int64_t> &tensors_mask) {
1026   // Construct graph include one op
1027   auto graph = ConstructSingleOpGraph(op_run_info, input_tensors, tensors_mask, true);
1028   MS_EXCEPTION_IF_NULL(graph);
1029   opt::RunOpAscendBackendIRFusionOptimization(graph);
1030   SelectKernel(*graph);
1031   RunOpHardwareOptimize(graph);
1032   CacheCNodeOutputInfo(*graph);
1033   return graph;
1034 }
1035 
1036 void AscendSession::CacheCNodeOutputInfo(const KernelGraph &graph) const {
1037   auto &nodes = graph.execution_order();
1038   for (auto const &node : nodes) {
1039     std::vector<std::string> formats;
1040     std::vector<TypeId> types;
1041     std::vector<size_t> tensor_sizes;
1042     auto output_num = AnfAlgo::GetOutputTensorNum(node);
1043     for (size_t i = 0; i < output_num; ++i) {
1044       std::string output_format = AnfAlgo::GetOutputFormat(node, i);
1045       auto output_type = AnfAlgo::GetOutputDeviceDataType(node, i);
1046       auto tensor_size = AnfAlgo::GetOutputTensorMemSize(node, i);
1047       formats.emplace_back(output_format);
1048       types.emplace_back(output_type);
1049       tensor_sizes.emplace_back(tensor_size);
1050     }
1051     MS_EXCEPTION_IF_NULL(node);
1052     node->set_user_data<OpRuntimeInfo>(std::make_shared<OpRuntimeInfo>(formats, types, tensor_sizes));
1053   }
1054 
1055   auto &inputs = graph.inputs();
1056   for (const auto &input : inputs) {
1057     MS_EXCEPTION_IF_NULL(input);
1058     if (!input->isa<Parameter>()) {
1059       continue;
1060     }
1061     std::vector<std::string> formats;
1062     std::vector<TypeId> types;
1063     std::vector<size_t> tensor_sizes;
1064     auto output_size = AnfAlgo::GetOutputTensorNum(input);
1065     for (size_t index = 0; index < output_size; index++) {
1066       auto format = AnfAlgo::GetOutputFormat(input, index);
1067       auto type_id = AnfAlgo::GetOutputDeviceDataType(input, index);
1068       if (type_id == kTypeUnknown) {
1069         type_id = AnfAlgo::GetOutputInferDataType(input, index);
1070       }
1071       auto tensor_size = AnfAlgo::GetOutputTensorMemSize(input, index);
1072       formats.emplace_back(format);
1073       types.emplace_back(type_id);
1074       tensor_sizes.emplace_back(tensor_size);
1075     }
1076     input->set_user_data<OpRuntimeInfo>(std::make_shared<OpRuntimeInfo>(formats, types, tensor_sizes));
1077   }
1078 }
1079 
1080 void AscendSession::GetOpInputStubTensors(const CNodePtr &cnode, const std::map<AnfNodePtr, size_t> &parameter_index,
1081                                           const std::vector<tensor::TensorPtr> &graph_inputs,
1082                                           const std::map<KernelWithIndex, OutputTensorInfo> &node_output_info,
1083                                           InputTensorInfo *input_tensor_info) {
1084   MS_EXCEPTION_IF_NULL(cnode);
1085   MS_EXCEPTION_IF_NULL(input_tensor_info);
1086   const auto input_tensor_num = AnfAlgo::GetInputTensorNum(cnode);
1087   for (size_t i = 1; i <= input_tensor_num; i += 1) {
1088     const auto &input = cnode->input(i);
1089     auto kernel_with_index = AnfAlgo::VisitKernel(input, 0);
1090     auto real_input = kernel_with_index.first;
1091     MS_EXCEPTION_IF_NULL(real_input);
1092     tensor::TensorPtr tensor = nullptr;
1093     if (real_input->isa<ValueNode>()) {
1094       tensor = GetValueNodeOutputTensor(real_input, kernel_with_index.second);
1095       input_tensor_info->input_tensors_mask.emplace_back(kParameterDataTensorMask);
1096     } else if (real_input->isa<Parameter>()) {
1097       tensor = GetParameterOutputTensor(real_input, parameter_index, graph_inputs);
1098       auto parameter = real_input->cast<ParameterPtr>();
1099       MS_EXCEPTION_IF_NULL(parameter);
1100       input_tensor_info->input_tensors_mask.emplace_back(parameter->has_default() ? kParameterWeightTensorMask
1101                                                                                   : kParameterDataTensorMask);
1102     } else if (real_input->isa<CNode>()) {
1103       bool output_is_weight = false;
1104       tensor = GetCNodeOutputStubTensor(kernel_with_index, node_output_info, &output_is_weight);
1105       input_tensor_info->input_tensors_mask.emplace_back(output_is_weight ? kParameterWeightTensorMask
1106                                                                           : kParameterDataTensorMask);
1107     } else {
1108       MS_LOG(EXCEPTION) << "Invalid input node, node = " << real_input->DebugString();
1109     }
1110     MS_EXCEPTION_IF_NULL(tensor);
1111     MS_LOG(DEBUG) << "Get" << i << "th input tensor of " << cnode->fullname_with_scope() << " from "
1112                   << real_input->fullname_with_scope() << "-" << kernel_with_index.second;
1113     input_tensor_info->input_tensors.emplace_back(tensor);
1114   }
1115 }
1116 
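// BuildOpsInGraph: pre-compile every op of a graph for PyNative execution. Stub input/output tensors are generated
// so that the single-op graphs can be built before real data is available; the graphs are cached by graph info.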
1117 void AscendSession::BuildOpsInGraph(const GraphId &graph_id, const std::map<AnfNodePtr, size_t> &parameter_index,
1118                                     const std::vector<tensor::TensorPtr> &graph_inputs,
1119                                     const std::map<KernelWithIndex, size_t> &cnode_refcount) {
1120   if (built_graph_id_.find(graph_id) != built_graph_id_.end()) {
1121     return;
1122   }
1123   auto graph = GetGraph(graph_id);
1124   MS_EXCEPTION_IF_NULL(graph);
1125   std::map<KernelWithIndex, OutputTensorInfo> op_output_info;
1126   std::vector<CNodePtr> kernels;
1127   std::unordered_map<KernelGraphPtr, GraphInfo> single_op_graphs;
1128   // Collect kernels that need to be built in single op graphs
1129   for (const auto &kernel : graph->execution_order()) {
1130     // Generate fake input tensors, tensor masks and input kernel with index
1131     InputTensorInfo input_tensor_info;
1132     GetOpInputStubTensors(kernel, parameter_index, graph_inputs, op_output_info, &input_tensor_info);
1133     // Get OpRunInfo and GraphInfo
1134     OpRunInfo op_run_info;
1135     GetSingleOpRunInfo(kernel, &op_run_info);
1136     if (op_run_info.is_dynamic_shape) {
1137       MS_LOG(INFO) << "BuildOpsInGraph stop, op " << op_run_info.op_name << " is dynamic shape.";
1138       break;
1139     }
1140     const GraphInfo &graph_info = GetSingleOpGraphInfo(kernel, input_tensor_info.input_tensors);
1141     const auto &single_op_graph_iter = run_op_graphs_.find(graph_info);
1142     if (single_op_graph_iter != run_op_graphs_.end()) {
1143       // If a graph of the same single op already exists, only the output stub tensor of the current op needs to be generated
1144       GenOpOutputStubTensor(single_op_graph_iter->second, kernel, cnode_refcount, &op_output_info);
1145       continue;
1146     }
1147     const auto &single_op_graph =
1148       PreBuildOp(op_run_info, input_tensor_info.input_tensors, input_tensor_info.input_tensors_mask);
1149     MS_EXCEPTION_IF_NULL(single_op_graph);
1150     GenOpOutputStubTensor(single_op_graph, kernel, cnode_refcount, &op_output_info);
1151     opt::HideNopNode(single_op_graph.get());
1152     // The graph info could have been changed in PreBuildOp
1153     const GraphInfo &new_graph_info = GetSingleOpGraphInfo(kernel, input_tensor_info.input_tensors);
1154     single_op_graphs.emplace(single_op_graph, new_graph_info);
1155     const auto &execution_order = single_op_graph->execution_order();
1156     std::copy(execution_order.begin(), execution_order.end(), std::back_inserter(kernels));
1157   }
1158   InitRuntimeResource();
1159   // Compile all kernels in parallel
1160   BuildKernel(kernels);
1161   // Some new kernels may be added by KernelBuildPreprocess, so collect and build the kernels again
1162   kernels.clear();
1163   for (const auto &graph_item : single_op_graphs) {
1164     device::ascend::KernelBuildPreprocess(graph_item.first.get());
1165     const auto &execution_order = graph_item.first->execution_order();
1166     std::copy(execution_order.begin(), execution_order.end(), std::back_inserter(kernels));
1167   }
1168   BuildKernel(kernels);
1169   // Record single op graphs in run_op_graphs_ so that these graphs can be reused in BuildOpImpl
1170   for (const auto &graph_item : single_op_graphs) {
1171     RunOpMemoryClear(graph_item.first.get());
1172     auto enable_op_graph_cache = MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_OP_GRAPH_CACHE);
1173     if (enable_op_graph_cache) {
1174       run_op_graphs_[graph_item.second] = graph_item.first;
1175     }
1176     MS_LOG(DEBUG) << "Pre build op finished, graph info: " << graph_item.second;
1177   }
1178   built_graph_id_.insert(graph_id);
1179 }
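// Descriptive note on the pre-build flow above: for every kernel in the graph's execution order, stub input
// tensors are generated, the single-op graph is looked up by its GraphInfo key and pre-built only when
// missing, and output stub tensors are recorded so later kernels can reference them. Kernels are compiled
// twice -- once after collection and once more after KernelBuildPreprocess, which may add new kernels -- and
// the graphs are cached in run_op_graphs_ (reused by BuildOpImpl) only when
// MS_CTX_ENABLE_PYNATIVE_OP_GRAPH_CACHE is enabled.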
1180 
1181 // compile graph steps
1182 void AscendSession::SelectKernel(const KernelGraph &kernel_graph) const {
1183   MS_LOG(INFO) << "Start!";
1184   size_t raise_precision_count = 0;
1185   size_t reduce_precision_count = 0;
1186   for (const auto &cnode : kernel_graph.execution_order()) {
1187     auto status = device::ascend::SelectKernelInfo(cnode);
1188     AnfAlgo::EraseNodeAttr(kAttrPynativeNextOpName, cnode);
1189     AnfAlgo::EraseNodeAttr(kAttrPynativeNextIndex, cnode);
1190     if (status == device::ascend::kStatusRaisePrecision) {
1191       raise_precision_count++;
1192     } else if (status == device::ascend::kStatusReducePrecision) {
1193       reduce_precision_count++;
1194     }
1195     MS_LOG(INFO) << "Select ApplyKernel: " << cnode->DebugString();
1196   }
1197   auto ms_context = MsContext::GetInstance();
1198   MS_EXCEPTION_IF_NULL(ms_context);
1199   if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kGraphMode) {
1200     if (raise_precision_count > 0) {
1201       MS_LOG(WARNING) << "There are " << raise_precision_count
1202                       << " node/nodes that used raise precision to select the kernel!";
1203     }
1204     if (reduce_precision_count > 0) {
1205       MS_LOG(WARNING) << "There are " << reduce_precision_count
1206                       << " node/nodes that used reduce precision to select the kernel!";
1207     }
1208   }
1209   MS_LOG(INFO) << "Finish!";
1210 }
1211 
1212 #ifndef ENABLE_SECURITY
1213 void DumpInit(uint32_t device_id) {
1214   auto &json_parser = DumpJsonParser::GetInstance();
1215   json_parser.Parse();
1216   json_parser.CopyDumpJsonToDir(device_id);
1217   json_parser.CopyHcclJsonToDir(device_id);
1218   json_parser.CopyMSCfgJsonToDir(device_id);
1219   if (json_parser.async_dump_enabled()) {
1220     if (AdxDataDumpServerInit() != 0) {
1221       MS_LOG(EXCEPTION) << "Adx data dump server init failed";
1222     }
1223   }
1224 }
1225 #endif
1226 
1227 void AscendSession::InitRuntimeResource() {
1228   MS_LOG(INFO) << "Start!";
1229   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
1230   MS_EXCEPTION_IF_NULL(runtime_instance);
1231   if (!runtime_instance->Init()) {
1232     MS_LOG(EXCEPTION) << "Kernel runtime init error.";
1233   }
1234   auto ms_context = MsContext::GetInstance();
1235   MS_EXCEPTION_IF_NULL(ms_context);
1236   auto env_rank_id = common::GetEnv("RANK_ID");
1237   if (ms_context->get_param<bool>(MS_CTX_ENABLE_HCCL) && !env_rank_id.empty()) {
1238     // get actual rank id if it's distribution training case.
1239     rank_id_ = GetRankId();
1240   }
1241 #ifndef ENABLE_SECURITY
1242   DumpInit(rank_id_);
1243 #endif
1244   MS_LOG(INFO) << "Finish!";
1245 }
1246 
1247 void AscendSession::HardwareOptimize(const std::shared_ptr<KernelGraph> &kernel_graph) const {
1248   MS_LOG(INFO) << "HardwareOptimize start!";
1249   opt::AscendBackendOptimization(kernel_graph);
1250   FinalOptimize(kernel_graph);
1251   GraphKernelOptimize(kernel_graph);
1252   MS_EXCEPTION_IF_NULL(kernel_graph);
1253   kernel_graph->SetExecOrderByDefault();
1254   MS_LOG(INFO) << "HardwareOptimize Finish!";
1255 }
1256 
1257 void AscendSession::GraphKernelOptimize(const std::shared_ptr<KernelGraph> &kernel_graph) const {
1258   if (!context::GraphKernelFlags::GetInstance().IsEnableGraphKernel()) {
1259     return;
1260   }
1261   opt::GraphKernelOptimize(kernel_graph);
1262   kernel_graph->SetExecOrderByDefault();
1263 }
1264 
1265 void AscendSession::AdjustKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const {
1266   MS_LOG(INFO) << "Start!";
1267   opt::HideNopNode(kernel_graph.get());
1268   auto execution_order = kernel_graph->execution_order();
1269   AnfAlgo::ReorderExecList(NOT_NULL(&execution_order));
1270   kernel_graph->set_execution_order(execution_order);
1271   // Insert ClearZero op
1272   // Prepare for the next step: get atomic info from json
1273   BuildKernel(kernel_graph);
1274   device::ascend::KernelBuildPreprocess(kernel_graph.get());
1275   device::KernelAdjust::GetInstance().InsertSwitchLoop(kernel_graph);
1276 #ifdef ENABLE_DUMP_IR
1277   auto context_ptr = MsContext::GetInstance();
1278   MS_EXCEPTION_IF_NULL(context_ptr);
1279   bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
1280   if (save_graphs) {
1281     DumpIR("after_adjust_kernel.ir", kernel_graph);
1282   }
1283 #endif
1284   MS_LOG(INFO) << "Finish!";
1285 }
1286 
1287 void AscendSession::RunOpAdjustKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const {
1288   MS_LOG(INFO) << "Start!";
1289   RunOpHideNopNode(kernel_graph);
1290   // Insert ClearZero op
1291   // Prepare for the next step: get atomic info from json
1292   BuildKernel(kernel_graph);
1293   device::ascend::KernelBuildPreprocess(kernel_graph.get());
1294   MS_LOG(INFO) << "Finish!";
1295 }
1296 
1297 void AscendSession::AssignStream(NotNull<KernelGraphPtr> kernel_graph) const {
1298   MS_LOG(INFO) << "Start!";
1299   device::ascend::AscendStreamAssign::GetInstance().AssignStream(kernel_graph);
1300   MS_LOG(INFO) << "Finish!";
1301 }
1302 
1303 void AscendSession::BuildKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const {
1304   BuildKernel(kernel_graph->execution_order());
1305 }
1306 
1307 void AscendSession::BuildKernel(const std::vector<CNodePtr> &kernels) {
1308   MS_LOG(INFO) << "Start!";
1309   struct timeval start_time, end_time;
1310   (void)gettimeofday(&start_time, nullptr);
1311   auto ret = device::ascend::KernelBuild(kernels);
1312   if (!ret) {
1313     MS_LOG(EXCEPTION) << "Kernel build error.";
1314   }
1315   (void)gettimeofday(&end_time, nullptr);
1316   const uint64_t kUSecondInSecond = 1000000;
1317   uint64_t cost = kUSecondInSecond * static_cast<uint64_t>(end_time.tv_sec - start_time.tv_sec);
1318   cost += static_cast<uint64_t>(end_time.tv_usec - start_time.tv_usec);
1319   MS_LOG(INFO) << "KernelBuild run in  " << PRIu64 << " us " << cost;
1320   MS_LOG(INFO) << "Finish!";
1321 }
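// The cost logged above is plain microsecond arithmetic over the two gettimeofday() samples:
//   cost_us = (end.tv_sec - start.tv_sec) * 1000000 + (end.tv_usec - start.tv_usec)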
1322 
1323 void AscendSession::BuildDynamicKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const {
1324   MS_LOG(DEBUG) << "Start!";
1325   MS_EXCEPTION_IF_NULL(kernel_graph);
1326   const auto &kernels = kernel_graph->execution_order();
1327   auto iter = std::find_if(kernels.begin(), kernels.end(), [](const CNodePtr &kernel) {
1328     return AnfAlgo::GetBooleanAttr(kernel, kAttrOutputIsDynamicShape);
1329   });
1330   if (iter == kernels.end()) {
1331     return;
1332   }
1333   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
1334   MS_EXCEPTION_IF_NULL(runtime_instance);
1335   if (!runtime_instance->GenDynamicKernel(*kernel_graph)) {
1336     MS_LOG(DEBUG) << "Graph:" << kernel_graph->graph_id() << " failed to generate dynamic kernel!";
1337   }
1338   MS_LOG(DEBUG) << "Finish!";
1339 }
1340 
1341 static CNodePtr GetNextLabelSet(const std::vector<CNodePtr> &kernel_nodes, uint32_t index) {
1342   size_t node_sizes = kernel_nodes.size();
1343   if (index >= node_sizes - 1) {
1344     MS_LOG(EXCEPTION) << "there is no node after this node:" << kernel_nodes[index]->DebugString();
1345   }
1346   auto kernel = kernel_nodes[index + 1];
1347   if (AnfAlgo::GetCNodeName(kernel) != kLabelSetOpName) {
1348     MS_LOG(EXCEPTION) << "the node is not labelset follow labelgoto/labelswitch, node: "
1349                       << kernel_nodes[index]->DebugString();
1350   }
1351   return kernel;
1352 }
1353 
1354 static std::vector<CNodePtr> HandleRecursiveCall(const std::vector<CNodePtr> &kernel_cnodes, const uint32_t &back_label,
1355                                                  uint32_t *index, std::vector<CNodePtr> *back) {
1356   MS_EXCEPTION_IF_NULL(index);
1357   MS_EXCEPTION_IF_NULL(back);
1358   std::vector<CNodePtr> front;
1359   std::vector<CNodePtr> back_temp;
1360   bool back_flag = false;
1361   uint32_t i = *index;
1362   while (i < kernel_cnodes.size()) {
1363     if (!back_flag) {
1364       front.emplace_back(kernel_cnodes[i]);
1365     } else {
1366       back->emplace_back(kernel_cnodes[i]);
1367     }
1368     if (AnfAlgo::HasNodeAttr(kAttrRecursiveEnd, kernel_cnodes[i])) {
1369       *index = i;
1370       back->insert(back->end(), back_temp.begin(), back_temp.end());
1371       return front;
1372     }
1373     if (AnfAlgo::HasNodeAttr(kAttrRecursive, kernel_cnodes[i])) {
1374       back_flag = true;
1375       if (!AnfAlgo::IsLabelIndexInNode(kernel_cnodes[i], back_label)) {
1376         auto temp = HandleRecursiveCall(kernel_cnodes, back_label, &(++i), &back_temp);
1377         front.insert(front.end(), temp.begin(), temp.end());
1378       }
1379     }
1380     i++;
1381   }
1382   return front;
1383 }
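// HandleRecursiveCall splits the remaining execution order into a "front" part (nodes up to and including the
// node tagged kAttrRecursive) and a "back" part (the nodes that follow it), recursing when the recursive node
// does not target back_label; the caller (UnfoldRecursiveExecOrder below) then appends front followed by back
// so the memory-reuse order matches the order in which the recursive subgraph actually runs.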
1384 
1385 static void UnfoldRecursiveExecOrder(KernelGraph *kernel_graph) {
1386   MS_EXCEPTION_IF_NULL(kernel_graph);
1387   if (!kernel_graph->recursive_call()) {
1388     return;
1389   }
1390   auto kernel_cnodes = kernel_graph->mem_reuse_exec_order();
1391   std::vector<CNodePtr> mem_reuse_order;
1392   mem_reuse_order.reserve(kernel_cnodes.size());
1393   for (uint32_t i = 0; i < kernel_cnodes.size(); i++) {
1394     if (!AnfAlgo::HasNodeAttr(kAttrRecursiveStart, kernel_cnodes[i])) {
1395       mem_reuse_order.emplace_back(kernel_cnodes[i]);
1396       continue;
1397     }
1398     auto label_id = AnfAlgo::GetNodeAttr<uint32_t>(kernel_cnodes[i], kAttrLabelIndex);
1399     std::vector<CNodePtr> back;
1400     auto front = HandleRecursiveCall(kernel_cnodes, label_id, &i, &back);
1401     mem_reuse_order.insert(mem_reuse_order.end(), front.begin(), front.end());
1402     mem_reuse_order.insert(mem_reuse_order.end(), back.begin(), back.end());
1403   }
1404   kernel_graph->set_mem_reuse_exec_order(mem_reuse_order);
1405 }
1406 
1407 static void GetSubGraphExecOrder(const KernelGraph *kernel_graph, uint32_t index, const CNodePtr &back_node,
1408                                  std::vector<CNodePtr> *mem_reuse_order) {
1409   MS_EXCEPTION_IF_NULL(kernel_graph);
1410   MS_EXCEPTION_IF_NULL(mem_reuse_order);
1411   auto label_id = AnfAlgo::GetNodeAttr<uint32_t>(back_node, kAttrLabelIndex);
1412   auto kernel_cnodes = kernel_graph->execution_order();
1413   for (auto i = index; i < kernel_cnodes.size(); i++) {
1414     mem_reuse_order->emplace_back(kernel_cnodes[i]);
1415     if (AnfAlgo::IsLabelIndexInNode(kernel_cnodes[i], label_id)) {
1416       return;
1417     }
1418   }
1419 }
1420 
1421 void InitMemReuseExecOrder(KernelGraph *kernel_graph) {
1422   MS_EXCEPTION_IF_NULL(kernel_graph);
1423   if (!kernel_graph->subgraph_multi_call()) {
1424     return;
1425   }
1426   std::unordered_map<uint32_t, uint32_t> label_id_index_map;
1427   auto kernel_cnodes = kernel_graph->execution_order();
1428   std::vector<CNodePtr> mem_reuse_order;
1429   for (uint32_t i = 0; i < kernel_cnodes.size(); i++) {
1430     mem_reuse_order.emplace_back(kernel_cnodes[i]);
1431     if (AnfAlgo::CheckPrimitiveType(kernel_cnodes[i], prim::kPrimLabelSwitch) &&
1432         !AnfAlgo::HasNodeAttr(kAttrRecursive, kernel_cnodes[i]) &&
1433         !AnfAlgo::HasNodeAttr(kAttrReturn, kernel_cnodes[i])) {
1434       auto label_list = AnfAlgo::GetNodeAttr<std::vector<uint32_t>>(kernel_cnodes[i], kAttrLabelSwitchList);
1435       for (auto label_id : label_list) {
1436         if (label_id_index_map.find(label_id) == label_id_index_map.end()) {
1437           continue;
1438         }
1439         auto back_node = GetNextLabelSet(kernel_cnodes, i);
1440         GetSubGraphExecOrder(kernel_graph, label_id_index_map[label_id], back_node, &mem_reuse_order);
1441       }
1442       continue;
1443     }
1444     if (AnfAlgo::CheckPrimitiveType(kernel_cnodes[i], prim::kPrimLabelGoto) &&
1445         !AnfAlgo::HasNodeAttr(kAttrRecursive, kernel_cnodes[i]) &&
1446         !AnfAlgo::HasNodeAttr(kAttrReturn, kernel_cnodes[i])) {
1447       auto label_id = AnfAlgo::GetNodeAttr<uint32_t>(kernel_cnodes[i], kAttrLabelIndex);
1448       if (label_id_index_map.find(label_id) == label_id_index_map.end()) {
1449         continue;
1450       }
1451       auto back_node = GetNextLabelSet(kernel_cnodes, i);
1452       GetSubGraphExecOrder(kernel_graph, label_id_index_map[label_id], back_node, &mem_reuse_order);
1453       continue;
1454     }
1455     if (AnfAlgo::CheckPrimitiveType(kernel_cnodes[i], prim::kPrimLabelSet) &&
1456         !AnfAlgo::HasNodeAttr(kAttrRecursive, kernel_cnodes[i])) {
1457       auto label_id = AnfAlgo::GetNodeAttr<uint32_t>(kernel_cnodes[i], kAttrLabelIndex);
1458       if (label_id_index_map.find(label_id) != label_id_index_map.end()) {
1459         MS_LOG(EXCEPTION) << "Two labelsets with same label id.";
1460       }
1461       label_id_index_map[label_id] = i;
1462       continue;
1463     }
1464   }
1465   kernel_graph->set_mem_reuse_exec_order(mem_reuse_order);
1466   UnfoldRecursiveExecOrder(kernel_graph);
1467 }
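// Descriptive note on the pass above: LabelSet nodes record their position in label_id_index_map; when a
// later LabelGoto/LabelSwitch (neither recursive nor a return) jumps back to an already-seen label,
// GetSubGraphExecOrder re-appends that subgraph's nodes -- from the recorded position up to the label of the
// following LabelSet -- so memory reuse accounts for every execution of a multiply-called subgraph.
// UnfoldRecursiveExecOrder then rewrites the order once more for recursive calls.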
1468 
1469 void AscendSession::MemoryAlloc(KernelGraph *kernel_graph) const {
1470   MS_LOG(INFO) << "Start!";
1471   MS_EXCEPTION_IF_NULL(kernel_graph);
1472   InitMemReuseExecOrder(kernel_graph);
1473   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
1474   MS_EXCEPTION_IF_NULL(runtime_instance);
1475   runtime_instance->AssignMemory(*kernel_graph);
1476   MS_LOG(INFO) << "Finish!";
1477 }
1478 
1479 void AscendSession::RunOpMemoryAlloc(const std::vector<tensor::TensorPtr> &input_tensors,
1480                                      KernelGraph *kernel_graph) const {
1481   MS_EXCEPTION_IF_NULL(kernel_graph);
1482   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
1483   MS_EXCEPTION_IF_NULL(runtime_instance);
1484   runtime_instance->RunOpAssignMemory(input_tensors, *kernel_graph);
1485 }
1486 
1487 void AscendSession::RunOpMemoryAllocNew(const std::vector<tensor::TensorPtr> &input_tensors,
1488                                         const std::map<tensor::TensorPtr, session::KernelWithIndex> &tensor_to_node,
1489                                         const KernelGraph &kernel_graph) const {
1490   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
1491   MS_EXCEPTION_IF_NULL(runtime_instance);
1492   runtime_instance->RunOpAssignMemory(input_tensors, kernel_graph, tensor_to_node);
1493 }
1494 
1495 void AscendSession::RunOpGenKernelEvent(const KernelGraph *graph) const {
1496   MS_EXCEPTION_IF_NULL(graph);
1497   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
1498   MS_EXCEPTION_IF_NULL(runtime_instance);
1499   runtime_instance->GenKernelEvents(*graph);
1500 }
1501 
1502 void AscendSession::RunOpMemoryClear(const KernelGraph *kernel_graph) const {
1503   MS_EXCEPTION_IF_NULL(kernel_graph);
1504   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
1505   MS_EXCEPTION_IF_NULL(runtime_instance);
1506   runtime_instance->RunOpClearMemory(*kernel_graph);
1507 }
1508 
1509 void AscendSession::Load(const std::shared_ptr<KernelGraph> &kernel_graph) const {
1510   MS_LOG(INFO) << "Start!";
1511   auto context_ptr = MsContext::GetInstance();
1512   MS_EXCEPTION_IF_NULL(context_ptr);
1513   bool is_task_sink = context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK);
1514   (void)device::KernelAdjust::GetInstance().StepLoadCtrlInputs(kernel_graph);
1515   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
1516   MS_EXCEPTION_IF_NULL(runtime_instance);
1517   bool ret_ok = runtime_instance->Load(*kernel_graph, is_task_sink);
1518   if (!ret_ok) {
1519     MS_LOG(EXCEPTION) << "Load task error!";
1520   }
1521   MS_LOG(INFO) << "Finish!";
1522 }
1523 
1524 void AscendSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bool is_task) const {
1525   MS_LOG(DEBUG) << "Start!";
1526   bool is_task_sink = false;
1527   if (is_task) {
1528     auto context_ptr = MsContext::GetInstance();
1529     MS_EXCEPTION_IF_NULL(context_ptr);
1530     is_task_sink = context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK);
1531   }
1532   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
1533   MS_EXCEPTION_IF_NULL(runtime_instance);
1534   if (is_task && is_task_sink) {
1535 #ifndef ENABLE_SECURITY
1536     DumpSetup(kernel_graph);
1537 #endif
1538   }
1539   bool ret_ok = runtime_instance->Run(*kernel_graph, is_task_sink);
1540   if (is_task && is_task_sink) {
1541 #ifndef ENABLE_SECURITY
1542     Dump(kernel_graph);
1543 #endif
1544   }
1545   if (!ret_ok) {
1546 #ifdef ENABLE_DUMP_IR
1547     mindspore::RDR::TriggerAll();
1548 #endif
1549     MS_LOG(EXCEPTION) << "run task error!";
1550   }
1551   MS_LOG(DEBUG) << "Finish!";
1552 }
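// Descriptive note on Execute: with task sink enabled (is_task && MS_CTX_ENABLE_TASK_SINK), the e2e dump
// hooks wrap the run -- DumpSetup before KernelRuntime::Run and Dump after it -- and both are compiled out
// when ENABLE_SECURITY is defined. On failure, RDR::TriggerAll flushes the running-data recorders (only with
// ENABLE_DUMP_IR) before the exception is raised.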
1553 
1554 #ifndef ENABLE_SECURITY
1555 void AscendSession::DumpSetup(const std::shared_ptr<KernelGraph> &kernel_graph) const {
1556   MS_LOG(DEBUG) << "Start!";
1557   MS_EXCEPTION_IF_NULL(kernel_graph);
1558   E2eDump::DumpSetup(kernel_graph.get());
1559   MS_LOG(DEBUG) << "Finish!";
1560 }
1561 
1562 void AscendSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
1563   MS_LOG(DEBUG) << "Start!";
1564   MS_EXCEPTION_IF_NULL(kernel_graph);
1565   E2eDump::DumpData(kernel_graph.get(), rank_id_);
1566   MS_LOG(DEBUG) << "Finish!";
1567 }
1568 #endif
1569 
1570 void AscendSession::DumpAllGraphs(const std::vector<KernelGraphPtr> &all_graphs) {
1571 #ifdef ENABLE_DUMP_IR
1572   auto context_ptr = MsContext::GetInstance();
1573   MS_EXCEPTION_IF_NULL(context_ptr);
1574   bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
1575   auto &json_parser = DumpJsonParser::GetInstance();
1576   json_parser.Parse();
1577   if (!save_graphs && !json_parser.e2e_dump_enabled() && !json_parser.async_dump_enabled() &&
1578       !mindspore::RecorderManager::Instance().RdrEnable()) {
1579     return;
1580   }
1581   auto kernel_runtime = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
1582   MS_EXCEPTION_IF_NULL(kernel_runtime);
1583   for (auto &graph : all_graphs) {
1584     MS_EXCEPTION_IF_NULL(graph);
1585     std::string name = "graph_build." + std::to_string(graph->graph_id());
1586     DumpGraphParams dump_params = {true, static_cast<int>(kWholeStack)};
1587     (void)mindspore::RDR::RecordAnfGraph(SUBMODULE_ID, name, graph, dump_params, ".ir;.pb");
1588     if (save_graphs) {
1589       std::string file_name = "graph_build_" + std::to_string(graph->graph_id()) + ".ir";
1590       DumpIR(file_name, graph, true, kWholeStack);
1591       DumpIRProto(graph, "vm_build_" + std::to_string(graph->graph_id()));
1592       DumpIR("trace_code_graph", graph, true, kWholeStack);
1593     }
1594     std::string final_graph = "trace_code_graph_" + std::to_string(graph->graph_id());
1595     if (json_parser.e2e_dump_enabled() || json_parser.async_dump_enabled()) {
1596       std::string root_dir = json_parser.path() + "/rank_" + std::to_string(rank_id_);
1597       std::string target_dir = root_dir + "/graphs";
1598       std::string ir_file_path = target_dir + "/" + "ms_output_" + final_graph + ".ir";
1599       DumpIRProtoWithSrcInfo(graph, final_graph, target_dir, kDebugWholeStack);
1600       DumpIR("trace_code_graph", graph, true, kWholeStack, ir_file_path);
1601       DumpGraphExeOrder("ms_execution_order_graph_" + std::to_string(graph->graph_id()) + ".csv", root_dir,
1602                         graph->execution_order());
1603     }
1604   }
1605 #endif
1606 }
1607 
1608 void AscendSession::LoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const {
1609   MS_LOG(INFO) << "Start!";
1610   MS_EXCEPTION_IF_NULL(kernel_graph);
1611   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
1612   MS_EXCEPTION_IF_NULL(runtime_instance);
1613   (void)runtime_instance->LoadData(*kernel_graph);
1614   MS_LOG(INFO) << "Finish!";
1615 }
1616 
1617 #ifndef ENABLE_SECURITY
1618 void AscendSession::RecurseSetSummaryNodes(KernelGraph *graph,
1619                                            std::map<std::string, std::pair<AnfNodePtr, int>> *summary) {
1620   MS_EXCEPTION_IF_NULL(graph);
1621   MS_EXCEPTION_IF_NULL(summary);
1622   // If the final graph has no child graph
1623   auto graph_order_iter = graph_execute_orders_.find(graph->graph_id());
1624   if (graph_order_iter == graph_execute_orders_.end()) {
1625     SessionBasic::SetSummaryNodes(graph);
1626     auto summary_nodes = graph->summary_nodes();
1627     summary->insert(summary_nodes.begin(), summary_nodes.end());
1628     return;
1629   }
1630   // for every child graph, find summary nodes
1631   auto graph_order = GetGraphOrder(graph->graph_id());
1632   for (size_t i = 0; i < graph_order.size(); i++) {
1633     auto child_graph = GetGraph(graph_order[i]);
1634     if (child_graph == nullptr) {
1635       continue;
1636     }
1637     SessionBasic::SetSummaryNodes(child_graph.get());
1638     auto child_graph_summary = child_graph->summary_nodes();
1639     summary->insert(child_graph_summary.begin(), child_graph_summary.end());
1640     RecurseSetSummaryNodes(child_graph.get(), summary);
1641   }
1642   graph->set_summary_nodes(*summary);
1643 }
1644 
1645 void AscendSession::SetSummaryNodes(KernelGraph *graph) {
1646   MS_LOG(DEBUG) << "Update summary Start";
1647   MS_EXCEPTION_IF_NULL(graph);
1648   auto summary_nodes = graph->summary_nodes();
1649   std::map<std::string, std::pair<AnfNodePtr, int>> summary;
1650   summary.insert(summary_nodes.begin(), summary_nodes.end());
1651   RecurseSetSummaryNodes(graph, &summary);
1652   graph->set_summary_nodes(summary);
1653   MS_LOG(DEBUG) << "Update summary end size: " << summary.size();
1654 }
1655 #endif
1656 
1657 void AscendSession::MergeGraphExecOrder() {
1658   MS_LOG(INFO) << "Start!";
1659   // merge graph order
1660   auto &graph_order = GetGraphOrder(final_graph_id_);
1661   auto &graph_type = GetGraphOrderType(final_graph_id_);
1662   auto final_graph = GetGraph(final_graph_id_);
1663   MS_EXCEPTION_IF_NULL(final_graph);
1664   if (graph_order.empty()) {
1665     MS_LOG(WARNING) << "Graph output is a lonely variable not linked to any op!";
1666     return;
1667   }
1668   if (graph_order.size() > 1) {
1669     auto context_ptr = MsContext::GetInstance();
1670     MS_EXCEPTION_IF_NULL(context_ptr);
1671     if (!context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK)) {
1672       MS_LOG(EXCEPTION) << "Control sink network should run with task-sink mode!";
1673     }
1674   }
1675   // If the first graph is common, the final graph has no label; set the stream of the final graph to match the first graph
1676   SetStreamDistinctionLabel(final_graph, graph_order[0], false);
1677   std::vector<CNodePtr> final_exec_order = final_graph->execution_order();
1678   KernelGraphPtr last_graph = nullptr;
1679   for (size_t i = 0; i < graph_order.size(); i++) {
1680     auto graph_id = graph_order[i];
1681     if (graph_type[i] == BRANCH_END || graph_type[i] == BRANCH_START) {
1682       continue;
1683     }
1684     auto child_graph = GetGraph(graph_id);
1685     last_graph = child_graph;
1686     MS_EXCEPTION_IF_NULL(child_graph);
1687     auto exec_order = child_graph->execution_order();
1688     MS_LOG(INFO) << "Merge graph,graph_id " << graph_id;
1689     (void)std::transform(exec_order.begin(), exec_order.end(), std::back_inserter(final_exec_order),
1690                          [&](CNodePtr node) -> CNodePtr {
1691                            AnfAlgo::SetStreamDistinctionLabel(child_graph->stream_distinction_label(), node.get());
1692                            return node;
1693                          });
1694     // add all value nodes of child graphs to final graph
1695     for (auto &value_node : child_graph->graph_value_nodes()) {
1696       final_graph->AddValueNodeToGraph(value_node);
1697     }
1698     // copy ref map to final graph
1699     auto child_ref_map = child_graph->GetRefMap();
1700     for (auto &item : child_ref_map) {
1701       if (final_graph->IsInRefOutputMap(item.first)) {
1702         MS_LOG(EXCEPTION) << "The ref pair is already in final graph!";
1703       }
1704       final_graph->AddRefCorrespondPairs(item.first, item.second);
1705     }
1706   }
1707   // set final_exec_order into final graph
1708   MS_EXCEPTION_IF_NULL(final_graph);
1709 #ifndef ENABLE_SECURITY
1710   DumpGraphExeOrder(final_exec_order);
1711 #endif
1712   final_graph->set_execution_order(final_exec_order);
1713 }
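// Descriptive note on MergeGraphExecOrder: BRANCH_START/BRANCH_END entries of the graph order are skipped,
// every merged node inherits its child graph's stream distinction label, and the child graphs' value nodes
// and ref-output pairs are folded into the final graph; a duplicated ref pair is an error here, whereas
// UpdateRefOutputMap below only warns about it.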
1714 
1715 const std::vector<GraphId> &AscendSession::GetGraphOrder(GraphId final_graph_id) const {
1716   auto graph_order_iter = graph_execute_orders_.find(final_graph_id);
1717   if (graph_order_iter == graph_execute_orders_.end()) {
1718     MS_LOG(EXCEPTION) << "Final graph" << final_graph_id << "has no child graph";
1719   }
1720   return graph_order_iter->second;
1721 }
1722 
1723 const std::vector<GraphType> &AscendSession::GetGraphOrderType(GraphId final_graph_id) const {
1724   auto graph_type_iter = graph_order_types_.find(final_graph_id);
1725   if (graph_type_iter == graph_order_types_.end()) {
1726     MS_LOG(EXCEPTION) << "Final graph" << final_graph_id << "has no graph_order_types_";
1727   }
1728   return graph_type_iter->second;
1729 }
1730 
1731 void AscendSession::SyncInitialTenosrToDevice() {
1732   for (auto &item : initial_tenosrs_) {
1733     auto to_graph_id = item.first.first;
1734     auto input_idx = item.first.second;
1735     auto front_tensor = item.second;
1736     auto to_graph = GetGraph(to_graph_id);
1737     MS_EXCEPTION_IF_NULL(to_graph);
1738     std::vector<AnfNodePtr> graph_inputs = to_graph->inputs();
1739     if (input_idx >= graph_inputs.size()) {
1740       MS_LOG(EXCEPTION) << "Input_index " << input_idx << " out of range size " << graph_inputs.size();
1741     }
1742     auto backend_parameter = graph_inputs[input_idx];
1743     // sync data from host to device
1744     MS_EXCEPTION_IF_NULL(front_tensor);
1745     size_t tensor_size = LongToSize(front_tensor->data().nbytes());
1746     auto addr = AnfAlgo::GetOutputAddr(backend_parameter, 0);
1747     MS_EXCEPTION_IF_NULL(addr);
1748     if (!addr->SyncHostToDevice(trans::GetRuntimePaddingShape(backend_parameter, 0), tensor_size,
1749                                 front_tensor->data_type(), front_tensor->data_c(),
1750                                 front_tensor->device_info().host_format_)) {
1751       MS_LOG(EXCEPTION) << "Tensor SyncHostToDevice fail!";
1752     }
1753   }
1754 }
1755 
1756 void AscendSession::RootGraphExecutorValidate(NotNull<KernelGraphPtr> graph,
1757                                               const std::vector<KernelGraphPtr> &all_graphs) {
1758   AscendAutoMonad auto_monad(graph);
1759   auto_monad.GenerateExecuteOrder();
1760   if (graph->label_num() > kLabelNumsThreshold) {
1761     MS_LOG(EXCEPTION) << "This model with " << all_graphs.size() << " graphs needs " << graph->label_num()
1762                       << " labels, which out of range of [0, 1024).\n1. Check if front-end composition is correct.\n"
1763                       << "2. Optimize model expression and reduce the number of graphs and labels.";
1764   }
1765 }
1766 
1767 void AscendSession::IrFusionPass(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) {
1768   if (memo->find(graph) != memo->end()) {
1769     return;
1770   }
1771   memo->insert(graph.get());
1772   opt::AscendBackendIRFusionOptimization(graph);
1773   graph->SetExecOrderByDefault();
1774 #ifdef ENABLE_DUMP_IR
1775   auto context_ptr = MsContext::GetInstance();
1776   MS_EXCEPTION_IF_NULL(context_ptr);
1777   bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
1778   if (save_graphs) {
1779     std::string file_name = "select_kernel_before_graph_" + std::to_string(graph->graph_id()) + ".ir";
1780     DumpIR(file_name, graph.get());
1781   }
1782 #endif
1783 
1784   for (auto &child_graph : graph->child_graph_order()) {
1785     IrFusionPass(NOT_NULL(child_graph.lock()), memo);
1786   }
1787 }
1788 
1789 void AscendSession::SelectKernel(NotNull<KernelGraphPtr> root_graph) {
1790   MS_LOG(INFO) << "Start select kernel.";
1791   size_t raise_precision_count = 0;
1792   size_t reduce_precision_count = 0;
1793 
1794   std::set<KernelGraphPtr> memo;
1795   RecurseSelectKernelInfo(root_graph, NOT_NULL(&memo), &raise_precision_count, &reduce_precision_count);
1796   memo.clear();
1797 
1798   auto ms_context = MsContext::GetInstance();
1799   MS_EXCEPTION_IF_NULL(ms_context);
1800   if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kGraphMode) {
1801     if (raise_precision_count > 0) {
1802       MS_LOG(WARNING) << "There are " << raise_precision_count
1803                       << " node/nodes used raise precision to selected the kernel!";
1804     }
1805     if (reduce_precision_count > 0) {
1806       MS_LOG(WARNING) << "There are " << reduce_precision_count
1807                       << " node/nodes used reduce precision to selected the kernel!";
1808     }
1809   }
1810   MS_LOG(INFO) << "Finish!";
1811 }
1812 
1813 void AscendSession::RecurseSelectKernelInfo(NotNull<KernelGraphPtr> graph,
1814                                             NotNull<std::set<KernelGraphPtr> *> const memo,
1815                                             size_t *const raise_precision_count,
1816                                             size_t *const reduce_precision_count) const {
1817   if (memo->find(graph) != memo->end()) {
1818     return;
1819   }
1820   memo->insert(graph.get());
1821   MS_LOG(INFO) << "Start to select kernel info in graph: " << graph->graph_id();
1822 
1823   for (const auto &cnode : graph->execution_order()) {
1824     if (AnfAlgo::IsCondControlKernel(cnode)) {
1825       std::vector<KernelGraphPtr> child_graphs;
1826       if (AnfAlgo::HasNodeAttr(kAttrChildGraph, cnode)) {
1827         child_graphs = AnfAlgo::GetNodeAttr<std::vector<KernelGraphPtr>>(cnode, kAttrChildGraph);
1828       }
1829       for (auto &child_graph : child_graphs) {
1830         RecurseSelectKernelInfo(NOT_NULL(child_graph), memo, raise_precision_count, reduce_precision_count);
1831       }
1832     }
1833 
1834     auto status = device::ascend::SelectKernelInfo(cnode);
1835     if (status == device::ascend::kStatusRaisePrecision) {
1836       (*raise_precision_count)++;
1837     } else if (status == device::ascend::kStatusReducePrecision) {
1838       (*reduce_precision_count)++;
1839     }
1840   }
1841 #ifdef ENABLE_DUMP_IR
1842   auto context_ptr = MsContext::GetInstance();
1843   MS_EXCEPTION_IF_NULL(context_ptr);
1844   bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
1845   if (save_graphs) {
1846     std::string file_name = "select_kernel_after_graph_" + std::to_string(graph->graph_id()) + ".ir";
1847     DumpIR(file_name, graph.get());
1848   }
1849 #endif
1850   MS_LOG(INFO) << "Finish selecting kernel info in graph: " << graph->graph_id();
1851 }
1852 
1853 void AscendSession::HardwareOptimize(NotNull<KernelGraphPtr> graph,
1854                                      NotNull<std::set<KernelGraphPtr> *> const memo) const {
1855   if (memo->find(graph) != memo->end()) {
1856     return;
1857   }
1858   memo->insert(graph.get());
1859 
1860   MS_LOG(INFO) << "Start to do HardwareOptimize in graph: " << graph->graph_id();
1861 
1862   HardwareOptimize(graph.get());
1863   for (auto &child_graph : graph->child_graph_order()) {
1864     HardwareOptimize(NOT_NULL(child_graph.lock()), memo);
1865   }
1866   MS_LOG(INFO) << "Finish doing HardwareOptimize in graph: " << graph->graph_id();
1867 }
1868 
1869 #ifdef ENABLE_DEBUGGER
1870 void AscendSession::LoadGraphsToDbg(NotNull<KernelGraphPtr> graph,
1871                                     NotNull<std::set<KernelGraphPtr> *> const memo) const {
1872   if (memo->find(graph) != memo->end()) {
1873     return;
1874   }
1875   memo->insert(graph.get());
1876 
1877   MS_LOG(INFO) << "Start to do LoadGraphsToDbg in graph: " << graph->graph_id();
1878 
1879   MS_EXCEPTION_IF_NULL(debugger_);
1880   debugger_->LoadGraphs(graph);
1881   MS_LOG(INFO) << "graph_sum_: " << graph_sum_;
1882   for (auto &child_graph : graph->child_graph_order()) {
1883     LoadGraphsToDbg(NOT_NULL(child_graph.lock()), memo);
1884   }
1885   MS_LOG(INFO) << "Finish doing LoadGraphsToDbg in graph: " << graph->graph_id();
1886 }
1887 #endif
1888 
1889 void AscendSession::AssignStaticMemory(NotNull<KernelGraphPtr> graph,
1890                                        NotNull<std::set<KernelGraphPtr> *> const memo) const {
1891   if (memo->find(graph) != memo->end()) {
1892     return;
1893   }
1894   memo->insert(graph.get());
1895 
1896   MS_LOG(INFO) << "Start to assign static memory for parameter in graph: " << graph->graph_id();
1897   // assign static memory for parameters
1898   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
1899   MS_EXCEPTION_IF_NULL(runtime_instance);
1900   runtime_instance->ClearGlobalIdleMem();
1901   runtime_instance->AssignStaticMemoryInput(*graph.get());
1902   runtime_instance->AssignStaticMemoryValueNode(*graph.get());
1903   for (auto &child_graph : graph->child_graph_order()) {
1904     AssignStaticMemory(NOT_NULL(child_graph.lock()), memo);
1905   }
1906   MS_LOG(INFO) << "Finish assigning static memory for parameter in graph: " << graph->graph_id();
1907 }
1908 
1909 void AscendSession::UpdateRefOutputMap(NotNull<KernelGraphPtr> graph,
1910                                        NotNull<std::set<KernelGraphPtr> *> const memo) const {
1911   if (memo->find(graph) != memo->end()) {
1912     return;
1913   }
1914   memo->insert(graph.get());
1915 
1916   for (auto &child_graph : graph->child_graph_order()) {
1917     std::shared_ptr<KernelGraph> child_graph_ptr = child_graph.lock();
1918     MS_EXCEPTION_IF_NULL(child_graph_ptr);
1919     UpdateRefOutputMap(NOT_NULL(child_graph_ptr), memo);
1920     // copy ref map to final graph
1921     auto child_ref_map = child_graph_ptr->GetRefMap();
1922     for (auto &item : child_ref_map) {
1923       if (graph->IsInRefOutputMap(item.first)) {
1924         MS_LOG(WARNING) << "The ref pair <" << item.first.first->DebugString() << ", " << item.first.second
1925                         << "> is already in " << graph->ToString();
1926         continue;
1927       }
1928       graph->AddRefCorrespondPairs(item.first, item.second);
1929     }
1930   }
1931 }
1932 
1933 void AscendSession::SyncStream() const {
1934   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
1935   MS_EXCEPTION_IF_NULL(runtime_instance);
1936   auto ret = runtime_instance->SyncStream();
1937   if (!ret) {
1938     MS_LOG(EXCEPTION) << "Sync stream error!";
1939   }
1940 }
1941 
1942 std::shared_ptr<device::Bucket> AscendSession::CreateBucket(uint32_t bucket_id, uint32_t bucket_size) {
1943   auto bucket = std::make_shared<device::ascend::AscendBucket>(bucket_id, bucket_size);
1944 
1945   auto kernel_runtime = device::KernelRuntimeManager::Instance().GetCurrentKernelRuntime();
1946   MS_EXCEPTION_IF_NULL(kernel_runtime);
1947   auto compute_stream = kernel_runtime->compute_stream();
1948   auto communication_stream = kernel_runtime->communication_stream();
1949   MS_EXCEPTION_IF_NULL(compute_stream);
1950   MS_EXCEPTION_IF_NULL(communication_stream);
1951 
1952   MS_EXCEPTION_IF_NULL(bucket);
1953   bucket->Init({compute_stream}, {communication_stream});
1954   return bucket;
1955 }
1956 
1957 void AscendSession::ReportWarningMessage() {
1958   const string &warning_message = ErrorManager::GetInstance().GetWarningMessage();
1959   if (!warning_message.empty()) {
1960     MS_LOG(WARNING) << "Ascend warning message:\n" << warning_message;
1961   }
1962 }
1963 
1964 void AscendSession::ReportErrorMessage() {
1965   const string &error_message = ErrorManager::GetInstance().GetErrorMessage();
1966   if (!error_message.empty() && error_message.find(kUnknowErrorString) == string::npos) {
1967     MS_LOG(ERROR) << "Ascend error occurred, error message:\n" << error_message;
1968   }
1969 }
1970 
1971 void AscendSession::SetThreadContext() { ErrorManager::GetInstance().GenWorkStreamIdDefault(); }
1972 
1973 void AscendSession::ExecuteAllTaskInQueue() {
1974   // Execute All Task
1975   auto &task_manager = PynativeTaskManager::GetInstance();
1976   if (task_manager.QueueEmpty()) {
1977     return;
1978   }
1979 
1980   try {
1981     MS_LOG(DEBUG) << "Start";
1982     auto ms_context = MsContext::GetInstance();
1983     auto infer_flag = ms_context->get_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER);
1984     ms_context->set_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER, true);
1985 
1986     BatchBuildKernel(task_manager.GetAllBuildTasks());
1987     task_manager.ClearAllBuildTasks();
1988 
1989     // Launch one by one
1990     const auto &launch_tasks = task_manager.GetAllLaunchTasks();
1991     while (!launch_tasks.empty()) {
1992       auto &launch_task = launch_tasks.front();
1993       const auto &context = launch_task->context();
1994       LaunchFunc(context->graph(), context->tensor_to_node(), context->is_dynamic_shape(), context->input_tensors());
1995       task_manager.PopLaunchTask();
1996     }
1997 
1998     ms_context->set_param<bool>(MS_CTX_ENABLE_PYNATIVE_INFER, infer_flag);
1999     MS_LOG(DEBUG) << "End";
2000   } catch (const std::exception &ex) {
2001     task_manager.Reset();
2002     throw(std::runtime_error(ex.what()));
2003   } catch (...) {
2004     task_manager.Reset();
2005     std::string exName(abi::__cxa_current_exception_type()->name());
2006     MS_LOG(EXCEPTION) << "Error occurred when execute task in queue. Exception name: " << exName;
2007   }
2008 }
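// Descriptive note on ExecuteAllTaskInQueue: queued PyNative build tasks are compiled in one batch via
// BatchBuildKernel, then launch tasks are replayed FIFO through LaunchFunc. MS_CTX_ENABLE_PYNATIVE_INFER is
// forced on for the duration and restored afterwards, and any exception resets the task manager before being
// rethrown.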
2009 void AscendSession::UpdateOutputTensors(const VectorRef *outputs,
2010                                         const std::map<tensor::TensorPtr, session::KernelWithIndex> &tensor_to_node,
2011                                         std::map<DeviceAddressPtr, DeviceAddressPtr> *) {
2012   auto context_ptr = MsContext::GetInstance();
2013   MS_EXCEPTION_IF_NULL(context_ptr);
2014   auto enable_mem_scheduler = context_ptr->get_param<bool>(MS_CTX_ENABLE_MEM_SCHEDULER);
2015   if (enable_mem_scheduler) {
2016     return;
2017   }
2018   MS_EXCEPTION_IF_NULL(outputs);
2019   tensor_device_addr_map_.clear();
2020   for (const auto &item : *outputs) {
2021     if (utils::isa<VectorRefPtr>(item)) {
2022       const auto &vector_ref = utils::cast<VectorRef>(item);
2023       std::map<DeviceAddressPtr, DeviceAddressPtr> new_to_old_device_address;
2024       UpdateOutputTensors(&vector_ref, tensor_to_node, &new_to_old_device_address);
2025     } else if (utils::isa<tensor::TensorPtr>(item)) {
2026       const auto &tensor = utils::cast<tensor::TensorPtr>(item);
2027       MS_EXCEPTION_IF_NULL(tensor);
2028       const auto &iter = tensor_to_node.find(tensor);
2029       if (iter != tensor_to_node.end()) {
2030         const auto &node = iter->second.first;
2031         size_t output_index = iter->second.second;
2032         if (!AnfAlgo::OutputAddrExist(node, output_index, true)) {
2033           continue;
2034         }
2035         const auto &address = AnfAlgo::GetMutableOutputAddr(node, output_index);
2036         tensor->set_device_address(address);
2037         if (IsVMGraphTaskSink() && tensor->NeedSyncDeviceToHostImmediately()) {
2038           auto dst_device_address = AssignExtraMemForGraphOutput(tensor, node, output_index);
2039           MS_EXCEPTION_IF_NULL(dst_device_address);
2040           if (!dst_device_address->SyncDeviceToDevice(trans::GetRuntimePaddingShape(node, output_index),
2041                                                       address->GetSize(), address->type_id(), address->GetPtr(),
2042                                                       address->format())) {
2043             MS_LOG(EXCEPTION) << "SyncDeviceToDevice failed!";
2044           }
2045           tensor->set_sync_status(kNoNeedSync);
2046           tensor_device_addr_map_[tensor] = dst_device_address;
2047         }
2048 
2049         if (AnfAlgo::IsDynamicShape(node)) {
2050           const auto &updated_shape = AnfAlgo::GetOutputInferShape(node, output_index);
2051           ShapeVector int_shape;
2052           (void)std::transform(updated_shape.begin(), updated_shape.end(), std::back_inserter(int_shape), SizeToInt);
2053           (void)tensor->set_shape(int_shape);
2054         }
2055       }
2056       if (tensor->NeedSyncDeviceToHostImmediately()) {
2057         tensor->data_sync(false);
2058         tensor->set_device_address(nullptr);
2059         tensor->set_sync_status(kNeedSyncHostToDevice);
2060       }
2061     }
2062   }
2063 }
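// Descriptive note on UpdateOutputTensors: when the memory scheduler is enabled, device addresses are managed
// elsewhere and the function returns early. Otherwise each output tensor is re-bound to its kernel's output
// address; under VM graph task sink, tensors that must be read back immediately get an extra static device
// buffer filled via SyncDeviceToDevice and are remembered in tensor_device_addr_map_, and dynamic-shape
// outputs also refresh their shape from the inferred output shape.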
2064 DeviceAddressPtr AscendSession::AssignExtraMemForGraphOutput(const tensor::TensorPtr &tensor, const AnfNodePtr &node,
2065                                                              size_t index) const {
2066   MS_EXCEPTION_IF_NULL(tensor);
2067   MS_EXCEPTION_IF_NULL(node);
2068   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_);
2069   MS_EXCEPTION_IF_NULL(runtime_instance);
2070   return runtime_instance->AssignExtraStaticMem(tensor, node, index);
2071 }
2072 }  // namespace session
2073 }  // namespace mindspore
2074