• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2023 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "plugin/device/ascend/optimizer/ge_backend_optimization.h"
18 
19 #include <memory>
20 #include <string>
21 #include "backend/common/pass/dropout_gen_mask_fusion.h"
22 #include "backend/common/pass/common_subexpression_elimination.h"
23 #include "backend/common/pass/erase_visit_attr.h"
24 #include "include/common/debug/anf_ir_dump.h"
25 #include "include/common/debug/dump_proto.h"
26 #include "include/backend/optimizer/optimizer.h"
27 #include "backend/common/pass/add_parallel_group_id_attr.h"
28 #include "include/backend/debug/profiler/profiling.h"
29 #include "plugin/device/ascend/optimizer/ge/all_to_all_v_for_ge.h"
30 #include "plugin/device/ascend/optimizer/ge/maketuple_depend_remover.h"
31 #include "plugin/device/ascend/optimizer/ge/fused_cast_add.h"
32 #include "plugin/device/ascend/optimizer/ge/expand_dims_for_batchnorm.h"
33 #include "plugin/device/ascend/optimizer/ge/convert_data_depend_to_control_depend.h"
34 #include "plugin/device/ascend/optimizer/ge/convert_condition_input_to_scalar.h"
35 #include "plugin/device/ascend/optimizer/ge/hcom/add_parallel_group_for_hcom.h"
36 #include "plugin/device/ascend/optimizer/ge/hcom/insert_tensor_move_for_hccl_op_ge.h"
37 #include "plugin/device/ascend/optimizer/ge/hcom/insert_depend_for_all_gather_ge.h"
38 #include "plugin/device/ascend/optimizer/ge/trans_depend_value_to_int32.h"
39 #include "plugin/device/ascend/optimizer/ge/expander_fallback.h"
40 #include "plugin/device/ascend/optimizer/ge/insert_identity.h"
41 #include "plugin/device/ascend/optimizer/ge/dropout_gen_mask_depend.h"
42 #include "plugin/device/ascend/optimizer/ge/unfold_maketuple.h"
43 #include "plugin/device/ascend/optimizer/ge/unfold_nested_output.h"
44 #include "plugin/device/ascend/optimizer/ge/resize_bilinear_add_attr.h"
45 #include "plugin/device/ascend/optimizer/ge/process_call_inline.h"
46 #include "plugin/device/ascend/optimizer/ge/process_partial_inline.h"
47 #include "plugin/device/ascend/optimizer/format_type/deal_ref_output.h"
48 #include "plugin/device/ascend/optimizer/ge/hcom/insert_load_for_allgather.h"
49 #include "plugin/device/ascend/optimizer/format_type/set_fracz_group_attr.h"
50 #include "plugin/device/ascend/optimizer/ge/shape_unify_mindir.h"
51 #include "plugin/device/ascend/optimizer/ge/inputs_unify_mindir.h"
52 #include "plugin/device/ascend/optimizer/ge/maketuple_unify_mindir.h"
53 #include "plugin/device/ascend/optimizer/ge/add_cast_for_ge.h"
54 #include "plugin/device/ascend/optimizer/ge/bce_with_logits_loss_for_ge.h"
55 #include "plugin/device/ascend/optimizer/ge/scalar_unify_mindir.h"
56 #include "plugin/device/ascend/optimizer/ge/tuple_unify_mindir.h"
57 #include "plugin/device/ascend/optimizer/ge/add_noop_to_es_grad.h"
58 #include "plugin/device/ascend/optimizer/ir_fission/seed_adapter.h"
59 #include "plugin/device/ascend/optimizer/ir_fission/ascend_convert_tuple_input_to_dynamic_input.h"
60 #include "plugin/device/ascend/optimizer/backend_common_unify_mindir.h"
61 #include "plugin/device/ascend/optimizer/ge/remove_tensor_to_scalar_or_tuple_ops.h"
62 #include "plugin/device/ascend/optimizer/ge/scalar_ops_output_unify_mindir.h"
63 #include "plugin/device/ascend/optimizer/ge/ge_convert_const_input_to_tensor_input.h"
64 #include "backend/common/pass/insert_type_transform_op.h"
65 #include "backend/common/pass/insert_tensor_move_for_communication.h"
66 #include "plugin/device/ascend/optimizer/enhancer/eliminate_maketuple_getitem.h"
67 #include "plugin/device/ascend/optimizer/ge/convert_pad_v3_paddings.h"
68 #include "plugin/device/ascend/optimizer/ge/broadcast_for_select.h"
69 #include "plugin/device/ascend/optimizer/ge/fa_alltoallv_parallel.h"
70 #include "plugin/device/ascend/optimizer/ir_fusion/shape_reshape_fusion.h"
71 
72 namespace mindspore {
73 namespace opt {
// Runs the GE backend optimization pipeline ("opt_ge_pm") on kernel_graph: a
// fixed sequence of conversion/fusion passes applied before the graph is
// handed to GE. When introductory IR dumping is enabled, the graph is dumped
// before and after the pipeline. The whole phase is wrapped in
// PROF_START/PROF_END for profiling.
void GEBackendOptimization(const KernelGraphPtr &kernel_graph) {
  MS_EXCEPTION_IF_NULL(kernel_graph);
  MS_LOG(DEBUG) << "Status record: start ascend backend optimize ge pass. graph id: " << kernel_graph->graph_id();
  PROF_START(ascend_backend_optimize_ge);
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
#ifdef ENABLE_DUMP_IR
  // Dump the pre-optimization graph for debugging.
  if (context_ptr->CanDump(kIntroductory)) {
    std::string file_name = "hwopt_d_before_opt_ge_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
    DumpIR(file_name, kernel_graph, true, kWholeStack);
  }
#endif
  auto optimizer = std::make_shared<GraphOptimizer>();
  auto opt_ge_pm = std::make_shared<PassManager>("opt_ge_pm");
  // NOTE(review): passes run in registration order; later passes presumably
  // rely on rewrites made by earlier ones — reorder with care.
  opt_ge_pm->AddPass(std::make_shared<opt::GEConvertConstInputToTensorInput>());
  opt_ge_pm->AddPass(std::make_shared<opt::RemoveTensorToScalarOrTupleOps>());
  opt_ge_pm->AddPass(std::make_shared<opt::AllToAllvForGE>());
  opt_ge_pm->AddPass(std::make_shared<opt::FaAlltoAllvParallel>());
  // Communication-op related passes (load/tensor-move/depend insertion).
  opt_ge_pm->AddPass(std::make_shared<opt::InsertLoadForAllGather>());
  opt_ge_pm->AddPass(std::make_shared<opt::InsertTensorMoveForHcclOpGe>());
  opt_ge_pm->AddPass(std::make_shared<opt::InsertDependForAllGatherGe>());
  opt_ge_pm->AddPass(std::make_shared<opt::ConvertCondInputToScalar>());
  opt_ge_pm->AddPass(std::make_shared<opt::ConvertDataDependToControlDepend>());
  opt_ge_pm->AddPass(std::make_shared<opt::MakeTupleDependRemover>());
  opt_ge_pm->AddPass(std::make_shared<opt::FusedCastAdd>());
  opt_ge_pm->AddPass(std::make_shared<opt::AddParallelGroupForHcom>());
  opt_ge_pm->AddPass(std::make_shared<opt::ExpandDimsForBatchNorm>());
  opt_ge_pm->AddPass(std::make_shared<opt::DropoutGenMaskDepend>());
  opt_ge_pm->AddPass(std::make_shared<opt::AddCastForGe>());
  opt_ge_pm->AddPass(std::make_shared<opt::ResizeBilinearAddAttr>());
  opt_ge_pm->AddPass(std::make_shared<opt::AscendConvertTupleInputToDynamicInput>(true, true));
  // Flatten nested tuple outputs/maketuples before handing the graph to GE.
  opt_ge_pm->AddPass(std::make_shared<opt::UnfoldNestedOutput>("unfold_nested_output"));
  opt_ge_pm->AddPass(std::make_shared<opt::UnfoldMaketuple>("unfold_nested_maketuple"));
  opt_ge_pm->AddPass(std::make_shared<opt::BroadCastForSelect>());
  opt_ge_pm->AddPass(std::make_shared<opt::AddNoOpToESGrad>());
  opt_ge_pm->AddPass(std::make_shared<opt::BCEWithLogitsLossForGe>());

  optimizer->AddPassManager(opt_ge_pm);
  (void)optimizer->Optimize(kernel_graph);
  // Re-establish the execution order after the graph has been rewritten.
  kernel_graph->SetExecOrderByDefault();
#ifdef ENABLE_DUMP_IR
  // Dump the post-optimization graph for debugging.
  if (context_ptr->CanDump(kIntroductory)) {
    std::string file_name = "hwopt_d_end_opt_ge_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
    DumpIR(file_name, kernel_graph, true, kWholeStack);
  }
#endif
  PROF_END(ascend_backend_optimize_ge);
  MS_LOG(DEBUG) << "Status record: end ascend backend optimize ge pass. graph id: " << kernel_graph->graph_id();
}
123 
GEBackendOptimizeACL(const KernelGraphPtr & kernel_graph)124 void GEBackendOptimizeACL(const KernelGraphPtr &kernel_graph) {
125   MS_EXCEPTION_IF_NULL(kernel_graph);
126   MS_LOG(DEBUG) << "Status record: start ascend backend optimize acl pass. graph id: " << kernel_graph->graph_id();
127   profiler::CollectHostInfo("Ascend", "Graph Optimization", "BackendOptimization_OptimizeACL", 0, 0, 0);
128   PROF_START(ascend_backend_optimize_acl);
129   auto context_ptr = MsContext::GetInstance();
130   MS_EXCEPTION_IF_NULL(context_ptr);
131 #ifdef ENABLE_DUMP_IR
132   if (context_ptr->CanDump(kIntroductory)) {
133     std::string file_name = "hwopt_d_before_opt_acl_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
134     DumpIR(file_name, kernel_graph, true, kWholeStack);
135   }
136 #endif
137   auto optimizer = std::make_shared<GraphOptimizer>();
138   auto opt_acl_pm = std::make_shared<PassManager>("opt_acl_pm");
139   opt_acl_pm->AddPass(std::make_shared<SeedAdapter>());
140 
141   if (common::IsEnableRuntimeConfig(common::kRuntimeInsertTensorMove)) {
142     opt_acl_pm->AddPass(std::make_shared<opt::InsertTensorMoveForHcclOpGe>());
143   } else {
144     opt_acl_pm->AddPass(std::make_shared<InsertTensorMoveForCommunication>());
145   }
146   opt_acl_pm->AddPass(std::make_shared<opt::TransDependValueToInt32>());
147   opt_acl_pm->AddPass(std::make_shared<opt::ProcessCallInline>());
148   opt_acl_pm->AddPass(std::make_shared<opt::ProcessPartialInline>());
149   opt_acl_pm->AddPass(std::make_shared<opt::ExpanderFallback>());
150   opt_acl_pm->AddPass(std::make_shared<opt::ConvertPadV3Paddings>());
151   opt_acl_pm->AddPass(std::make_shared<opt::ConvertPadV3GradPaddings>());
152   opt_acl_pm->AddPass(std::make_shared<opt::ResizeBilinearAddAttr>());
153   opt_acl_pm->AddPass(std::make_shared<opt::AddParallelGroupIdAttr>());
154   optimizer->AddPassManager(opt_acl_pm);
155   (void)optimizer->Optimize(kernel_graph);
156   kernel_graph->SetExecOrderByDefault();
157 #ifdef ENABLE_DUMP_IR
158   if (context_ptr->CanDump(kIntroductory)) {
159     std::string file_name = "hwopt_d_end_opt_acl_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
160     DumpIR(file_name, kernel_graph, true, kWholeStack);
161   }
162 #endif
163   PROF_END(ascend_backend_optimize_acl);
164   profiler::CollectHostInfo("Ascend", "Graph Optimization", "BackendOptimization_OptimizeACL", 0, 0, 1);
165   MS_LOG(DEBUG) << "Status record: end ascend backend optimize acl pass. graph id: " << kernel_graph->graph_id();
166 }
167 
GEBackendOptimizeACLAfterKernelSelect(const KernelGraphPtr & kernel_graph)168 void GEBackendOptimizeACLAfterKernelSelect(const KernelGraphPtr &kernel_graph) {
169   MS_EXCEPTION_IF_NULL(kernel_graph);
170   MS_LOG(DEBUG) << "Status record: start ascend backend optimize acl pass after kernel select. graph id: "
171                 << kernel_graph->graph_id();
172   profiler::CollectHostInfo("Ascend", "Graph Optimization", "BackendOptimization_OptimizeACLAfterKernelSelect", 0, 0,
173                             0);
174   PROF_START(ascend_backend_optimize_acl_after_kernel_select);
175   auto context_ptr = MsContext::GetInstance();
176   MS_EXCEPTION_IF_NULL(context_ptr);
177 #ifdef ENABLE_DUMP_IR
178   if (context_ptr->CanDump(kIntroductory)) {
179     std::string file_name =
180       "hwopt_d_before_opt_acl_graph_after_kernel_select_" + std::to_string(kernel_graph->graph_id()) + ".ir";
181     DumpIR(file_name, kernel_graph, true, kWholeStack);
182   }
183 #endif
184   auto optimizer = std::make_shared<GraphOptimizer>();
185   auto opt_acl_after_kernel_select_pm = std::make_shared<PassManager>("opt_acl_after_kernel_select_pm");
186   opt_acl_after_kernel_select_pm->AddPass(std::make_shared<SetFraczGroupAttr>());
187   opt_acl_after_kernel_select_pm->AddPass(std::make_shared<InsertIdentity>());
188   opt_acl_after_kernel_select_pm->AddPass(std::make_shared<EraseVisitAttr>());
189   opt_acl_after_kernel_select_pm->AddPass(std::make_shared<DealRefOutput>());
190   if (!kernel_graph->is_from_single_op() && !kernel_graph->has_flag(kFlagIsPyNativeBpropKernelGraph)) {
191     opt_acl_after_kernel_select_pm->AddPass(std::make_shared<opt::InsertTypeTransformOp>());
192   }
193   if (!kernel_graph->is_graph_run_mode() && context_ptr->ascend_soc_version() != "ascend910") {
194     opt_acl_after_kernel_select_pm->AddPass(std::make_shared<opt::ShapeReshapeFusion>());
195     opt_acl_after_kernel_select_pm->AddPass(std::make_shared<opt::ShapeReshapeDirectFusion>());
196   }
197 
198   optimizer->AddPassManager(opt_acl_after_kernel_select_pm);
199   (void)optimizer->Optimize(kernel_graph);
200   kernel_graph->SetExecOrderByDefault();
201 #ifdef ENABLE_DUMP_IR
202   if (context_ptr->CanDump(kIntroductory)) {
203     std::string file_name =
204       "hwopt_d_end_opt_acl_graph_after_kernel_select_" + std::to_string(kernel_graph->graph_id()) + ".ir";
205     DumpIR(file_name, kernel_graph, true, kWholeStack);
206   }
207 #endif
208   PROF_END(ascend_backend_optimize_acl_after_kernel_select);
209   profiler::CollectHostInfo("Ascend", "Graph Optimization", "BackendOptimization_OptimizeACLAfterKernelSelect", 0, 0,
210                             1);
211   MS_LOG(DEBUG) << "Status record: end ascend backend optimize acl pass. graph id: " << kernel_graph->graph_id();
212 }
213 
GEUnifyMindIR(const KernelGraphPtr & kernel_graph)214 void GEUnifyMindIR(const KernelGraphPtr &kernel_graph) {
215   profiler::CollectHostInfo("GE", "Graph Optimization", "BackendOptimization_UnifyMindIR", 0, 0, 0);
216   MS_EXCEPTION_IF_NULL(kernel_graph);
217   auto context_ptr = MsContext::GetInstance();
218   MS_EXCEPTION_IF_NULL(context_ptr);
219 #ifdef ENABLE_DUMP_IR
220   if (context_ptr->CanDump(kIntroductory)) {
221     std::string file_name = "hwopt_d_before_unify_mindir_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
222     DumpIR(file_name, kernel_graph);
223     DumpIRProto(kernel_graph, "before_unify_mindir_hwopt_" + std::to_string(kernel_graph->graph_id()));
224   }
225 #endif
226   auto optimizer = std::make_shared<opt::GraphOptimizer>();
227   optimizer->AddPassManager(GetGEUnifyMindIRPassManager());
228   (void)optimizer->Optimize(kernel_graph);
229   kernel_graph->SetExecOrderByDefault();
230 #ifdef ENABLE_DUMP_IR
231   if (context_ptr->CanDump(kIntroductory)) {
232     std::string file_name = "hwopt_d_after_unify_mindir_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
233     DumpIR(file_name, kernel_graph);
234   }
235 #endif
236   profiler::CollectHostInfo("GE", "Graph Optimization", "BackendOptimization_UnifyMindIR", 0, 0, 1);
237 }
238 
GEAfterInlineOptimize(const KernelGraphPtr & kernel_graph)239 void GEAfterInlineOptimize(const KernelGraphPtr &kernel_graph) {
240   profiler::CollectHostInfo("GE", "Graph Optimization", "BackendOptimization_AfterInline", 0, 0, 0);
241   MS_EXCEPTION_IF_NULL(kernel_graph);
242   auto context_ptr = MsContext::GetInstance();
243   MS_EXCEPTION_IF_NULL(context_ptr);
244 #ifdef ENABLE_DUMP_IR
245   if (context_ptr->CanDump(kIntroductory)) {
246     std::string file_name =
247       "hwopt_d_before_inline_optimize_mindir_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
248     DumpIR(file_name, kernel_graph);
249   }
250 #endif
251   auto optimizer = std::make_shared<opt::GraphOptimizer>();
252   auto after_inline_pm = std::make_shared<PassManager>("after_inline_pm");
253   after_inline_pm->AddPass(std::make_shared<DropoutGenMaskFusion>());
254   after_inline_pm->AddPass(std::make_shared<CommonSubexpressionElimination>());
255   after_inline_pm->AddPass(std::make_shared<EliminateMaketupleGetitem>());
256   optimizer->AddPassManager(after_inline_pm);
257   (void)optimizer->Optimize(kernel_graph);
258   kernel_graph->SetExecOrderByDefault();
259 #ifdef ENABLE_DUMP_IR
260   if (context_ptr->CanDump(kIntroductory)) {
261     std::string file_name =
262       "hwopt_d_after_inline_optimize_mindir_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
263     DumpIR(file_name, kernel_graph);
264   }
265 #endif
266   profiler::CollectHostInfo("GE", "Graph Optimization", "BackendOptimization_AfterInline", 0, 0, 1);
267 }
268 
GEDynamicUnifyMindIR(const FuncGraphPtr & func_graph)269 void GEDynamicUnifyMindIR(const FuncGraphPtr &func_graph) {
270   profiler::CollectHostInfo("GE", "GE Dynamic Shape Unify MindIR", "GEBackend_Dynamic_UnifyMindIR", 0, 0, 0);
271   MS_EXCEPTION_IF_NULL(func_graph);
272   auto context_ptr = MsContext::GetInstance();
273   MS_EXCEPTION_IF_NULL(context_ptr);
274 #ifdef ENABLE_DUMP_IR
275   if (context_ptr->CanDump(kIntroductory)) {
276     std::string file_name = "hwopt_d_before_ge_dynamic_shape_unify_mindir_graph.ir";
277     DumpIR(file_name, func_graph);
278     DumpIRProto(func_graph, "before_ge_dynamic_shape_unify_mindir_hwopt");
279   }
280 #endif
281   auto dynamic_unify_mindir_pm = std::make_shared<opt::PassManager>("ge_dynamic_unify_mindir_pm");
282   dynamic_unify_mindir_pm->AddPass(std::make_shared<opt::ScalarOpsOutputUnifyMindIR>());
283   dynamic_unify_mindir_pm->AddPass(std::make_shared<opt::ShapeUnifyMindIR>());
284   dynamic_unify_mindir_pm->AddPass(std::make_shared<opt::MakeTupleUnifyMindIR>());
285   dynamic_unify_mindir_pm->AddPass(std::make_shared<opt::InputsUnifyMindIR>());
286   dynamic_unify_mindir_pm->AddPass(std::make_shared<opt::ScalarUnifyMindIR>());
287   dynamic_unify_mindir_pm->AddPass(std::make_shared<opt::TupleUnifyMindIR>());
288   auto optimizer = std::make_shared<opt::GraphOptimizer>();
289   optimizer->AddPassManager(dynamic_unify_mindir_pm);
290   (void)optimizer->Optimize(func_graph);
291 #ifdef ENABLE_DUMP_IR
292   if (context_ptr->CanDump(kIntroductory)) {
293     std::string file_name = "hwopt_d_after_ge_dynamic_shape_unify_mindir_graph.ir";
294     DumpIR(file_name, func_graph);
295   }
296 #endif
297   profiler::CollectHostInfo("GE", "GE Dynamic Shape Unify MindIR", "GEBackend_Dynamic_UnifyMindIR", 0, 0, 1);
298 }
299 
GetGEUnifyMindIRPassManager()300 PassManagerPtr GetGEUnifyMindIRPassManager() {
301   auto unify_mindir_pm = std::make_shared<opt::PassManager>("ge_unify_mindir_pm");
302   MS_EXCEPTION_IF_NULL(unify_mindir_pm);
303   GetBackendCommonUnifyMindIRPassManager(&unify_mindir_pm);
304   return unify_mindir_pm;
305 }
306 }  // namespace opt
307 }  // namespace mindspore
308