1 /**
2 * Copyright 2023 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "plugin/device/ascend/optimizer/ge_backend_optimization.h"
18
19 #include <memory>
20 #include <string>
21 #include "backend/common/pass/dropout_gen_mask_fusion.h"
22 #include "backend/common/pass/common_subexpression_elimination.h"
23 #include "backend/common/pass/erase_visit_attr.h"
24 #include "include/common/debug/anf_ir_dump.h"
25 #include "include/common/debug/dump_proto.h"
26 #include "include/backend/optimizer/optimizer.h"
27 #include "backend/common/pass/add_parallel_group_id_attr.h"
28 #include "include/backend/debug/profiler/profiling.h"
29 #include "plugin/device/ascend/optimizer/ge/all_to_all_v_for_ge.h"
30 #include "plugin/device/ascend/optimizer/ge/maketuple_depend_remover.h"
31 #include "plugin/device/ascend/optimizer/ge/fused_cast_add.h"
32 #include "plugin/device/ascend/optimizer/ge/expand_dims_for_batchnorm.h"
33 #include "plugin/device/ascend/optimizer/ge/convert_data_depend_to_control_depend.h"
34 #include "plugin/device/ascend/optimizer/ge/convert_condition_input_to_scalar.h"
35 #include "plugin/device/ascend/optimizer/ge/hcom/add_parallel_group_for_hcom.h"
36 #include "plugin/device/ascend/optimizer/ge/hcom/insert_tensor_move_for_hccl_op_ge.h"
37 #include "plugin/device/ascend/optimizer/ge/hcom/insert_depend_for_all_gather_ge.h"
38 #include "plugin/device/ascend/optimizer/ge/trans_depend_value_to_int32.h"
39 #include "plugin/device/ascend/optimizer/ge/expander_fallback.h"
40 #include "plugin/device/ascend/optimizer/ge/insert_identity.h"
41 #include "plugin/device/ascend/optimizer/ge/dropout_gen_mask_depend.h"
42 #include "plugin/device/ascend/optimizer/ge/unfold_maketuple.h"
43 #include "plugin/device/ascend/optimizer/ge/unfold_nested_output.h"
44 #include "plugin/device/ascend/optimizer/ge/resize_bilinear_add_attr.h"
45 #include "plugin/device/ascend/optimizer/ge/process_call_inline.h"
46 #include "plugin/device/ascend/optimizer/ge/process_partial_inline.h"
47 #include "plugin/device/ascend/optimizer/format_type/deal_ref_output.h"
48 #include "plugin/device/ascend/optimizer/ge/hcom/insert_load_for_allgather.h"
49 #include "plugin/device/ascend/optimizer/format_type/set_fracz_group_attr.h"
50 #include "plugin/device/ascend/optimizer/ge/shape_unify_mindir.h"
51 #include "plugin/device/ascend/optimizer/ge/inputs_unify_mindir.h"
52 #include "plugin/device/ascend/optimizer/ge/maketuple_unify_mindir.h"
53 #include "plugin/device/ascend/optimizer/ge/add_cast_for_ge.h"
54 #include "plugin/device/ascend/optimizer/ge/bce_with_logits_loss_for_ge.h"
55 #include "plugin/device/ascend/optimizer/ge/scalar_unify_mindir.h"
56 #include "plugin/device/ascend/optimizer/ge/tuple_unify_mindir.h"
57 #include "plugin/device/ascend/optimizer/ge/add_noop_to_es_grad.h"
58 #include "plugin/device/ascend/optimizer/ir_fission/seed_adapter.h"
59 #include "plugin/device/ascend/optimizer/ir_fission/ascend_convert_tuple_input_to_dynamic_input.h"
60 #include "plugin/device/ascend/optimizer/backend_common_unify_mindir.h"
61 #include "plugin/device/ascend/optimizer/ge/remove_tensor_to_scalar_or_tuple_ops.h"
62 #include "plugin/device/ascend/optimizer/ge/scalar_ops_output_unify_mindir.h"
63 #include "plugin/device/ascend/optimizer/ge/ge_convert_const_input_to_tensor_input.h"
64 #include "backend/common/pass/insert_type_transform_op.h"
65 #include "backend/common/pass/insert_tensor_move_for_communication.h"
66 #include "plugin/device/ascend/optimizer/enhancer/eliminate_maketuple_getitem.h"
67 #include "plugin/device/ascend/optimizer/ge/convert_pad_v3_paddings.h"
68 #include "plugin/device/ascend/optimizer/ge/broadcast_for_select.h"
69 #include "plugin/device/ascend/optimizer/ge/fa_alltoallv_parallel.h"
70 #include "plugin/device/ascend/optimizer/ir_fusion/shape_reshape_fusion.h"
71
72 namespace mindspore {
73 namespace opt {
GEBackendOptimization(const KernelGraphPtr & kernel_graph)74 void GEBackendOptimization(const KernelGraphPtr &kernel_graph) {
75 MS_EXCEPTION_IF_NULL(kernel_graph);
76 MS_LOG(DEBUG) << "Status record: start ascend backend optimize ge pass. graph id: " << kernel_graph->graph_id();
77 PROF_START(ascend_backend_optimize_ge);
78 auto context_ptr = MsContext::GetInstance();
79 MS_EXCEPTION_IF_NULL(context_ptr);
80 #ifdef ENABLE_DUMP_IR
81 if (context_ptr->CanDump(kIntroductory)) {
82 std::string file_name = "hwopt_d_before_opt_ge_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
83 DumpIR(file_name, kernel_graph, true, kWholeStack);
84 }
85 #endif
86 auto optimizer = std::make_shared<GraphOptimizer>();
87 auto opt_ge_pm = std::make_shared<PassManager>("opt_ge_pm");
88 opt_ge_pm->AddPass(std::make_shared<opt::GEConvertConstInputToTensorInput>());
89 opt_ge_pm->AddPass(std::make_shared<opt::RemoveTensorToScalarOrTupleOps>());
90 opt_ge_pm->AddPass(std::make_shared<opt::AllToAllvForGE>());
91 opt_ge_pm->AddPass(std::make_shared<opt::FaAlltoAllvParallel>());
92 opt_ge_pm->AddPass(std::make_shared<opt::InsertLoadForAllGather>());
93 opt_ge_pm->AddPass(std::make_shared<opt::InsertTensorMoveForHcclOpGe>());
94 opt_ge_pm->AddPass(std::make_shared<opt::InsertDependForAllGatherGe>());
95 opt_ge_pm->AddPass(std::make_shared<opt::ConvertCondInputToScalar>());
96 opt_ge_pm->AddPass(std::make_shared<opt::ConvertDataDependToControlDepend>());
97 opt_ge_pm->AddPass(std::make_shared<opt::MakeTupleDependRemover>());
98 opt_ge_pm->AddPass(std::make_shared<opt::FusedCastAdd>());
99 opt_ge_pm->AddPass(std::make_shared<opt::AddParallelGroupForHcom>());
100 opt_ge_pm->AddPass(std::make_shared<opt::ExpandDimsForBatchNorm>());
101 opt_ge_pm->AddPass(std::make_shared<opt::DropoutGenMaskDepend>());
102 opt_ge_pm->AddPass(std::make_shared<opt::AddCastForGe>());
103 opt_ge_pm->AddPass(std::make_shared<opt::ResizeBilinearAddAttr>());
104 opt_ge_pm->AddPass(std::make_shared<opt::AscendConvertTupleInputToDynamicInput>(true, true));
105 opt_ge_pm->AddPass(std::make_shared<opt::UnfoldNestedOutput>("unfold_nested_output"));
106 opt_ge_pm->AddPass(std::make_shared<opt::UnfoldMaketuple>("unfold_nested_maketuple"));
107 opt_ge_pm->AddPass(std::make_shared<opt::BroadCastForSelect>());
108 opt_ge_pm->AddPass(std::make_shared<opt::AddNoOpToESGrad>());
109 opt_ge_pm->AddPass(std::make_shared<opt::BCEWithLogitsLossForGe>());
110
111 optimizer->AddPassManager(opt_ge_pm);
112 (void)optimizer->Optimize(kernel_graph);
113 kernel_graph->SetExecOrderByDefault();
114 #ifdef ENABLE_DUMP_IR
115 if (context_ptr->CanDump(kIntroductory)) {
116 std::string file_name = "hwopt_d_end_opt_ge_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
117 DumpIR(file_name, kernel_graph, true, kWholeStack);
118 }
119 #endif
120 PROF_END(ascend_backend_optimize_ge);
121 MS_LOG(DEBUG) << "Status record: end ascend backend optimize ge pass. graph id: " << kernel_graph->graph_id();
122 }
123
// Runs the ACL backend optimization pipeline (before kernel selection) on
// @p kernel_graph and refreshes its default execution order. Profiling host
// info and PROF markers bracket the whole pipeline; IR is dumped before and
// after when dumping is enabled.
void GEBackendOptimizeACL(const KernelGraphPtr &kernel_graph) {
  MS_EXCEPTION_IF_NULL(kernel_graph);
  MS_LOG(DEBUG) << "Status record: start ascend backend optimize acl pass. graph id: " << kernel_graph->graph_id();
  profiler::CollectHostInfo("Ascend", "Graph Optimization", "BackendOptimization_OptimizeACL", 0, 0, 0);
  PROF_START(ascend_backend_optimize_acl);
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
#ifdef ENABLE_DUMP_IR
  // Dump the pre-optimization IR for debugging.
  if (context_ptr->CanDump(kIntroductory)) {
    std::string file_name = "hwopt_d_before_opt_acl_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
    DumpIR(file_name, kernel_graph, true, kWholeStack);
  }
#endif
  auto optimizer = std::make_shared<GraphOptimizer>();
  auto opt_acl_pm = std::make_shared<PassManager>("opt_acl_pm");
  opt_acl_pm->AddPass(std::make_shared<SeedAdapter>());

  // Choose the tensor-move insertion strategy for communication ops based on
  // the runtime configuration switch.
  if (common::IsEnableRuntimeConfig(common::kRuntimeInsertTensorMove)) {
    opt_acl_pm->AddPass(std::make_shared<opt::InsertTensorMoveForHcclOpGe>());
  } else {
    opt_acl_pm->AddPass(std::make_shared<InsertTensorMoveForCommunication>());
  }
  // NOTE: pass registration order below matters — passes run in this order.
  opt_acl_pm->AddPass(std::make_shared<opt::TransDependValueToInt32>());
  opt_acl_pm->AddPass(std::make_shared<opt::ProcessCallInline>());
  opt_acl_pm->AddPass(std::make_shared<opt::ProcessPartialInline>());
  opt_acl_pm->AddPass(std::make_shared<opt::ExpanderFallback>());
  opt_acl_pm->AddPass(std::make_shared<opt::ConvertPadV3Paddings>());
  opt_acl_pm->AddPass(std::make_shared<opt::ConvertPadV3GradPaddings>());
  opt_acl_pm->AddPass(std::make_shared<opt::ResizeBilinearAddAttr>());
  opt_acl_pm->AddPass(std::make_shared<opt::AddParallelGroupIdAttr>());
  optimizer->AddPassManager(opt_acl_pm);
  (void)optimizer->Optimize(kernel_graph);
  kernel_graph->SetExecOrderByDefault();
#ifdef ENABLE_DUMP_IR
  // Dump the post-optimization IR for comparison with the "before" dump.
  if (context_ptr->CanDump(kIntroductory)) {
    std::string file_name = "hwopt_d_end_opt_acl_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
    DumpIR(file_name, kernel_graph, true, kWholeStack);
  }
#endif
  PROF_END(ascend_backend_optimize_acl);
  profiler::CollectHostInfo("Ascend", "Graph Optimization", "BackendOptimization_OptimizeACL", 0, 0, 1);
  MS_LOG(DEBUG) << "Status record: end ascend backend optimize acl pass. graph id: " << kernel_graph->graph_id();
}
167
GEBackendOptimizeACLAfterKernelSelect(const KernelGraphPtr & kernel_graph)168 void GEBackendOptimizeACLAfterKernelSelect(const KernelGraphPtr &kernel_graph) {
169 MS_EXCEPTION_IF_NULL(kernel_graph);
170 MS_LOG(DEBUG) << "Status record: start ascend backend optimize acl pass after kernel select. graph id: "
171 << kernel_graph->graph_id();
172 profiler::CollectHostInfo("Ascend", "Graph Optimization", "BackendOptimization_OptimizeACLAfterKernelSelect", 0, 0,
173 0);
174 PROF_START(ascend_backend_optimize_acl_after_kernel_select);
175 auto context_ptr = MsContext::GetInstance();
176 MS_EXCEPTION_IF_NULL(context_ptr);
177 #ifdef ENABLE_DUMP_IR
178 if (context_ptr->CanDump(kIntroductory)) {
179 std::string file_name =
180 "hwopt_d_before_opt_acl_graph_after_kernel_select_" + std::to_string(kernel_graph->graph_id()) + ".ir";
181 DumpIR(file_name, kernel_graph, true, kWholeStack);
182 }
183 #endif
184 auto optimizer = std::make_shared<GraphOptimizer>();
185 auto opt_acl_after_kernel_select_pm = std::make_shared<PassManager>("opt_acl_after_kernel_select_pm");
186 opt_acl_after_kernel_select_pm->AddPass(std::make_shared<SetFraczGroupAttr>());
187 opt_acl_after_kernel_select_pm->AddPass(std::make_shared<InsertIdentity>());
188 opt_acl_after_kernel_select_pm->AddPass(std::make_shared<EraseVisitAttr>());
189 opt_acl_after_kernel_select_pm->AddPass(std::make_shared<DealRefOutput>());
190 if (!kernel_graph->is_from_single_op() && !kernel_graph->has_flag(kFlagIsPyNativeBpropKernelGraph)) {
191 opt_acl_after_kernel_select_pm->AddPass(std::make_shared<opt::InsertTypeTransformOp>());
192 }
193 if (!kernel_graph->is_graph_run_mode() && context_ptr->ascend_soc_version() != "ascend910") {
194 opt_acl_after_kernel_select_pm->AddPass(std::make_shared<opt::ShapeReshapeFusion>());
195 opt_acl_after_kernel_select_pm->AddPass(std::make_shared<opt::ShapeReshapeDirectFusion>());
196 }
197
198 optimizer->AddPassManager(opt_acl_after_kernel_select_pm);
199 (void)optimizer->Optimize(kernel_graph);
200 kernel_graph->SetExecOrderByDefault();
201 #ifdef ENABLE_DUMP_IR
202 if (context_ptr->CanDump(kIntroductory)) {
203 std::string file_name =
204 "hwopt_d_end_opt_acl_graph_after_kernel_select_" + std::to_string(kernel_graph->graph_id()) + ".ir";
205 DumpIR(file_name, kernel_graph, true, kWholeStack);
206 }
207 #endif
208 PROF_END(ascend_backend_optimize_acl_after_kernel_select);
209 profiler::CollectHostInfo("Ascend", "Graph Optimization", "BackendOptimization_OptimizeACLAfterKernelSelect", 0, 0,
210 1);
211 MS_LOG(DEBUG) << "Status record: end ascend backend optimize acl pass. graph id: " << kernel_graph->graph_id();
212 }
213
GEUnifyMindIR(const KernelGraphPtr & kernel_graph)214 void GEUnifyMindIR(const KernelGraphPtr &kernel_graph) {
215 profiler::CollectHostInfo("GE", "Graph Optimization", "BackendOptimization_UnifyMindIR", 0, 0, 0);
216 MS_EXCEPTION_IF_NULL(kernel_graph);
217 auto context_ptr = MsContext::GetInstance();
218 MS_EXCEPTION_IF_NULL(context_ptr);
219 #ifdef ENABLE_DUMP_IR
220 if (context_ptr->CanDump(kIntroductory)) {
221 std::string file_name = "hwopt_d_before_unify_mindir_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
222 DumpIR(file_name, kernel_graph);
223 DumpIRProto(kernel_graph, "before_unify_mindir_hwopt_" + std::to_string(kernel_graph->graph_id()));
224 }
225 #endif
226 auto optimizer = std::make_shared<opt::GraphOptimizer>();
227 optimizer->AddPassManager(GetGEUnifyMindIRPassManager());
228 (void)optimizer->Optimize(kernel_graph);
229 kernel_graph->SetExecOrderByDefault();
230 #ifdef ENABLE_DUMP_IR
231 if (context_ptr->CanDump(kIntroductory)) {
232 std::string file_name = "hwopt_d_after_unify_mindir_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
233 DumpIR(file_name, kernel_graph);
234 }
235 #endif
236 profiler::CollectHostInfo("GE", "Graph Optimization", "BackendOptimization_UnifyMindIR", 0, 0, 1);
237 }
238
GEAfterInlineOptimize(const KernelGraphPtr & kernel_graph)239 void GEAfterInlineOptimize(const KernelGraphPtr &kernel_graph) {
240 profiler::CollectHostInfo("GE", "Graph Optimization", "BackendOptimization_AfterInline", 0, 0, 0);
241 MS_EXCEPTION_IF_NULL(kernel_graph);
242 auto context_ptr = MsContext::GetInstance();
243 MS_EXCEPTION_IF_NULL(context_ptr);
244 #ifdef ENABLE_DUMP_IR
245 if (context_ptr->CanDump(kIntroductory)) {
246 std::string file_name =
247 "hwopt_d_before_inline_optimize_mindir_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
248 DumpIR(file_name, kernel_graph);
249 }
250 #endif
251 auto optimizer = std::make_shared<opt::GraphOptimizer>();
252 auto after_inline_pm = std::make_shared<PassManager>("after_inline_pm");
253 after_inline_pm->AddPass(std::make_shared<DropoutGenMaskFusion>());
254 after_inline_pm->AddPass(std::make_shared<CommonSubexpressionElimination>());
255 after_inline_pm->AddPass(std::make_shared<EliminateMaketupleGetitem>());
256 optimizer->AddPassManager(after_inline_pm);
257 (void)optimizer->Optimize(kernel_graph);
258 kernel_graph->SetExecOrderByDefault();
259 #ifdef ENABLE_DUMP_IR
260 if (context_ptr->CanDump(kIntroductory)) {
261 std::string file_name =
262 "hwopt_d_after_inline_optimize_mindir_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
263 DumpIR(file_name, kernel_graph);
264 }
265 #endif
266 profiler::CollectHostInfo("GE", "Graph Optimization", "BackendOptimization_AfterInline", 0, 0, 1);
267 }
268
// Unifies dynamic-shape constructs in @p func_graph for the GE backend:
// rewrites scalar/shape/tuple representations into forms GE can consume.
// Note: takes a FuncGraphPtr (not a KernelGraphPtr), so no execution-order
// refresh is performed here.
void GEDynamicUnifyMindIR(const FuncGraphPtr &func_graph) {
  profiler::CollectHostInfo("GE", "GE Dynamic Shape Unify MindIR", "GEBackend_Dynamic_UnifyMindIR", 0, 0, 0);
  MS_EXCEPTION_IF_NULL(func_graph);
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
#ifdef ENABLE_DUMP_IR
  // Dump the IR (text and proto) before unification for debugging.
  if (context_ptr->CanDump(kIntroductory)) {
    std::string file_name = "hwopt_d_before_ge_dynamic_shape_unify_mindir_graph.ir";
    DumpIR(file_name, func_graph);
    DumpIRProto(func_graph, "before_ge_dynamic_shape_unify_mindir_hwopt");
  }
#endif
  auto dynamic_unify_mindir_pm = std::make_shared<opt::PassManager>("ge_dynamic_unify_mindir_pm");
  // NOTE: the unify passes below are order-sensitive; scalar-ops output
  // unification runs first, tuple unification last.
  dynamic_unify_mindir_pm->AddPass(std::make_shared<opt::ScalarOpsOutputUnifyMindIR>());
  dynamic_unify_mindir_pm->AddPass(std::make_shared<opt::ShapeUnifyMindIR>());
  dynamic_unify_mindir_pm->AddPass(std::make_shared<opt::MakeTupleUnifyMindIR>());
  dynamic_unify_mindir_pm->AddPass(std::make_shared<opt::InputsUnifyMindIR>());
  dynamic_unify_mindir_pm->AddPass(std::make_shared<opt::ScalarUnifyMindIR>());
  dynamic_unify_mindir_pm->AddPass(std::make_shared<opt::TupleUnifyMindIR>());
  auto optimizer = std::make_shared<opt::GraphOptimizer>();
  optimizer->AddPassManager(dynamic_unify_mindir_pm);
  (void)optimizer->Optimize(func_graph);
#ifdef ENABLE_DUMP_IR
  if (context_ptr->CanDump(kIntroductory)) {
    std::string file_name = "hwopt_d_after_ge_dynamic_shape_unify_mindir_graph.ir";
    DumpIR(file_name, func_graph);
  }
#endif
  profiler::CollectHostInfo("GE", "GE Dynamic Shape Unify MindIR", "GEBackend_Dynamic_UnifyMindIR", 0, 0, 1);
}
299
GetGEUnifyMindIRPassManager()300 PassManagerPtr GetGEUnifyMindIRPassManager() {
301 auto unify_mindir_pm = std::make_shared<opt::PassManager>("ge_unify_mindir_pm");
302 MS_EXCEPTION_IF_NULL(unify_mindir_pm);
303 GetBackendCommonUnifyMindIRPassManager(&unify_mindir_pm);
304 return unify_mindir_pm;
305 }
306 } // namespace opt
307 } // namespace mindspore
308