• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2019-2021 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "backend/optimizer/ascend/ascend_backend_optimization.h"
17 #include <algorithm>
18 #include <list>
19 #include <memory>
20 #include <vector>
21 #include <string>
22 #include "backend/optimizer/common/optimizer.h"
23 #include "backend/optimizer/ascend/ir_fission/dynamic_rnn_grad_fission_v2.h"
24 #include "backend/optimizer/ascend/ir_fission/dynamic_gru_v2_grad_fission.h"
25 #include "backend/optimizer/ascend/ir_fission/bn_split.h"
26 #include "backend/optimizer/ascend/ir_fission/bn_grad_split.h"
27 #include "backend/optimizer/ascend/ir_fission/batch_norm_grad_split.h"
28 #include "backend/optimizer/ascend/ir_fission/batch_norm_bert_fission.h"
29 #include "backend/optimizer/ascend/ir_fission/single_batch_norm_fission.h"
30 #include "backend/optimizer/ascend/ir_fission/tensor_scatter_update_fission.h"
31 #include "backend/optimizer/ascend/ir_fission/reduce_min_fission.h"
32 #include "backend/optimizer/ascend/ir_fusion/fused_batch_norm_fusion.h"
33 #include "backend/optimizer/ascend/ir_fission/layer_norm_grad_split.h"
34 #include "backend/optimizer/ascend/ir_fission/unsorted_segment_sum_fission.h"
35 #include "backend/optimizer/ascend/ir_fission/gather_v2_ds_fission.h"
36 #include "backend/optimizer/ascend/ir_fission/bce_with_logits_loss_fission.h"
37 #include "backend/optimizer/ascend/ir_fission/cdist_fission.h"
38 #include "backend/optimizer/pass/communication_op_fusion.h"
39 #include "backend/optimizer/ascend/ir_fusion/square_sum_fusion.h"
40 #include "backend/optimizer/ascend/ir_fusion/clip_by_norm_no_div_square_sum_fusion.h"
41 #include "backend/optimizer/ascend/ir_fusion/lamb_update_with_lr_rule_fusion.h"
42 #include "backend/optimizer/ascend/ir_fusion/prelu_fusion.h"
43 #include "backend/optimizer/ascend/ir_fusion/clip_by_value_fusion.h"
44 #include "backend/optimizer/ascend/ir_fusion/confusion_softmax_grad_rule.h"
45 #include "backend/optimizer/ascend/ir_fusion/lamb_next_mv_rule.h"
46 #include "backend/optimizer/ascend/ir_fusion/lamb_next_mv_with_decay_rule.h"
47 #include "backend/optimizer/ascend/ir_fusion/lamb_next_right_rule.h"
48 #include "backend/optimizer/ascend/ir_fusion/lamb_update_with_lr_v2.h"
49 #include "backend/optimizer/ascend/ir_fusion/layer_norm_beta_gamma_backprop_fusion.h"
50 #include "backend/optimizer/ascend/ir_fusion/reshape_transpose_fusion.h"
51 #include "backend/optimizer/ascend/ir_fusion/transpose_reshape_fusion.h"
52 #include "backend/optimizer/ascend/ir_fusion/adam_apply_one_fusion.h"
53 #include "backend/optimizer/ascend/ir_fusion/adam_apply_one_with_decay_rule.h"
54 #include "backend/optimizer/ascend/ir_fusion/parameter_and_transop_fusion.h"
55 #include "backend/optimizer/ascend/ir_fusion/refresh_parameter_format.h"
56 #include "backend/optimizer/ascend/ir_fusion/transpose_transdata_fusion.h"
57 #include "backend/optimizer/ascend/ir_fission/transdata_split.h"
58 #include "backend/optimizer/ascend/ir_fission/topk_split.h"
59 #include "backend/optimizer/ascend/ir_fission/lin_space_fission.h"
60 #include "backend/optimizer/ascend/ir_fission/space_to_depth_split.h"
61 #include "backend/optimizer/ascend/ir_fission/diag_fission.h"
62 #include "backend/optimizer/ascend/ir_fission/diag_part_fission.h"
63 #include "backend/optimizer/ascend/ir_fission/max_pool3d_grad_grad_fission.h"
64 #include "backend/optimizer/ascend/ir_fusion/avgpool_3d_fusion.h"
65 #include "backend/optimizer/ascend/ir_fusion/avgpool_3d_grad_fusion.h"
66 #include "backend/optimizer/ascend/ir_fusion/momentum_lossscale_fusion.h"
67 #include "backend/optimizer/ascend/ir_fusion/mul_add_fusion.h"
68 #include "backend/optimizer/ascend/ir_fusion/mul_addn_fusion.h"
69 #include "backend/optimizer/ascend/ir_fusion/matmul_biasadd_fusion.h"
70 #include "backend/optimizer/ascend/ir_fusion/remove_reshape_pair.h"
71 #include "backend/optimizer/ascend/ir_fusion/derelu_fusion.h"
72 #include "backend/optimizer/ascend/ir_fusion/batchnorm_to_bninfer.h"
73 #include "backend/optimizer/ascend/ir_fusion/batchnormgrad_to_bninfergrad.h"
74 #include "backend/optimizer/ascend/ir_fusion/confusion_mul_grad_fusion.h"
75 #include "backend/optimizer/ascend/ir_fusion/softmax_grad_ext_fusion.h"
76 #include "backend/optimizer/ascend/ir_fusion/bn_reduce_grad_conv2d_backprop_filter_fusion.h"
77 #include "backend/optimizer/ascend/ir_fusion/transposed_update_fusion.h"
78 #include "backend/optimizer/ascend/format_type/insert_trans_op.h"
79 #include "backend/optimizer/ascend/format_type/trans_op_format_refine.h"
80 #include "backend/optimizer/ascend/format_type/dynamic_rnn_grad_reformat.h"
81 #include "backend/optimizer/ascend/format_type/insert_transpose_for_basiclstm_op.h"
82 #include "backend/optimizer/ascend/format_type/insert_transpose_for_dyanmic_gru_v2.h"
83 #include "backend/optimizer/ascend/format_type/rectify_do_mask_kernel_info.h"
84 #include "backend/optimizer/ascend/format_type/change_axis_of_reduce_kernel.h"
85 #include "backend/optimizer/ascend/format_type/convert_cast_format.h"
86 #include "backend/optimizer/ascend/format_type/set_fracz_group_attr.h"
87 #include "backend/optimizer/pass/getitem_tuple.h"
88 #include "backend/optimizer/pass/optimize_dependence.h"
89 #include "backend/optimizer/pass/erase_visit_attr.h"
90 #include "backend/optimizer/ascend/format_type/insert_cast.h"
91 #include "backend/optimizer/ascend/format_type/convert_unsupported_transnode_to_aicpu.h"
92 #include "backend/optimizer/pass/eliminate_redundant_op.h"
93 #include "backend/optimizer/pass/common_subexpression_elimination.h"
94 #include "backend/optimizer/ascend/format_type/merge_cast_to_op.h"
95 #include "backend/optimizer/ascend/format_type/check_consistency.h"
96 #include "backend/optimizer/ascend/buffer_fusion/ub_pattern_fusion.h"
97 #include "backend/optimizer/ascend/buffer_fusion/eltwise_fusion_pass.h"
98 #include "backend/optimizer/ascend/buffer_fusion/multi_output_fusion_pass.h"
99 #include "backend/optimizer/ascend/buffer_fusion/conv2dbackprop_eltwise_eltwise_fusion_pass.h"
100 #include "backend/optimizer/ascend/buffer_fusion/conv2dbackprop_eltwise_fusion_pass.h"
101 #include "backend/optimizer/ascend/buffer_fusion/conv_single_in_fusion_pass.h"
102 #include "backend/optimizer/ascend/buffer_fusion/conv_double_in_fusion_pass.h"
103 #include "backend/optimizer/ascend/buffer_fusion/matmul_eltwise_fusion_pass.h"
104 #include "backend/optimizer/ascend/buffer_fusion/matmul_confusiontranspose_fusion_pass.h"
105 #include "backend/optimizer/ascend/buffer_fusion/batchmatmul_fusedmuladd_fusion_pass.h"
106 #include "backend/optimizer/ascend/buffer_fusion/depthwiseconv_eltwise_fusion_pass.h"
107 #include "backend/optimizer/ascend/buffer_fusion/bnupdate_eltwise_fusion_pass.h"
108 #include "backend/optimizer/ascend/buffer_fusion/bnupdate_eltwise_eltwise_fusion_pass.h"
109 #include "backend/optimizer/ascend/buffer_fusion/conv_bnreduce_fusion_pass.h"
110 #include "backend/optimizer/ascend/buffer_fusion/reduce_eltwise_fusion_pass.h"
111 #include "backend/optimizer/ascend/buffer_fusion/segment_eltwise_fusion_pass.h"
112 #include "backend/optimizer/ascend/format_type/deal_ref_and_split_unsupported_transdata.h"
113 #include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_hccl_op.h"
114 #include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_cascade.h"
115 #include "backend/optimizer/ascend/enhancer/insert_pad_for_nms_with_mask.h"
116 #include "backend/optimizer/ascend/format_type/insert_transdata_for_runop.h"
117 #include "backend/optimizer/ascend/enhancer/getnext_tensor_move_elimination.h"
118 #include "backend/optimizer/ascend/ir_fission/addn_fission.h"
119 #include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_getnext.h"
120 #include "backend/optimizer/ascend/ir_fission/batch_norm_grad_infer_fission.h"
121 #include "backend/optimizer/ascend/ir_fission/split_fission.h"
122 #include "backend/optimizer/ascend/ir_fission/splitv_fission.h"
123 #include "backend/optimizer/ascend/ir_fusion/add_input_to_output.h"
124 #include "backend/optimizer/ascend/format_type/remove_internal_output.h"
125 #include "backend/optimizer/ascend/ir_fission/concat_fission.h"
126 #include "backend/optimizer/ascend/ir_fission/pack_fission.h"
127 #include "backend/optimizer/ascend/enhancer/concat_outputs_for_all_gather.h"
128 #include "backend/optimizer/ascend/enhancer/insert_depend_for_all_gather.h"
129 #include "backend/optimizer/ascend/enhancer/split_inputs_for_reduce_scatter.h"
130 #include "backend/optimizer/ascend/enhancer/add_placeholder_for_dynamic_rnn.h"
131 #include "backend/optimizer/ascend/enhancer/add_placeholder_for_dynamic_gru.h"
132 #include "backend/optimizer/ascend/enhancer/add_attr_for_3d_graph.h"
133 #include "backend/optimizer/ascend/enhancer/split_n_optimizer.h"
134 #include "backend/optimizer/pass/adjust_depend_for_parallel_optimizer_recompute_all_gather.h"
135 #include "backend/kernel_compiler/tbe/ascend_kernel_compile.h"
136 #include "utils/ms_context.h"
137 #include "utils/config_manager.h"
138 #include "utils/context/graph_kernel_flags.h"
139 #include "debug/anf_ir_dump.h"
140 #include "debug/dump_proto.h"
141 #ifdef ENABLE_DUMP_IR
142 #include "debug/rdr/running_data_recorder.h"
143 #endif
144 namespace mindspore {
145 namespace opt {
146 namespace {
AddAscendIRFusionRulesPass(PassManager * ir_fusion_pm)147 void AddAscendIRFusionRulesPass(PassManager *ir_fusion_pm) {
148   MS_EXCEPTION_IF_NULL(ir_fusion_pm);
149   ir_fusion_pm->AddPass(std::make_shared<LambUpdateWithLRRuleFusion>());
150   ir_fusion_pm->AddPass(std::make_shared<LambNextMVWithDecayRuleCond1>());
151   ir_fusion_pm->AddPass(std::make_shared<LambNextMVWithDecayRuleCond2>());
152   ir_fusion_pm->AddPass(std::make_shared<LambNextMVWithDecayRuleCond3>());
153   ir_fusion_pm->AddPass(std::make_shared<LambNextMVWithDecayRuleCond4>());
154   ir_fusion_pm->AddPass(std::make_shared<LambNextMVRuleCond1>());
155   ir_fusion_pm->AddPass(std::make_shared<LambNextMVRuleCond2>());
156   ir_fusion_pm->AddPass(std::make_shared<LambNextMVRuleCond3>());
157   ir_fusion_pm->AddPass(std::make_shared<LambNextMVRuleCond4>());
158   ir_fusion_pm->AddPass(std::make_shared<LambNextRightRule>());
159   ir_fusion_pm->AddPass(std::make_shared<LambUpdateWithLrV2>());
160   ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneCond1Fusion>());
161   ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneCond2Fusion>());
162   ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneCond3Fusion>());
163   ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneCond4Fusion>());
164   ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneWithDecayRuleCond1>());
165   ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneWithDecayRuleCond2>());
166   ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneWithDecayRuleCond3>());
167   ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneWithDecayRuleCond4>());
168   ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneWithDecayRuleCond5>());
169   ir_fusion_pm->AddPass(std::make_shared<ClipByNormNoDivSquareSumFusion>());
170   ir_fusion_pm->AddPass(std::make_shared<SquareSumFusion>());
171   ir_fusion_pm->AddPass(std::make_shared<ClipByValueFusion>());
172   ir_fusion_pm->AddPass(std::make_shared<PReluFusion>());
173 }
174 
AddAscendIRFusionPass(PassManager * ir_fusion_pm)175 void AddAscendIRFusionPass(PassManager *ir_fusion_pm) {
176   MS_EXCEPTION_IF_NULL(ir_fusion_pm);
177   ir_fusion_pm->AddPass(std::make_shared<SingleBatchNormFission>());
178   ir_fusion_pm->AddPass(std::make_shared<BatchNorm2BNInfer>());
179   ir_fusion_pm->AddPass(std::make_shared<BatchNormGrad2BNInferGrad>());
180   ir_fusion_pm->AddPass(std::make_shared<BatchNormGradInferFission>());
181   ir_fusion_pm->AddPass(std::make_shared<GetitemTuple>());
182   ir_fusion_pm->AddPass(std::make_shared<SoftmaxGradExtFusion>());
183   ir_fusion_pm->AddPass(std::make_shared<SoftmaxGradExtFusionV2>());
184   ir_fusion_pm->AddPass(std::make_shared<SoftmaxGradExtFusionV3>());
185   ir_fusion_pm->AddPass(std::make_shared<ConfusionMulGradFusion>());
186   ir_fusion_pm->AddPass(std::make_shared<ConfusionSoftmaxGradRule>());
187   ir_fusion_pm->AddPass(std::make_shared<ReshapeTransposeFusion>());
188   ir_fusion_pm->AddPass(std::make_shared<TransposeReshapeFusion>());
189   ir_fusion_pm->AddPass(std::make_shared<TopKSplit>());
190   ir_fusion_pm->AddPass(std::make_shared<LinSpaceFission>());
191   ir_fusion_pm->AddPass(std::make_shared<DiagFission>());
192   ir_fusion_pm->AddPass(std::make_shared<DiagPartFission>());
193   ir_fusion_pm->AddPass(std::make_shared<MaxPool3DGradGradFission>());
194   ir_fusion_pm->AddPass(std::make_shared<AvgPool3DFusion>());
195   ir_fusion_pm->AddPass(std::make_shared<AvgPool3DGradFusion>());
196   ir_fusion_pm->AddPass(std::make_shared<MomentumLossscaleFusion>());
197   ir_fusion_pm->AddPass(std::make_shared<MulAddFusion>());
198   ir_fusion_pm->AddPass(std::make_shared<MulAddNFusion>());
199   ir_fusion_pm->AddPass(std::make_shared<MatmulBiasaddFusion>());
200   ir_fusion_pm->AddPass(std::make_shared<AddnFission>());
201   ir_fusion_pm->AddPass(std::make_shared<DereluFusion>());
202   ir_fusion_pm->AddPass(std::make_shared<TransposeTransDataFusion>());
203   ir_fusion_pm->AddPass(std::make_shared<InsertPlaceholderForDynamicRNN>());
204   ir_fusion_pm->AddPass(std::make_shared<DynamicRnnGradFissionV2>());
205   ir_fusion_pm->AddPass(std::make_shared<SplitFission>());
206   ir_fusion_pm->AddPass(std::make_shared<SplitVFission>());
207   ir_fusion_pm->AddPass(std::make_shared<SpaceToDepthSplit>());
208   ir_fusion_pm->AddPass(std::make_shared<TensorScatterUpdateFission>());
209   ir_fusion_pm->AddPass(std::make_shared<GetitemTuple>());
210   ir_fusion_pm->AddPass(std::make_shared<PackFission>());
211   ir_fusion_pm->AddPass(std::make_shared<ConcatFission>());
212   ir_fusion_pm->AddPass(std::make_shared<ReduceMinFission>());
213   ir_fusion_pm->AddPass(std::make_shared<UnsortSegmentSumFission>());
214   ir_fusion_pm->AddPass(std::make_shared<GatherV2DsFission>());
215   ir_fusion_pm->AddPass(std::make_shared<BCEWithLogitsLossFission>());
216   ir_fusion_pm->AddPass(std::make_shared<CdistFission>());
217   ir_fusion_pm->AddPass(std::make_shared<CdistGradFission>());
218   ir_fusion_pm->AddPass(std::make_shared<BNReduceGradConv2dBackpropFilterFusion>());
219 }
220 }  // namespace
221 
AscendDataLayout(const std::shared_ptr<session::KernelGraph> & kernel_graph)222 void AscendDataLayout(const std::shared_ptr<session::KernelGraph> &kernel_graph) {
223   MS_EXCEPTION_IF_NULL(kernel_graph);
224   auto optimizer = std::make_shared<GraphOptimizer>();
225   auto data_layout_pm = std::make_shared<PassManager>("transop_pm");
226   data_layout_pm->AddPass(std::make_shared<RectifyDoMaskKernelInfo>());
227   data_layout_pm->AddPass(std::make_shared<DynamicRNNGradReformat>());
228   data_layout_pm->AddPass(std::make_shared<ChangeAxisOfReduceKernel>());
229   auto ms_context = MsContext::GetInstance();
230   MS_EXCEPTION_IF_NULL(ms_context);
231   if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode) {
232     data_layout_pm->AddPass(std::make_shared<RunOpInsertTransData>());
233   } else {
234     data_layout_pm->AddPass(std::make_shared<MergeCastToOp>());
235     data_layout_pm->AddPass(std::make_shared<ConvertCastFormat>());
236     data_layout_pm->AddPass(std::make_shared<EraseVisitAttr>());
237     data_layout_pm->AddPass(std::make_shared<InsertTransOp>());
238     data_layout_pm->AddPass(std::make_shared<GetitemTuple>());
239   }
240   data_layout_pm->AddPass(std::make_shared<EraseVisitAttr>());
241   data_layout_pm->AddPass(std::make_shared<AddIoFormatAttrFor3DGraph>());
242   data_layout_pm->AddPass(std::make_shared<CommonSubexpressionElimination>());
243   data_layout_pm->AddPass(std::make_shared<RemoveReshapePair>());
244   data_layout_pm->AddPass(std::make_shared<EliminateRedundantOp>());
245   data_layout_pm->AddPass(std::make_shared<InsertTransposeForDynamicGRUV2>());
246   data_layout_pm->AddPass(std::make_shared<OptimizeDependence>());
247   data_layout_pm->AddPass(std::make_shared<TransDataSplit>());
248   data_layout_pm->AddPass(std::make_shared<EraseVisitAttr>());
249   data_layout_pm->AddPass(std::make_shared<RemoveInternalOutputTransOp>());
250   optimizer->AddPassManager(data_layout_pm);
251   (void)optimizer->Optimize(kernel_graph);
252   kernel_graph->SetExecOrderByDefault();
253 }
254 
AscendMixPrecision(const std::shared_ptr<session::KernelGraph> & kernel_graph)255 void AscendMixPrecision(const std::shared_ptr<session::KernelGraph> &kernel_graph) {
256   MS_EXCEPTION_IF_NULL(kernel_graph);
257   auto optimizer = std::make_shared<GraphOptimizer>();
258   auto mixed_precision_pm = std::make_shared<PassManager>("cast_pm");
259   mixed_precision_pm->AddPass(std::make_shared<InsertCast>());
260   mixed_precision_pm->AddPass(std::make_shared<GetitemTuple>());
261   mixed_precision_pm->AddPass(std::make_shared<CommonSubexpressionElimination>());
262   mixed_precision_pm->AddPass(std::make_shared<EliminateRedundantOp>());
263   mixed_precision_pm->AddPass(std::make_shared<OptimizeDependence>());
264   mixed_precision_pm->AddPass(std::make_shared<EraseVisitAttr>());
265   mixed_precision_pm->AddPass(std::make_shared<DealRefAndSpiltUnSupportedTransdata>());
266   mixed_precision_pm->AddPass(std::make_shared<GetitemTuple>());
267   mixed_precision_pm->AddPass(std::make_shared<MergeCastToOp>());
268   mixed_precision_pm->AddPass(std::make_shared<LayerNormBetaGammaBackpropFusion>());
269   mixed_precision_pm->AddPass(std::make_shared<EraseVisitAttr>());
270   mixed_precision_pm->AddPass(std::make_shared<TransOpFormatRefine>());
271   mixed_precision_pm->AddPass(std::make_shared<EraseVisitAttr>());
272   mixed_precision_pm->AddPass(std::make_shared<TransposedUpdateFusion>());
273   mixed_precision_pm->AddPass(std::make_shared<ConvertUnSupportNodeToAICPU>());
274   mixed_precision_pm->AddPass(std::make_shared<RemoveInternalOutputCast>());
275   optimizer->AddPassManager(mixed_precision_pm);
276   (void)optimizer->Optimize(kernel_graph);
277   kernel_graph->SetExecOrderByDefault();
278 }
279 
AscendBackendIRFusionOptimization(const std::shared_ptr<session::KernelGraph> & kernel_graph)280 void AscendBackendIRFusionOptimization(const std::shared_ptr<session::KernelGraph> &kernel_graph) {
281   auto context_ptr = MsContext::GetInstance();
282   MS_EXCEPTION_IF_NULL(context_ptr);
283 #ifdef ENABLE_DUMP_IR
284   bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
285   if (save_graphs) {
286     std::string file_name = "hwopt_d_ir_fusion_before_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
287     DumpIR(file_name, kernel_graph);
288     DumpIRProto(kernel_graph, "before_hwopt_" + std::to_string(kernel_graph->graph_id()));
289   }
290 #endif
291   auto optimizer = std::make_shared<GraphOptimizer>();
292   auto ir_fusion_pm = std::make_shared<PassManager>("ir_fusion_pm");
293   ir_fusion_pm->AddPass(std::make_shared<BnSplit>());
294   ir_fusion_pm->AddPass(std::make_shared<BnGradSplit>());
295   ir_fusion_pm->AddPass(std::make_shared<SyncBnSplit>());
296   ir_fusion_pm->AddPass(std::make_shared<SyncBnGradSplit>());
297   ir_fusion_pm->AddPass(std::make_shared<LayerNormGradSplit>());
298   ir_fusion_pm->AddPass(std::make_shared<InsertPadForNMSWithMask>());
299   ir_fusion_pm->AddPass(std::make_shared<InsertPlaceholderForDynamicGRUV2>());
300   ir_fusion_pm->AddPass(std::make_shared<DynamicGRUV2GradFission>());
301   AddAscendIRFusionRulesPass(ir_fusion_pm.get());
302   AddAscendIRFusionPass(ir_fusion_pm.get());
303 
304   if (context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK) && context_ptr->get_param<bool>(MS_CTX_ENABLE_LOOP_SINK) &&
305       ConfigManager::GetInstance().iter_num() > 1) {
306     ir_fusion_pm->AddPass(std::make_shared<InsertTensorMoveForGetNext>());
307     ir_fusion_pm->AddPass(std::make_shared<GetitemTuple>());
308     ir_fusion_pm->AddPass(std::make_shared<EraseVisitAttr>());
309   }
310   ir_fusion_pm->AddPass(std::make_shared<InsertTensorMoveForHcclOp>());
311   ir_fusion_pm->AddPass(std::make_shared<InsertTranspose>());
312   ir_fusion_pm->AddPass(std::make_shared<GetitemTuple>());
313   ir_fusion_pm->AddPass(std::make_shared<EraseVisitAttr>());
314   optimizer->AddPassManager(ir_fusion_pm);
315   (void)optimizer->Optimize(kernel_graph);
316   kernel_graph->SetExecOrderByDefault();
317 #ifdef ENABLE_DUMP_IR
318   if (save_graphs) {
319     std::string file_name = "hwopt_d_ir_fusion_after_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
320     DumpIR(file_name, kernel_graph);
321   }
322 #endif
323 }
324 
RunOpAscendBackendIRFusionOptimization(const std::shared_ptr<session::KernelGraph> & kernel_graph)325 void RunOpAscendBackendIRFusionOptimization(const std::shared_ptr<session::KernelGraph> &kernel_graph) {
326   auto context_ptr = MsContext::GetInstance();
327   MS_EXCEPTION_IF_NULL(context_ptr);
328   if (!context_ptr->get_param<bool>(MS_CTX_IR_FUSION_FLAG)) {
329     MS_LOG(INFO) << "IRFusion is not enable, skip";
330     return;
331   }
332 #ifdef ENABLE_DUMP_IR
333   bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
334   if (save_graphs) {
335     DumpIR("hwopt_d_ir_fusion_before.ir", kernel_graph);
336   }
337 #endif
338   auto optimizer = std::make_shared<GraphOptimizer>();
339   auto ir_fusion_pm = std::make_shared<PassManager>("ir_fusion_pm");
340   ir_fusion_pm->AddPass(std::make_shared<InsertPlaceholderForDynamicRNN>());
341   ir_fusion_pm->AddPass(std::make_shared<DynamicGRUV2GradFission>());
342   ir_fusion_pm->AddPass(std::make_shared<InsertPlaceholderForDynamicGRUV2>());
343   ir_fusion_pm->AddPass(std::make_shared<DynamicRnnGradFissionV2>());
344   ir_fusion_pm->AddPass(std::make_shared<SplitFission>());
345   ir_fusion_pm->AddPass(std::make_shared<SplitVFission>());
346   ir_fusion_pm->AddPass(std::make_shared<ConcatFission>());
347   ir_fusion_pm->AddPass(std::make_shared<BnSplit>());
348   ir_fusion_pm->AddPass(std::make_shared<BnGradSplit>());
349   ir_fusion_pm->AddPass(std::make_shared<LayerNormGradSplit>());
350   ir_fusion_pm->AddPass(std::make_shared<TopKSplit>());
351   ir_fusion_pm->AddPass(std::make_shared<LinSpaceFission>());
352   ir_fusion_pm->AddPass(std::make_shared<SpaceToDepthSplit>());
353   ir_fusion_pm->AddPass(std::make_shared<DiagFission>());
354   ir_fusion_pm->AddPass(std::make_shared<DiagPartFission>());
355   ir_fusion_pm->AddPass(std::make_shared<MaxPool3DGradGradFission>());
356   ir_fusion_pm->AddPass(std::make_shared<AvgPool3DFusion>());
357   ir_fusion_pm->AddPass(std::make_shared<AvgPool3DGradFusion>());
358   ir_fusion_pm->AddPass(std::make_shared<AddnFission>());
359   ir_fusion_pm->AddPass(std::make_shared<InsertPadForNMSWithMask>());
360   ir_fusion_pm->AddPass(std::make_shared<TensorScatterUpdateFission>());
361   ir_fusion_pm->AddPass(std::make_shared<EraseVisitAttr>());
362   ir_fusion_pm->AddPass(std::make_shared<CdistFission>());
363   ir_fusion_pm->AddPass(std::make_shared<CdistGradFission>());
364   ir_fusion_pm->AddPass(std::make_shared<BCEWithLogitsLossFission>());
365   ir_fusion_pm->AddPass(std::make_shared<InsertTensorMoveForHcclOp>());
366 
367   optimizer->AddPassManager(ir_fusion_pm);
368   (void)optimizer->Optimize(kernel_graph);
369   kernel_graph->SetExecOrderByDefault();
370 #ifdef ENABLE_DUMP_IR
371   if (save_graphs) {
372     DumpIR("hwopt_d_ir_fusion_after.ir", kernel_graph);
373   }
374 #endif
375 }
376 
RunOpAscendBackendOptimization(const std::shared_ptr<session::KernelGraph> & kernel_graph)377 void RunOpAscendBackendOptimization(const std::shared_ptr<session::KernelGraph> &kernel_graph) {
378   // data layout optimization
379   AscendDataLayout(kernel_graph);
380   // mixed precision optimization
381   AscendMixPrecision(kernel_graph);
382   // other optimization
383   auto optimizer = std::make_shared<GraphOptimizer>();
384   auto other_pm = std::make_shared<PassManager>("other_pm");
385   other_pm->AddPass(std::make_shared<SetFraczGroupAttr>());
386   optimizer->AddPassManager(other_pm);
387   (void)optimizer->Optimize(kernel_graph);
388   kernel_graph->SetExecOrderByDefault();
389 }
390 
AscendBackendOptimization(const std::shared_ptr<session::KernelGraph> & kernel_graph)391 void AscendBackendOptimization(const std::shared_ptr<session::KernelGraph> &kernel_graph) {
392   auto context_ptr = MsContext::GetInstance();
393   MS_EXCEPTION_IF_NULL(context_ptr);
394 #ifdef ENABLE_DUMP_IR
395   bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
396   if (save_graphs) {
397     std::string file_name = "hwopt_d_before_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
398     DumpIR(file_name, kernel_graph);
399   }
400 #endif
401   // data layout optimization
402   AscendDataLayout(kernel_graph);
403   // mixed precision optimization
404   AscendMixPrecision(kernel_graph);
405   // other optimization
406   auto optimizer = std::make_shared<GraphOptimizer>();
407   auto other_pm = std::make_shared<PassManager>("other_pm");
408   other_pm->AddPass(std::make_shared<SendFusion>());
409   other_pm->AddPass(std::make_shared<RecvFusion>());
410   other_pm->AddPass(std::make_shared<AllReduceFusion>());
411   other_pm->AddPass(std::make_shared<AdjustDependForParallelOptimizerRecomputeAllGather>());
412   other_pm->AddPass(std::make_shared<AllGatherFusion>());
413   other_pm->AddPass(std::make_shared<ConcatOutputsForAllGather>());
414   other_pm->AddPass(std::make_shared<InsertDependForAllGather>());
415   other_pm->AddPass(std::make_shared<ReduceScatterFusion>());
416   other_pm->AddPass(std::make_shared<SplitInputsForReduceScatter>());
417   other_pm->AddPass(std::make_shared<BroadcastFusion>());
418   other_pm->AddPass(std::make_shared<InsertTensorMoveForCascade>());
419   other_pm->AddPass(std::make_shared<ParameterTransOpFusion>());
420   other_pm->AddPass(std::make_shared<RefreshParameterFormat>());
421   other_pm->AddPass(std::make_shared<SplitOpOptimizer>());
422   other_pm->AddPass(std::make_shared<SetFraczGroupAttr>());
423   optimizer->AddPassManager(other_pm);
424   (void)optimizer->Optimize(kernel_graph);
425   kernel_graph->SetExecOrderByDefault();
426   // buffer fusion
427   AscendBackendUBFusionOptimization(kernel_graph);
428 
429   // other2 optimization
430   auto optimizer2 = std::make_shared<GraphOptimizer>();
431   auto other2_pm = std::make_shared<PassManager>("other2_pm");
432   other2_pm->AddPass(std::make_shared<GetitemTuple>());
433   other2_pm->AddPass(std::make_shared<CommonSubexpressionElimination>());
434   if (context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK) && context_ptr->get_param<bool>(MS_CTX_ENABLE_LOOP_SINK) &&
435       ConfigManager::GetInstance().iter_num() > 1) {
436     other2_pm->AddPass(std::make_shared<GetnextTensorMoveElimination>());
437   }
438   other2_pm->AddPass(std::make_shared<CheckConsistency>());
439   optimizer2->AddPassManager(other2_pm);
440   (void)optimizer2->Optimize(kernel_graph);
441   kernel_graph->SetExecOrderByDefault();
442 #ifdef ENABLE_DUMP_IR
443   const std::vector<CNodePtr> &exec_order = kernel_graph->execution_order();
444   std::string exec_order_name = "graph_exec_order." + std::to_string(kernel_graph->graph_id());
445   (void)mindspore::RDR::RecordGraphExecOrder(SubModuleId::SM_OPTIMIZER, exec_order_name, exec_order);
446   if (save_graphs) {
447     std::string file_name = "hwopt_d_end_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
448     DumpIR(file_name, kernel_graph, true, kWholeStack);
449     DumpIRProto(kernel_graph, "after_hwopt_" + std::to_string(kernel_graph->graph_id()));
450     kernel_graph->DumpFuncGraph("hwopt_d_end");
451   }
452 #endif
453 }
454 
AscendBackendUBFusionOptimization(const std::shared_ptr<session::KernelGraph> & kernel_graph)455 void AscendBackendUBFusionOptimization(const std::shared_ptr<session::KernelGraph> &kernel_graph) {
456   auto context_ptr = MsContext::GetInstance();
457   MS_EXCEPTION_IF_NULL(context_ptr);
458   if (!context_ptr->get_param<bool>(MS_CTX_IR_FUSION_FLAG)) {
459     MS_LOG(INFO) << "UBFusion is not enable, skip";
460     return;
461   }
462 
463   if (kernel_graph->is_dynamic_shape()) {
464     MS_LOG(WARNING) << "Dynamic shape skip fusion";
465     return;
466   }
467   auto old_build = common::GetEnv("MS_OLD_BUILD_PROCESS");
468   if (old_build.empty()) {
469     auto &build_manager = kernel::ascend::AscendKernelCompileManager::GetInstance();
470     auto pre_build = common::GetEnv("MS_DISABLE_PREBUILD");
471     if (pre_build.empty()) {
472       build_manager.AscendPreBuild(kernel_graph);
473     }
474   }
475 #ifdef ENABLE_DUMP_IR
476   bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
477   if (save_graphs) {
478     std::string file_name = "hwopt_d_ub_fusion_before_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
479     DumpIR(file_name, kernel_graph);
480   }
481 #endif
482   auto fusion_id_allocator = std::make_shared<FusionIdAllocator>();
483   MS_EXCEPTION_IF_NULL(fusion_id_allocator);
484   fusion_id_allocator->Init();
485   auto optimizer = std::make_shared<GraphOptimizer>();
486   auto ub_fusion_pm = std::make_shared<PassManager>("ub_fusion_pm");
487   ub_fusion_pm->AddPass(std::make_shared<Conv2DBackpropEltwiseEltwiseFusionPass>(fusion_id_allocator));
488   ub_fusion_pm->AddPass(std::make_shared<Conv2DBackpropEltwiseFusionPass>(fusion_id_allocator));
489   ub_fusion_pm->AddPass(std::make_shared<ConvBnReduceFusionPass>(fusion_id_allocator));
490   ub_fusion_pm->AddPass(std::make_shared<ConvSingleInFusionPass>(fusion_id_allocator));
491   ub_fusion_pm->AddPass(std::make_shared<BnupdateEltwiseFusionPass>(fusion_id_allocator));
492   ub_fusion_pm->AddPass(std::make_shared<BnupdateEltwiseEltwiseFusionPass>(fusion_id_allocator));
493   ub_fusion_pm->AddPass(std::make_shared<MatmulEltwiseFusionPass>(fusion_id_allocator));
494   ub_fusion_pm->AddPass(std::make_shared<ConvDoubleInFusionPass>(fusion_id_allocator));
495   ub_fusion_pm->AddPass(std::make_shared<ReduceEltwiseFusionPass>(fusion_id_allocator));
496   ub_fusion_pm->AddPass(std::make_shared<SegmentEltwiseFusionPass>(fusion_id_allocator));
497   ub_fusion_pm->AddPass(std::make_shared<MultiOutputFusionPass>(fusion_id_allocator));
498   if (!context::GraphKernelFlags::GetInstance().IsEnableGraphKernel()) {
499     ub_fusion_pm->AddPass(std::make_shared<EltwiseFusionPass>(fusion_id_allocator));
500   }
501   ub_fusion_pm->AddPass(std::make_shared<DepthwiseConvEltwiseFusionPass>(fusion_id_allocator));
502   ub_fusion_pm->AddPass(std::make_shared<MatmulConfusionTranposeFusionPass>(fusion_id_allocator));
503   if (!context::GraphKernelFlags::GetInstance().IsEnableGraphKernel()) {
504     ub_fusion_pm->AddPass(std::make_shared<BatchMatmulFusedMulAddFusionPass>(fusion_id_allocator));
505   }
506   ub_fusion_pm->AddPass(std::make_shared<UbPatternFusion>());
507   optimizer->AddPassManager(ub_fusion_pm);
508   (void)optimizer->Optimize(kernel_graph);
509   kernel_graph->SetExecOrderByDefault();
510 #ifdef ENABLE_DUMP_IR
511   if (save_graphs) {
512     std::string file_name = "hwopt_d_ub_fusion_after_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
513     DumpIR(file_name, kernel_graph);
514   }
515 #endif
516 }
517 }  // namespace opt
518 }  // namespace mindspore
519