1 /**
2 * Copyright 2019-2021 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "backend/optimizer/ascend/ascend_backend_optimization.h"
17 #include <algorithm>
18 #include <list>
19 #include <memory>
20 #include <vector>
21 #include <string>
22 #include "backend/optimizer/common/optimizer.h"
23 #include "backend/optimizer/ascend/ir_fission/dynamic_rnn_grad_fission_v2.h"
24 #include "backend/optimizer/ascend/ir_fission/dynamic_gru_v2_grad_fission.h"
25 #include "backend/optimizer/ascend/ir_fission/bn_split.h"
26 #include "backend/optimizer/ascend/ir_fission/bn_grad_split.h"
27 #include "backend/optimizer/ascend/ir_fission/batch_norm_grad_split.h"
28 #include "backend/optimizer/ascend/ir_fission/batch_norm_bert_fission.h"
29 #include "backend/optimizer/ascend/ir_fission/single_batch_norm_fission.h"
30 #include "backend/optimizer/ascend/ir_fission/tensor_scatter_update_fission.h"
31 #include "backend/optimizer/ascend/ir_fission/reduce_min_fission.h"
32 #include "backend/optimizer/ascend/ir_fusion/fused_batch_norm_fusion.h"
33 #include "backend/optimizer/ascend/ir_fission/layer_norm_grad_split.h"
34 #include "backend/optimizer/ascend/ir_fission/unsorted_segment_sum_fission.h"
35 #include "backend/optimizer/ascend/ir_fission/gather_v2_ds_fission.h"
36 #include "backend/optimizer/ascend/ir_fission/bce_with_logits_loss_fission.h"
37 #include "backend/optimizer/ascend/ir_fission/cdist_fission.h"
38 #include "backend/optimizer/pass/communication_op_fusion.h"
39 #include "backend/optimizer/ascend/ir_fusion/square_sum_fusion.h"
40 #include "backend/optimizer/ascend/ir_fusion/clip_by_norm_no_div_square_sum_fusion.h"
41 #include "backend/optimizer/ascend/ir_fusion/lamb_update_with_lr_rule_fusion.h"
42 #include "backend/optimizer/ascend/ir_fusion/prelu_fusion.h"
43 #include "backend/optimizer/ascend/ir_fusion/clip_by_value_fusion.h"
44 #include "backend/optimizer/ascend/ir_fusion/confusion_softmax_grad_rule.h"
45 #include "backend/optimizer/ascend/ir_fusion/lamb_next_mv_rule.h"
46 #include "backend/optimizer/ascend/ir_fusion/lamb_next_mv_with_decay_rule.h"
47 #include "backend/optimizer/ascend/ir_fusion/lamb_next_right_rule.h"
48 #include "backend/optimizer/ascend/ir_fusion/lamb_update_with_lr_v2.h"
49 #include "backend/optimizer/ascend/ir_fusion/layer_norm_beta_gamma_backprop_fusion.h"
50 #include "backend/optimizer/ascend/ir_fusion/reshape_transpose_fusion.h"
51 #include "backend/optimizer/ascend/ir_fusion/transpose_reshape_fusion.h"
52 #include "backend/optimizer/ascend/ir_fusion/adam_apply_one_fusion.h"
53 #include "backend/optimizer/ascend/ir_fusion/adam_apply_one_with_decay_rule.h"
54 #include "backend/optimizer/ascend/ir_fusion/parameter_and_transop_fusion.h"
55 #include "backend/optimizer/ascend/ir_fusion/refresh_parameter_format.h"
56 #include "backend/optimizer/ascend/ir_fusion/transpose_transdata_fusion.h"
57 #include "backend/optimizer/ascend/ir_fission/transdata_split.h"
58 #include "backend/optimizer/ascend/ir_fission/topk_split.h"
59 #include "backend/optimizer/ascend/ir_fission/lin_space_fission.h"
60 #include "backend/optimizer/ascend/ir_fission/space_to_depth_split.h"
61 #include "backend/optimizer/ascend/ir_fission/diag_fission.h"
62 #include "backend/optimizer/ascend/ir_fission/diag_part_fission.h"
63 #include "backend/optimizer/ascend/ir_fission/max_pool3d_grad_grad_fission.h"
64 #include "backend/optimizer/ascend/ir_fusion/avgpool_3d_fusion.h"
65 #include "backend/optimizer/ascend/ir_fusion/avgpool_3d_grad_fusion.h"
66 #include "backend/optimizer/ascend/ir_fusion/momentum_lossscale_fusion.h"
67 #include "backend/optimizer/ascend/ir_fusion/mul_add_fusion.h"
68 #include "backend/optimizer/ascend/ir_fusion/mul_addn_fusion.h"
69 #include "backend/optimizer/ascend/ir_fusion/matmul_biasadd_fusion.h"
70 #include "backend/optimizer/ascend/ir_fusion/remove_reshape_pair.h"
71 #include "backend/optimizer/ascend/ir_fusion/derelu_fusion.h"
72 #include "backend/optimizer/ascend/ir_fusion/batchnorm_to_bninfer.h"
73 #include "backend/optimizer/ascend/ir_fusion/batchnormgrad_to_bninfergrad.h"
74 #include "backend/optimizer/ascend/ir_fusion/confusion_mul_grad_fusion.h"
75 #include "backend/optimizer/ascend/ir_fusion/softmax_grad_ext_fusion.h"
76 #include "backend/optimizer/ascend/ir_fusion/bn_reduce_grad_conv2d_backprop_filter_fusion.h"
77 #include "backend/optimizer/ascend/ir_fusion/transposed_update_fusion.h"
78 #include "backend/optimizer/ascend/format_type/insert_trans_op.h"
79 #include "backend/optimizer/ascend/format_type/trans_op_format_refine.h"
80 #include "backend/optimizer/ascend/format_type/dynamic_rnn_grad_reformat.h"
81 #include "backend/optimizer/ascend/format_type/insert_transpose_for_basiclstm_op.h"
82 #include "backend/optimizer/ascend/format_type/insert_transpose_for_dyanmic_gru_v2.h"
83 #include "backend/optimizer/ascend/format_type/rectify_do_mask_kernel_info.h"
84 #include "backend/optimizer/ascend/format_type/change_axis_of_reduce_kernel.h"
85 #include "backend/optimizer/ascend/format_type/convert_cast_format.h"
86 #include "backend/optimizer/ascend/format_type/set_fracz_group_attr.h"
87 #include "backend/optimizer/pass/getitem_tuple.h"
88 #include "backend/optimizer/pass/optimize_dependence.h"
89 #include "backend/optimizer/pass/erase_visit_attr.h"
90 #include "backend/optimizer/ascend/format_type/insert_cast.h"
91 #include "backend/optimizer/ascend/format_type/convert_unsupported_transnode_to_aicpu.h"
92 #include "backend/optimizer/pass/eliminate_redundant_op.h"
93 #include "backend/optimizer/pass/common_subexpression_elimination.h"
94 #include "backend/optimizer/ascend/format_type/merge_cast_to_op.h"
95 #include "backend/optimizer/ascend/format_type/check_consistency.h"
96 #include "backend/optimizer/ascend/buffer_fusion/ub_pattern_fusion.h"
97 #include "backend/optimizer/ascend/buffer_fusion/eltwise_fusion_pass.h"
98 #include "backend/optimizer/ascend/buffer_fusion/multi_output_fusion_pass.h"
99 #include "backend/optimizer/ascend/buffer_fusion/conv2dbackprop_eltwise_eltwise_fusion_pass.h"
100 #include "backend/optimizer/ascend/buffer_fusion/conv2dbackprop_eltwise_fusion_pass.h"
101 #include "backend/optimizer/ascend/buffer_fusion/conv_single_in_fusion_pass.h"
102 #include "backend/optimizer/ascend/buffer_fusion/conv_double_in_fusion_pass.h"
103 #include "backend/optimizer/ascend/buffer_fusion/matmul_eltwise_fusion_pass.h"
104 #include "backend/optimizer/ascend/buffer_fusion/matmul_confusiontranspose_fusion_pass.h"
105 #include "backend/optimizer/ascend/buffer_fusion/batchmatmul_fusedmuladd_fusion_pass.h"
106 #include "backend/optimizer/ascend/buffer_fusion/depthwiseconv_eltwise_fusion_pass.h"
107 #include "backend/optimizer/ascend/buffer_fusion/bnupdate_eltwise_fusion_pass.h"
108 #include "backend/optimizer/ascend/buffer_fusion/bnupdate_eltwise_eltwise_fusion_pass.h"
109 #include "backend/optimizer/ascend/buffer_fusion/conv_bnreduce_fusion_pass.h"
110 #include "backend/optimizer/ascend/buffer_fusion/reduce_eltwise_fusion_pass.h"
111 #include "backend/optimizer/ascend/buffer_fusion/segment_eltwise_fusion_pass.h"
112 #include "backend/optimizer/ascend/format_type/deal_ref_and_split_unsupported_transdata.h"
113 #include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_hccl_op.h"
114 #include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_cascade.h"
115 #include "backend/optimizer/ascend/enhancer/insert_pad_for_nms_with_mask.h"
116 #include "backend/optimizer/ascend/format_type/insert_transdata_for_runop.h"
117 #include "backend/optimizer/ascend/enhancer/getnext_tensor_move_elimination.h"
118 #include "backend/optimizer/ascend/ir_fission/addn_fission.h"
119 #include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_getnext.h"
120 #include "backend/optimizer/ascend/ir_fission/batch_norm_grad_infer_fission.h"
121 #include "backend/optimizer/ascend/ir_fission/split_fission.h"
122 #include "backend/optimizer/ascend/ir_fission/splitv_fission.h"
123 #include "backend/optimizer/ascend/ir_fusion/add_input_to_output.h"
124 #include "backend/optimizer/ascend/format_type/remove_internal_output.h"
125 #include "backend/optimizer/ascend/ir_fission/concat_fission.h"
126 #include "backend/optimizer/ascend/ir_fission/pack_fission.h"
127 #include "backend/optimizer/ascend/enhancer/concat_outputs_for_all_gather.h"
128 #include "backend/optimizer/ascend/enhancer/insert_depend_for_all_gather.h"
129 #include "backend/optimizer/ascend/enhancer/split_inputs_for_reduce_scatter.h"
130 #include "backend/optimizer/ascend/enhancer/add_placeholder_for_dynamic_rnn.h"
131 #include "backend/optimizer/ascend/enhancer/add_placeholder_for_dynamic_gru.h"
132 #include "backend/optimizer/ascend/enhancer/add_attr_for_3d_graph.h"
133 #include "backend/optimizer/ascend/enhancer/split_n_optimizer.h"
134 #include "backend/optimizer/pass/adjust_depend_for_parallel_optimizer_recompute_all_gather.h"
135 #include "backend/kernel_compiler/tbe/ascend_kernel_compile.h"
136 #include "utils/ms_context.h"
137 #include "utils/config_manager.h"
138 #include "utils/context/graph_kernel_flags.h"
139 #include "debug/anf_ir_dump.h"
140 #include "debug/dump_proto.h"
141 #ifdef ENABLE_DUMP_IR
142 #include "debug/rdr/running_data_recorder.h"
143 #endif
144 namespace mindspore {
145 namespace opt {
146 namespace {
AddAscendIRFusionRulesPass(PassManager * ir_fusion_pm)147 void AddAscendIRFusionRulesPass(PassManager *ir_fusion_pm) {
148 MS_EXCEPTION_IF_NULL(ir_fusion_pm);
149 ir_fusion_pm->AddPass(std::make_shared<LambUpdateWithLRRuleFusion>());
150 ir_fusion_pm->AddPass(std::make_shared<LambNextMVWithDecayRuleCond1>());
151 ir_fusion_pm->AddPass(std::make_shared<LambNextMVWithDecayRuleCond2>());
152 ir_fusion_pm->AddPass(std::make_shared<LambNextMVWithDecayRuleCond3>());
153 ir_fusion_pm->AddPass(std::make_shared<LambNextMVWithDecayRuleCond4>());
154 ir_fusion_pm->AddPass(std::make_shared<LambNextMVRuleCond1>());
155 ir_fusion_pm->AddPass(std::make_shared<LambNextMVRuleCond2>());
156 ir_fusion_pm->AddPass(std::make_shared<LambNextMVRuleCond3>());
157 ir_fusion_pm->AddPass(std::make_shared<LambNextMVRuleCond4>());
158 ir_fusion_pm->AddPass(std::make_shared<LambNextRightRule>());
159 ir_fusion_pm->AddPass(std::make_shared<LambUpdateWithLrV2>());
160 ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneCond1Fusion>());
161 ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneCond2Fusion>());
162 ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneCond3Fusion>());
163 ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneCond4Fusion>());
164 ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneWithDecayRuleCond1>());
165 ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneWithDecayRuleCond2>());
166 ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneWithDecayRuleCond3>());
167 ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneWithDecayRuleCond4>());
168 ir_fusion_pm->AddPass(std::make_shared<AdamApplyOneWithDecayRuleCond5>());
169 ir_fusion_pm->AddPass(std::make_shared<ClipByNormNoDivSquareSumFusion>());
170 ir_fusion_pm->AddPass(std::make_shared<SquareSumFusion>());
171 ir_fusion_pm->AddPass(std::make_shared<ClipByValueFusion>());
172 ir_fusion_pm->AddPass(std::make_shared<PReluFusion>());
173 }
174
AddAscendIRFusionPass(PassManager * ir_fusion_pm)175 void AddAscendIRFusionPass(PassManager *ir_fusion_pm) {
176 MS_EXCEPTION_IF_NULL(ir_fusion_pm);
177 ir_fusion_pm->AddPass(std::make_shared<SingleBatchNormFission>());
178 ir_fusion_pm->AddPass(std::make_shared<BatchNorm2BNInfer>());
179 ir_fusion_pm->AddPass(std::make_shared<BatchNormGrad2BNInferGrad>());
180 ir_fusion_pm->AddPass(std::make_shared<BatchNormGradInferFission>());
181 ir_fusion_pm->AddPass(std::make_shared<GetitemTuple>());
182 ir_fusion_pm->AddPass(std::make_shared<SoftmaxGradExtFusion>());
183 ir_fusion_pm->AddPass(std::make_shared<SoftmaxGradExtFusionV2>());
184 ir_fusion_pm->AddPass(std::make_shared<SoftmaxGradExtFusionV3>());
185 ir_fusion_pm->AddPass(std::make_shared<ConfusionMulGradFusion>());
186 ir_fusion_pm->AddPass(std::make_shared<ConfusionSoftmaxGradRule>());
187 ir_fusion_pm->AddPass(std::make_shared<ReshapeTransposeFusion>());
188 ir_fusion_pm->AddPass(std::make_shared<TransposeReshapeFusion>());
189 ir_fusion_pm->AddPass(std::make_shared<TopKSplit>());
190 ir_fusion_pm->AddPass(std::make_shared<LinSpaceFission>());
191 ir_fusion_pm->AddPass(std::make_shared<DiagFission>());
192 ir_fusion_pm->AddPass(std::make_shared<DiagPartFission>());
193 ir_fusion_pm->AddPass(std::make_shared<MaxPool3DGradGradFission>());
194 ir_fusion_pm->AddPass(std::make_shared<AvgPool3DFusion>());
195 ir_fusion_pm->AddPass(std::make_shared<AvgPool3DGradFusion>());
196 ir_fusion_pm->AddPass(std::make_shared<MomentumLossscaleFusion>());
197 ir_fusion_pm->AddPass(std::make_shared<MulAddFusion>());
198 ir_fusion_pm->AddPass(std::make_shared<MulAddNFusion>());
199 ir_fusion_pm->AddPass(std::make_shared<MatmulBiasaddFusion>());
200 ir_fusion_pm->AddPass(std::make_shared<AddnFission>());
201 ir_fusion_pm->AddPass(std::make_shared<DereluFusion>());
202 ir_fusion_pm->AddPass(std::make_shared<TransposeTransDataFusion>());
203 ir_fusion_pm->AddPass(std::make_shared<InsertPlaceholderForDynamicRNN>());
204 ir_fusion_pm->AddPass(std::make_shared<DynamicRnnGradFissionV2>());
205 ir_fusion_pm->AddPass(std::make_shared<SplitFission>());
206 ir_fusion_pm->AddPass(std::make_shared<SplitVFission>());
207 ir_fusion_pm->AddPass(std::make_shared<SpaceToDepthSplit>());
208 ir_fusion_pm->AddPass(std::make_shared<TensorScatterUpdateFission>());
209 ir_fusion_pm->AddPass(std::make_shared<GetitemTuple>());
210 ir_fusion_pm->AddPass(std::make_shared<PackFission>());
211 ir_fusion_pm->AddPass(std::make_shared<ConcatFission>());
212 ir_fusion_pm->AddPass(std::make_shared<ReduceMinFission>());
213 ir_fusion_pm->AddPass(std::make_shared<UnsortSegmentSumFission>());
214 ir_fusion_pm->AddPass(std::make_shared<GatherV2DsFission>());
215 ir_fusion_pm->AddPass(std::make_shared<BCEWithLogitsLossFission>());
216 ir_fusion_pm->AddPass(std::make_shared<CdistFission>());
217 ir_fusion_pm->AddPass(std::make_shared<CdistGradFission>());
218 ir_fusion_pm->AddPass(std::make_shared<BNReduceGradConv2dBackpropFilterFusion>());
219 }
220 } // namespace
221
AscendDataLayout(const std::shared_ptr<session::KernelGraph> & kernel_graph)222 void AscendDataLayout(const std::shared_ptr<session::KernelGraph> &kernel_graph) {
223 MS_EXCEPTION_IF_NULL(kernel_graph);
224 auto optimizer = std::make_shared<GraphOptimizer>();
225 auto data_layout_pm = std::make_shared<PassManager>("transop_pm");
226 data_layout_pm->AddPass(std::make_shared<RectifyDoMaskKernelInfo>());
227 data_layout_pm->AddPass(std::make_shared<DynamicRNNGradReformat>());
228 data_layout_pm->AddPass(std::make_shared<ChangeAxisOfReduceKernel>());
229 auto ms_context = MsContext::GetInstance();
230 MS_EXCEPTION_IF_NULL(ms_context);
231 if (ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode) {
232 data_layout_pm->AddPass(std::make_shared<RunOpInsertTransData>());
233 } else {
234 data_layout_pm->AddPass(std::make_shared<MergeCastToOp>());
235 data_layout_pm->AddPass(std::make_shared<ConvertCastFormat>());
236 data_layout_pm->AddPass(std::make_shared<EraseVisitAttr>());
237 data_layout_pm->AddPass(std::make_shared<InsertTransOp>());
238 data_layout_pm->AddPass(std::make_shared<GetitemTuple>());
239 }
240 data_layout_pm->AddPass(std::make_shared<EraseVisitAttr>());
241 data_layout_pm->AddPass(std::make_shared<AddIoFormatAttrFor3DGraph>());
242 data_layout_pm->AddPass(std::make_shared<CommonSubexpressionElimination>());
243 data_layout_pm->AddPass(std::make_shared<RemoveReshapePair>());
244 data_layout_pm->AddPass(std::make_shared<EliminateRedundantOp>());
245 data_layout_pm->AddPass(std::make_shared<InsertTransposeForDynamicGRUV2>());
246 data_layout_pm->AddPass(std::make_shared<OptimizeDependence>());
247 data_layout_pm->AddPass(std::make_shared<TransDataSplit>());
248 data_layout_pm->AddPass(std::make_shared<EraseVisitAttr>());
249 data_layout_pm->AddPass(std::make_shared<RemoveInternalOutputTransOp>());
250 optimizer->AddPassManager(data_layout_pm);
251 (void)optimizer->Optimize(kernel_graph);
252 kernel_graph->SetExecOrderByDefault();
253 }
254
AscendMixPrecision(const std::shared_ptr<session::KernelGraph> & kernel_graph)255 void AscendMixPrecision(const std::shared_ptr<session::KernelGraph> &kernel_graph) {
256 MS_EXCEPTION_IF_NULL(kernel_graph);
257 auto optimizer = std::make_shared<GraphOptimizer>();
258 auto mixed_precision_pm = std::make_shared<PassManager>("cast_pm");
259 mixed_precision_pm->AddPass(std::make_shared<InsertCast>());
260 mixed_precision_pm->AddPass(std::make_shared<GetitemTuple>());
261 mixed_precision_pm->AddPass(std::make_shared<CommonSubexpressionElimination>());
262 mixed_precision_pm->AddPass(std::make_shared<EliminateRedundantOp>());
263 mixed_precision_pm->AddPass(std::make_shared<OptimizeDependence>());
264 mixed_precision_pm->AddPass(std::make_shared<EraseVisitAttr>());
265 mixed_precision_pm->AddPass(std::make_shared<DealRefAndSpiltUnSupportedTransdata>());
266 mixed_precision_pm->AddPass(std::make_shared<GetitemTuple>());
267 mixed_precision_pm->AddPass(std::make_shared<MergeCastToOp>());
268 mixed_precision_pm->AddPass(std::make_shared<LayerNormBetaGammaBackpropFusion>());
269 mixed_precision_pm->AddPass(std::make_shared<EraseVisitAttr>());
270 mixed_precision_pm->AddPass(std::make_shared<TransOpFormatRefine>());
271 mixed_precision_pm->AddPass(std::make_shared<EraseVisitAttr>());
272 mixed_precision_pm->AddPass(std::make_shared<TransposedUpdateFusion>());
273 mixed_precision_pm->AddPass(std::make_shared<ConvertUnSupportNodeToAICPU>());
274 mixed_precision_pm->AddPass(std::make_shared<RemoveInternalOutputCast>());
275 optimizer->AddPassManager(mixed_precision_pm);
276 (void)optimizer->Optimize(kernel_graph);
277 kernel_graph->SetExecOrderByDefault();
278 }
279
AscendBackendIRFusionOptimization(const std::shared_ptr<session::KernelGraph> & kernel_graph)280 void AscendBackendIRFusionOptimization(const std::shared_ptr<session::KernelGraph> &kernel_graph) {
281 auto context_ptr = MsContext::GetInstance();
282 MS_EXCEPTION_IF_NULL(context_ptr);
283 #ifdef ENABLE_DUMP_IR
284 bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
285 if (save_graphs) {
286 std::string file_name = "hwopt_d_ir_fusion_before_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
287 DumpIR(file_name, kernel_graph);
288 DumpIRProto(kernel_graph, "before_hwopt_" + std::to_string(kernel_graph->graph_id()));
289 }
290 #endif
291 auto optimizer = std::make_shared<GraphOptimizer>();
292 auto ir_fusion_pm = std::make_shared<PassManager>("ir_fusion_pm");
293 ir_fusion_pm->AddPass(std::make_shared<BnSplit>());
294 ir_fusion_pm->AddPass(std::make_shared<BnGradSplit>());
295 ir_fusion_pm->AddPass(std::make_shared<SyncBnSplit>());
296 ir_fusion_pm->AddPass(std::make_shared<SyncBnGradSplit>());
297 ir_fusion_pm->AddPass(std::make_shared<LayerNormGradSplit>());
298 ir_fusion_pm->AddPass(std::make_shared<InsertPadForNMSWithMask>());
299 ir_fusion_pm->AddPass(std::make_shared<InsertPlaceholderForDynamicGRUV2>());
300 ir_fusion_pm->AddPass(std::make_shared<DynamicGRUV2GradFission>());
301 AddAscendIRFusionRulesPass(ir_fusion_pm.get());
302 AddAscendIRFusionPass(ir_fusion_pm.get());
303
304 if (context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK) && context_ptr->get_param<bool>(MS_CTX_ENABLE_LOOP_SINK) &&
305 ConfigManager::GetInstance().iter_num() > 1) {
306 ir_fusion_pm->AddPass(std::make_shared<InsertTensorMoveForGetNext>());
307 ir_fusion_pm->AddPass(std::make_shared<GetitemTuple>());
308 ir_fusion_pm->AddPass(std::make_shared<EraseVisitAttr>());
309 }
310 ir_fusion_pm->AddPass(std::make_shared<InsertTensorMoveForHcclOp>());
311 ir_fusion_pm->AddPass(std::make_shared<InsertTranspose>());
312 ir_fusion_pm->AddPass(std::make_shared<GetitemTuple>());
313 ir_fusion_pm->AddPass(std::make_shared<EraseVisitAttr>());
314 optimizer->AddPassManager(ir_fusion_pm);
315 (void)optimizer->Optimize(kernel_graph);
316 kernel_graph->SetExecOrderByDefault();
317 #ifdef ENABLE_DUMP_IR
318 if (save_graphs) {
319 std::string file_name = "hwopt_d_ir_fusion_after_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
320 DumpIR(file_name, kernel_graph);
321 }
322 #endif
323 }
324
RunOpAscendBackendIRFusionOptimization(const std::shared_ptr<session::KernelGraph> & kernel_graph)325 void RunOpAscendBackendIRFusionOptimization(const std::shared_ptr<session::KernelGraph> &kernel_graph) {
326 auto context_ptr = MsContext::GetInstance();
327 MS_EXCEPTION_IF_NULL(context_ptr);
328 if (!context_ptr->get_param<bool>(MS_CTX_IR_FUSION_FLAG)) {
329 MS_LOG(INFO) << "IRFusion is not enable, skip";
330 return;
331 }
332 #ifdef ENABLE_DUMP_IR
333 bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
334 if (save_graphs) {
335 DumpIR("hwopt_d_ir_fusion_before.ir", kernel_graph);
336 }
337 #endif
338 auto optimizer = std::make_shared<GraphOptimizer>();
339 auto ir_fusion_pm = std::make_shared<PassManager>("ir_fusion_pm");
340 ir_fusion_pm->AddPass(std::make_shared<InsertPlaceholderForDynamicRNN>());
341 ir_fusion_pm->AddPass(std::make_shared<DynamicGRUV2GradFission>());
342 ir_fusion_pm->AddPass(std::make_shared<InsertPlaceholderForDynamicGRUV2>());
343 ir_fusion_pm->AddPass(std::make_shared<DynamicRnnGradFissionV2>());
344 ir_fusion_pm->AddPass(std::make_shared<SplitFission>());
345 ir_fusion_pm->AddPass(std::make_shared<SplitVFission>());
346 ir_fusion_pm->AddPass(std::make_shared<ConcatFission>());
347 ir_fusion_pm->AddPass(std::make_shared<BnSplit>());
348 ir_fusion_pm->AddPass(std::make_shared<BnGradSplit>());
349 ir_fusion_pm->AddPass(std::make_shared<LayerNormGradSplit>());
350 ir_fusion_pm->AddPass(std::make_shared<TopKSplit>());
351 ir_fusion_pm->AddPass(std::make_shared<LinSpaceFission>());
352 ir_fusion_pm->AddPass(std::make_shared<SpaceToDepthSplit>());
353 ir_fusion_pm->AddPass(std::make_shared<DiagFission>());
354 ir_fusion_pm->AddPass(std::make_shared<DiagPartFission>());
355 ir_fusion_pm->AddPass(std::make_shared<MaxPool3DGradGradFission>());
356 ir_fusion_pm->AddPass(std::make_shared<AvgPool3DFusion>());
357 ir_fusion_pm->AddPass(std::make_shared<AvgPool3DGradFusion>());
358 ir_fusion_pm->AddPass(std::make_shared<AddnFission>());
359 ir_fusion_pm->AddPass(std::make_shared<InsertPadForNMSWithMask>());
360 ir_fusion_pm->AddPass(std::make_shared<TensorScatterUpdateFission>());
361 ir_fusion_pm->AddPass(std::make_shared<EraseVisitAttr>());
362 ir_fusion_pm->AddPass(std::make_shared<CdistFission>());
363 ir_fusion_pm->AddPass(std::make_shared<CdistGradFission>());
364 ir_fusion_pm->AddPass(std::make_shared<BCEWithLogitsLossFission>());
365 ir_fusion_pm->AddPass(std::make_shared<InsertTensorMoveForHcclOp>());
366
367 optimizer->AddPassManager(ir_fusion_pm);
368 (void)optimizer->Optimize(kernel_graph);
369 kernel_graph->SetExecOrderByDefault();
370 #ifdef ENABLE_DUMP_IR
371 if (save_graphs) {
372 DumpIR("hwopt_d_ir_fusion_after.ir", kernel_graph);
373 }
374 #endif
375 }
376
RunOpAscendBackendOptimization(const std::shared_ptr<session::KernelGraph> & kernel_graph)377 void RunOpAscendBackendOptimization(const std::shared_ptr<session::KernelGraph> &kernel_graph) {
378 // data layout optimization
379 AscendDataLayout(kernel_graph);
380 // mixed precision optimization
381 AscendMixPrecision(kernel_graph);
382 // other optimization
383 auto optimizer = std::make_shared<GraphOptimizer>();
384 auto other_pm = std::make_shared<PassManager>("other_pm");
385 other_pm->AddPass(std::make_shared<SetFraczGroupAttr>());
386 optimizer->AddPassManager(other_pm);
387 (void)optimizer->Optimize(kernel_graph);
388 kernel_graph->SetExecOrderByDefault();
389 }
390
AscendBackendOptimization(const std::shared_ptr<session::KernelGraph> & kernel_graph)391 void AscendBackendOptimization(const std::shared_ptr<session::KernelGraph> &kernel_graph) {
392 auto context_ptr = MsContext::GetInstance();
393 MS_EXCEPTION_IF_NULL(context_ptr);
394 #ifdef ENABLE_DUMP_IR
395 bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
396 if (save_graphs) {
397 std::string file_name = "hwopt_d_before_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
398 DumpIR(file_name, kernel_graph);
399 }
400 #endif
401 // data layout optimization
402 AscendDataLayout(kernel_graph);
403 // mixed precision optimization
404 AscendMixPrecision(kernel_graph);
405 // other optimization
406 auto optimizer = std::make_shared<GraphOptimizer>();
407 auto other_pm = std::make_shared<PassManager>("other_pm");
408 other_pm->AddPass(std::make_shared<SendFusion>());
409 other_pm->AddPass(std::make_shared<RecvFusion>());
410 other_pm->AddPass(std::make_shared<AllReduceFusion>());
411 other_pm->AddPass(std::make_shared<AdjustDependForParallelOptimizerRecomputeAllGather>());
412 other_pm->AddPass(std::make_shared<AllGatherFusion>());
413 other_pm->AddPass(std::make_shared<ConcatOutputsForAllGather>());
414 other_pm->AddPass(std::make_shared<InsertDependForAllGather>());
415 other_pm->AddPass(std::make_shared<ReduceScatterFusion>());
416 other_pm->AddPass(std::make_shared<SplitInputsForReduceScatter>());
417 other_pm->AddPass(std::make_shared<BroadcastFusion>());
418 other_pm->AddPass(std::make_shared<InsertTensorMoveForCascade>());
419 other_pm->AddPass(std::make_shared<ParameterTransOpFusion>());
420 other_pm->AddPass(std::make_shared<RefreshParameterFormat>());
421 other_pm->AddPass(std::make_shared<SplitOpOptimizer>());
422 other_pm->AddPass(std::make_shared<SetFraczGroupAttr>());
423 optimizer->AddPassManager(other_pm);
424 (void)optimizer->Optimize(kernel_graph);
425 kernel_graph->SetExecOrderByDefault();
426 // buffer fusion
427 AscendBackendUBFusionOptimization(kernel_graph);
428
429 // other2 optimization
430 auto optimizer2 = std::make_shared<GraphOptimizer>();
431 auto other2_pm = std::make_shared<PassManager>("other2_pm");
432 other2_pm->AddPass(std::make_shared<GetitemTuple>());
433 other2_pm->AddPass(std::make_shared<CommonSubexpressionElimination>());
434 if (context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK) && context_ptr->get_param<bool>(MS_CTX_ENABLE_LOOP_SINK) &&
435 ConfigManager::GetInstance().iter_num() > 1) {
436 other2_pm->AddPass(std::make_shared<GetnextTensorMoveElimination>());
437 }
438 other2_pm->AddPass(std::make_shared<CheckConsistency>());
439 optimizer2->AddPassManager(other2_pm);
440 (void)optimizer2->Optimize(kernel_graph);
441 kernel_graph->SetExecOrderByDefault();
442 #ifdef ENABLE_DUMP_IR
443 const std::vector<CNodePtr> &exec_order = kernel_graph->execution_order();
444 std::string exec_order_name = "graph_exec_order." + std::to_string(kernel_graph->graph_id());
445 (void)mindspore::RDR::RecordGraphExecOrder(SubModuleId::SM_OPTIMIZER, exec_order_name, exec_order);
446 if (save_graphs) {
447 std::string file_name = "hwopt_d_end_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
448 DumpIR(file_name, kernel_graph, true, kWholeStack);
449 DumpIRProto(kernel_graph, "after_hwopt_" + std::to_string(kernel_graph->graph_id()));
450 kernel_graph->DumpFuncGraph("hwopt_d_end");
451 }
452 #endif
453 }
454
AscendBackendUBFusionOptimization(const std::shared_ptr<session::KernelGraph> & kernel_graph)455 void AscendBackendUBFusionOptimization(const std::shared_ptr<session::KernelGraph> &kernel_graph) {
456 auto context_ptr = MsContext::GetInstance();
457 MS_EXCEPTION_IF_NULL(context_ptr);
458 if (!context_ptr->get_param<bool>(MS_CTX_IR_FUSION_FLAG)) {
459 MS_LOG(INFO) << "UBFusion is not enable, skip";
460 return;
461 }
462
463 if (kernel_graph->is_dynamic_shape()) {
464 MS_LOG(WARNING) << "Dynamic shape skip fusion";
465 return;
466 }
467 auto old_build = common::GetEnv("MS_OLD_BUILD_PROCESS");
468 if (old_build.empty()) {
469 auto &build_manager = kernel::ascend::AscendKernelCompileManager::GetInstance();
470 auto pre_build = common::GetEnv("MS_DISABLE_PREBUILD");
471 if (pre_build.empty()) {
472 build_manager.AscendPreBuild(kernel_graph);
473 }
474 }
475 #ifdef ENABLE_DUMP_IR
476 bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
477 if (save_graphs) {
478 std::string file_name = "hwopt_d_ub_fusion_before_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
479 DumpIR(file_name, kernel_graph);
480 }
481 #endif
482 auto fusion_id_allocator = std::make_shared<FusionIdAllocator>();
483 MS_EXCEPTION_IF_NULL(fusion_id_allocator);
484 fusion_id_allocator->Init();
485 auto optimizer = std::make_shared<GraphOptimizer>();
486 auto ub_fusion_pm = std::make_shared<PassManager>("ub_fusion_pm");
487 ub_fusion_pm->AddPass(std::make_shared<Conv2DBackpropEltwiseEltwiseFusionPass>(fusion_id_allocator));
488 ub_fusion_pm->AddPass(std::make_shared<Conv2DBackpropEltwiseFusionPass>(fusion_id_allocator));
489 ub_fusion_pm->AddPass(std::make_shared<ConvBnReduceFusionPass>(fusion_id_allocator));
490 ub_fusion_pm->AddPass(std::make_shared<ConvSingleInFusionPass>(fusion_id_allocator));
491 ub_fusion_pm->AddPass(std::make_shared<BnupdateEltwiseFusionPass>(fusion_id_allocator));
492 ub_fusion_pm->AddPass(std::make_shared<BnupdateEltwiseEltwiseFusionPass>(fusion_id_allocator));
493 ub_fusion_pm->AddPass(std::make_shared<MatmulEltwiseFusionPass>(fusion_id_allocator));
494 ub_fusion_pm->AddPass(std::make_shared<ConvDoubleInFusionPass>(fusion_id_allocator));
495 ub_fusion_pm->AddPass(std::make_shared<ReduceEltwiseFusionPass>(fusion_id_allocator));
496 ub_fusion_pm->AddPass(std::make_shared<SegmentEltwiseFusionPass>(fusion_id_allocator));
497 ub_fusion_pm->AddPass(std::make_shared<MultiOutputFusionPass>(fusion_id_allocator));
498 if (!context::GraphKernelFlags::GetInstance().IsEnableGraphKernel()) {
499 ub_fusion_pm->AddPass(std::make_shared<EltwiseFusionPass>(fusion_id_allocator));
500 }
501 ub_fusion_pm->AddPass(std::make_shared<DepthwiseConvEltwiseFusionPass>(fusion_id_allocator));
502 ub_fusion_pm->AddPass(std::make_shared<MatmulConfusionTranposeFusionPass>(fusion_id_allocator));
503 if (!context::GraphKernelFlags::GetInstance().IsEnableGraphKernel()) {
504 ub_fusion_pm->AddPass(std::make_shared<BatchMatmulFusedMulAddFusionPass>(fusion_id_allocator));
505 }
506 ub_fusion_pm->AddPass(std::make_shared<UbPatternFusion>());
507 optimizer->AddPassManager(ub_fusion_pm);
508 (void)optimizer->Optimize(kernel_graph);
509 kernel_graph->SetExecOrderByDefault();
510 #ifdef ENABLE_DUMP_IR
511 if (save_graphs) {
512 std::string file_name = "hwopt_d_ub_fusion_after_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
513 DumpIR(file_name, kernel_graph);
514 }
515 #endif
516 }
517 } // namespace opt
518 } // namespace mindspore
519