/**
 * Copyright 2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "plugin/device/ascend/optimizer/backend_common_unify_mindir.h"
#include <memory>
#include <string>

#include "include/common/debug/anf_ir_dump.h"
#include "include/common/debug/dump_proto.h"
#include "include/backend/optimizer/optimizer.h"
#include "include/backend/debug/profiler/profiling.h"
#include "backend/common/pass/dropout_gen_mask_fusion.h"
#include "backend/common/pass/erase_visit_attr.h"
#include "plugin/device/ascend/optimizer/ir_fission/cdist_fission.h"
#include "plugin/device/ascend/optimizer/ir_fission/tensor_scatter_fission.h"
#include "plugin/device/ascend/optimizer/ir_fission/adam_weight_decay_fission.h"
#include "plugin/device/ascend/optimizer/ir_fission/batch_norm_grad_infer_fission.h"
#include "plugin/device/ascend/optimizer/ir_fission/bn_split.h"
#include "plugin/device/ascend/optimizer/ir_fission/bn_grad_split.h"
#include "plugin/device/ascend/optimizer/ir_fission/ascend_convert_tuple_input_to_dynamic_input.h"
#include "plugin/device/ascend/optimizer/ir_fusion/batchnorm_to_bninfer.h"
#include "plugin/device/ascend/optimizer/ir_fusion/batchnormgrad_to_bninfergrad.h"
#include "plugin/device/ascend/optimizer/ir_fusion/histogram_fixed_width_fusion.h"
#include "plugin/device/ascend/optimizer/mindir/renorm_split.h"
#include "plugin/device/ascend/optimizer/mindir/optimizer_unify_output.h"
#include "plugin/device/ascend/optimizer/mindir/space_batch_nd_attr_update.h"
#include "plugin/device/ascend/optimizer/mindir/bn_grad_unify_mindir.h"
#include "plugin/device/ascend/optimizer/mindir/all_to_all_unify_mindir.h"
#include "plugin/device/ascend/optimizer/mindir/neighbor_exchange_v2_unify_mindir.h"
#include "plugin/device/ascend/optimizer/mindir/quant_dtype_cast_adjust.h"
#include "plugin/device/ascend/optimizer/mindir/fse_decode_adjust.h"
#include "plugin/device/ascend/optimizer/mindir/reduce_axis_update.h"
#include "plugin/device/ascend/optimizer/mindir/clip_by_norm_fission.h"
#include "plugin/device/ascend/optimizer/mindir/specialized_prepare.h"
#include "plugin/device/ascend/optimizer/mindir/tensor_array.h"
#include "plugin/device/ascend/optimizer/mindir/dropout_unify_mindir.h"
#include "plugin/device/ascend/optimizer/mindir/ascend_mindir_op_adapter.h"
#include "plugin/device/ascend/optimizer/mindir/sparse_softmax_cross_entropy_with_logits_unify_mindir.h"
#include "plugin/device/ascend/optimizer/mindir/adam_weight_decay_unify_mindir.h"
#include "plugin/device/ascend/optimizer/mindir/centralization_mindir.h"
#include "plugin/device/ascend/optimizer/ge/lamb_fission.h"
#include "plugin/device/ascend/optimizer/ge/adjust_print_for_ge.h"
#include "plugin/device/ascend/optimizer/ge/getnext_for_ge.h"
#include "plugin/device/ascend/optimizer/ir_fusion/adaptive_max_pool2d_fusion.h"
#include "plugin/device/ascend/optimizer/ir_fusion/flash_attention_fusion.h"
#include "plugin/device/ascend/optimizer/ir_fusion/add_layer_norm_fusion.h"
#include "plugin/device/ascend/optimizer/ir_fusion/add_rms_norm_fusion.h"
#include "plugin/device/ascend/optimizer/ir_fusion/add_cast_rms_norm_cast_fusion.h"
#include "plugin/device/ascend/optimizer/ge/avg_pool_grad_for_ge.h"
#include "plugin/device/ascend/optimizer/ir_fusion/mc2_fusion.h"
#include "plugin/device/ascend/optimizer/ir_fusion/shape_reshape_fusion.h"
#include "plugin/device/ascend/optimizer/ir_fusion/matmul_allreduce_fusion.h"
#include "plugin/device/ascend/optimizer/ir_fusion/matmul_elemwise_fusion.h"
#include "plugin/device/ascend/optimizer/ir_fusion/inference_matmul_split_fusion.h"

namespace mindspore {
namespace opt {
void GetBackendCommonUnifyMindIRPassManager(PassManagerPtr *unify_mindir_pm) {
  MS_EXCEPTION_IF_NULL(unify_mindir_pm);
  (*unify_mindir_pm)->AddPass(std::make_shared<RenormSplit>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::ReduceAxisUpdate>());
  (*unify_mindir_pm)->AddPass(std::make_shared<HistogramFixedWidthFusion>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::ClipByNormFission>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::TensorArrayAddFlowCond1>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::TensorArrayAddFlowCond2>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::GeTensorArrayCastIndex>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::TensorArrayPrepare>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::SpaceToBatchNDAttrUpdate>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::BatchToSpaceNDAttrUpdate>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::AdamWeightDecayUnifyMindIR>());
  (*unify_mindir_pm)->AddPass(std::make_shared<CdistFission>());
  (*unify_mindir_pm)->AddPass(std::make_shared<CdistGradFission>());

  // Since the SparseSoftmaxCrossEntropyWithLogits operator can only run on AICPU and has poor execution
  // performance, it is not enabled for the time being.
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  bool graph_mode = ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kGraphMode;
  bool is_kbk_mode = ms_context->IsKByKExecutorMode();
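  // graph_mode selects the graph-mode variants of the SparseSoftmaxCrossEntropyWithLogits unification passes below;
  // is_kbk_mode (kernel-by-kernel executor) disables the MatmulReduceScatter/AllGatherMatmul fusions further down.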
  if (graph_mode) {
    (*unify_mindir_pm)->AddPass(std::make_shared<opt::GradSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
    (*unify_mindir_pm)->AddPass(std::make_shared<opt::GradSparseSoftmaxCrossEntropyWithLogitsUnifyMindIRV2>());
    (*unify_mindir_pm)->AddPass(std::make_shared<opt::SparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
  } else {
    // Add the PynativeGradSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR pass first to prevent the backward loss
    // function generated by the Python frontend from matching the pattern defined in
    // PynativeSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR.
    // TODO(hbhu_bin): In MindSpore, SparseSoftmaxCrossEntropyWithLogits has different outputs depending on its
    // "is_grad" attribute, but it has two outputs in CANN. These passes can be removed once the "is_grad"
    // attribute is converted to an input.
    (*unify_mindir_pm)->AddPass(std::make_shared<opt::PynativeGradSparseSoftmaxCrossEntropyWithLogitsUnifyMindIRV2>());
    (*unify_mindir_pm)->AddPass(std::make_shared<opt::PynativeGradSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
    (*unify_mindir_pm)->AddPass(std::make_shared<opt::PynativeSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
  }

  (*unify_mindir_pm)->AddPass(std::make_shared<opt::DropoutExtUnifyMindIR1>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::DropoutGradExtUnifyMindIR>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::DropoutUnifyMindIR1>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::DropoutGradUnifyMindIR>());

  (*unify_mindir_pm)->AddPass(std::make_shared<opt::NeighborExchangeUnifyMindIR>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::NeighborExchangeV2UnifyMindIR>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::NeighborExchangeV2GradUnifyMindIR>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::AllToAllUnifyMindIR>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::QuantDTypeCastAdjust>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::FSEDecodeAdjust>());
  // batchnorm
  (*unify_mindir_pm)->AddPass(std::make_shared<BnSplit>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::BatchNormGradUnifyMindIR>());
  (*unify_mindir_pm)->AddPass(std::make_shared<BnGradSplit>());
  (*unify_mindir_pm)->AddPass(std::make_shared<BatchNormGrad2BNInferGrad>());
  (*unify_mindir_pm)->AddPass(std::make_shared<BatchNormGradInferFission>());
  (*unify_mindir_pm)->AddPass(std::make_shared<BatchNorm2BNInfer>());
  // just rename primitive name
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::AscendMindIROpAdapter>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::DropoutGenMaskFusion>());

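  // GE-related adjustments and fusion passes: Lamb fission, Print/GetNext adaptation for GE, SyncBatchNorm
  // splitting, pooling and FlashAttention fusions, and (outside kernel-by-kernel mode) the
  // MatmulReduceScatter/AllGatherMatmul fusions.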
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::LambFissionGe>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::AdjustPrintForGe>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::GetNextForGE>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::SyncBnSplit>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::SyncBnGradSplit>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::AdaptiveMaxPool2DGeFusion>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::AvgPoolGradForGE>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::FlashAttentionFusionV1>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::FlashAttentionFusionV2>());
  if (!is_kbk_mode) {
    (*unify_mindir_pm)->AddPass(std::make_shared<opt::MatmulReduceScatterFusion>());
    (*unify_mindir_pm)->AddPass(std::make_shared<opt::AllGatherMatmulFusion>());
  }
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::CentralizationMindIR>());
#ifdef ENABLE_INTERNAL_KERNELS
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::AddLayernormFusion>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::ShapeReshapeFusion>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::AddRmsNormFusion>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::AddCastRmsNormCastFusion>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::MatMulAllReduceFusion>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::InferenceMatmulSplitFusion>());
#endif  // ENABLE_INTERNAL_KERNELS
}
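
// A minimal usage sketch (hypothetical caller, for illustration only): the pass manager populated by
// GetBackendCommonUnifyMindIRPassManager is attached to a GraphOptimizer in the same way
// AscendUnfoldInputsForSpecialNodes does below.
//
//   auto optimizer = std::make_shared<opt::GraphOptimizer>();
//   auto unify_mindir_pm = std::make_shared<opt::PassManager>("unify_mindir_pm");
//   GetBackendCommonUnifyMindIRPassManager(&unify_mindir_pm);
//   optimizer->AddPassManager(unify_mindir_pm);
//   (void)optimizer->Optimize(kernel_graph);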

void AscendUnfoldInputsForSpecialNodes(const KernelGraphPtr &kernel_graph) {
  profiler::CollectHostInfo("Ascend", "Graph Optimization", "BackendOptimization_UnfoldInputsForSpecialNodes", 0, 0, 0);
  MS_EXCEPTION_IF_NULL(kernel_graph);
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
#ifdef ENABLE_DUMP_IR
  if (context_ptr->CanDump(kIntroductory)) {
    std::string file_name =
      "hwopt_d_before_unfold_inputs_for_special_nodes_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
    DumpIR(file_name, kernel_graph, true, kWholeStack);
    DumpIRProto(kernel_graph,
                "before_unfold_inputs_for_special_nodes_hwopt_" + std::to_string(kernel_graph->graph_id()));
  }
#endif
  auto optimizer = std::make_shared<opt::GraphOptimizer>();
  auto unfold_inputs_pm = std::make_shared<opt::PassManager>("unfold_inputs_for_special_nodes_pm");
  unfold_inputs_pm->AddPass(std::make_shared<opt::AscendConvertTupleInputToDynamicInput>());

  optimizer->AddPassManager(unfold_inputs_pm);
  (void)optimizer->Optimize(kernel_graph);
  kernel_graph->SetExecOrderByDefault();
#ifdef ENABLE_DUMP_IR
  if (context_ptr->CanDump(kIntroductory)) {
    std::string file_name =
      "hwopt_d_after_unfold_inputs_for_special_nodes_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
    DumpIR(file_name, kernel_graph, true, kWholeStack);
  }
#endif
  profiler::CollectHostInfo("Ascend", "Graph Optimization", "BackendOptimization_UnfoldInputsForSpecialNodes", 0, 0, 1);
}
}  // namespace opt
}  // namespace mindspore