/**
 * Copyright 2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "plugin/device/ascend/optimizer/backend_common_unify_mindir.h"
#include <memory>
#include <string>

#include "include/common/debug/anf_ir_dump.h"
#include "include/common/debug/dump_proto.h"
#include "include/backend/optimizer/optimizer.h"
#include "include/backend/debug/profiler/profiling.h"
#include "backend/common/pass/dropout_gen_mask_fusion.h"
#include "backend/common/pass/erase_visit_attr.h"
#include "plugin/device/ascend/optimizer/ir_fission/cdist_fission.h"
#include "plugin/device/ascend/optimizer/ir_fission/tensor_scatter_fission.h"
#include "plugin/device/ascend/optimizer/ir_fission/adam_weight_decay_fission.h"
#include "plugin/device/ascend/optimizer/ir_fission/batch_norm_grad_infer_fission.h"
#include "plugin/device/ascend/optimizer/ir_fission/bn_split.h"
#include "plugin/device/ascend/optimizer/ir_fission/bn_grad_split.h"
#include "plugin/device/ascend/optimizer/ir_fission/ascend_convert_tuple_input_to_dynamic_input.h"
#include "plugin/device/ascend/optimizer/ir_fusion/batchnorm_to_bninfer.h"
#include "plugin/device/ascend/optimizer/ir_fusion/batchnormgrad_to_bninfergrad.h"
#include "plugin/device/ascend/optimizer/ir_fusion/histogram_fixed_width_fusion.h"
#include "plugin/device/ascend/optimizer/mindir/renorm_split.h"
#include "plugin/device/ascend/optimizer/mindir/optimizer_unify_output.h"
#include "plugin/device/ascend/optimizer/mindir/space_batch_nd_attr_update.h"
#include "plugin/device/ascend/optimizer/mindir/bn_grad_unify_mindir.h"
#include "plugin/device/ascend/optimizer/mindir/all_to_all_unify_mindir.h"
#include "plugin/device/ascend/optimizer/mindir/neighbor_exchange_v2_unify_mindir.h"
#include "plugin/device/ascend/optimizer/mindir/quant_dtype_cast_adjust.h"
#include "plugin/device/ascend/optimizer/mindir/fse_decode_adjust.h"
#include "plugin/device/ascend/optimizer/mindir/reduce_axis_update.h"
#include "plugin/device/ascend/optimizer/mindir/clip_by_norm_fission.h"
#include "plugin/device/ascend/optimizer/mindir/specialized_prepare.h"
#include "plugin/device/ascend/optimizer/mindir/tensor_array.h"
#include "plugin/device/ascend/optimizer/mindir/dropout_unify_mindir.h"
#include "plugin/device/ascend/optimizer/mindir/ascend_mindir_op_adapter.h"
#include "plugin/device/ascend/optimizer/mindir/sparse_softmax_cross_entropy_with_logits_unify_mindir.h"
#include "plugin/device/ascend/optimizer/mindir/adam_weight_decay_unify_mindir.h"
#include "plugin/device/ascend/optimizer/mindir/centralization_mindir.h"
#include "plugin/device/ascend/optimizer/ge/lamb_fission.h"
#include "plugin/device/ascend/optimizer/ge/adjust_print_for_ge.h"
#include "plugin/device/ascend/optimizer/ge/getnext_for_ge.h"
#include "plugin/device/ascend/optimizer/ir_fusion/adaptive_max_pool2d_fusion.h"
#include "plugin/device/ascend/optimizer/ir_fusion/flash_attention_fusion.h"
#include "plugin/device/ascend/optimizer/ir_fusion/add_layer_norm_fusion.h"
#include "plugin/device/ascend/optimizer/ir_fusion/add_rms_norm_fusion.h"
#include "plugin/device/ascend/optimizer/ir_fusion/add_cast_rms_norm_cast_fusion.h"
#include "plugin/device/ascend/optimizer/ge/avg_pool_grad_for_ge.h"
#include "plugin/device/ascend/optimizer/ir_fusion/mc2_fusion.h"
#include "plugin/device/ascend/optimizer/ir_fusion/shape_reshape_fusion.h"
#include "plugin/device/ascend/optimizer/ir_fusion/matmul_allreduce_fusion.h"
#include "plugin/device/ascend/optimizer/ir_fusion/matmul_elemwise_fusion.h"
#include "plugin/device/ascend/optimizer/ir_fusion/inference_matmul_split_fusion.h"

namespace mindspore {
namespace opt {
void GetBackendCommonUnifyMindIRPassManager(PassManagerPtr *unify_mindir_pm) {
  MS_EXCEPTION_IF_NULL(unify_mindir_pm);
  (*unify_mindir_pm)->AddPass(std::make_shared<RenormSplit>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::ReduceAxisUpdate>());
  (*unify_mindir_pm)->AddPass(std::make_shared<HistogramFixedWidthFusion>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::ClipByNormFission>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::TensorArrayAddFlowCond1>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::TensorArrayAddFlowCond2>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::GeTensorArrayCastIndex>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::TensorArrayPrepare>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::SpaceToBatchNDAttrUpdate>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::BatchToSpaceNDAttrUpdate>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::AdamWeightDecayUnifyMindIR>());
  (*unify_mindir_pm)->AddPass(std::make_shared<CdistFission>());
  (*unify_mindir_pm)->AddPass(std::make_shared<CdistGradFission>());

  // Since the SparseSoftmaxCrossEntropyWithLogits operator can only use AICPU and has poor execution performance,
  // it does not take effect for the time being.
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  bool graph_mode = ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kGraphMode;
  bool is_kbk_mode = ms_context->IsKByKExecutorMode();
  if (graph_mode) {
    (*unify_mindir_pm)->AddPass(std::make_shared<opt::GradSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
    (*unify_mindir_pm)->AddPass(std::make_shared<opt::GradSparseSoftmaxCrossEntropyWithLogitsUnifyMindIRV2>());
    (*unify_mindir_pm)->AddPass(std::make_shared<opt::SparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
  } else {
    // Add the PynativeGradSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR pass first to prevent the backward loss
    // function from the Python frontend from matching the pattern defined in
    // PynativeSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR.
    // TODO(hbhu_bin): In MindSpore, SparseSoftmaxCrossEntropyWithLogits has different outputs depending on the
    // "is_grad" attribute, but it has two outputs in CANN. These passes can be removed once the "is_grad" attribute
    // is converted to an input.
    (*unify_mindir_pm)->AddPass(std::make_shared<opt::PynativeGradSparseSoftmaxCrossEntropyWithLogitsUnifyMindIRV2>());
    (*unify_mindir_pm)->AddPass(std::make_shared<opt::PynativeGradSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
    (*unify_mindir_pm)->AddPass(std::make_shared<opt::PynativeSparseSoftmaxCrossEntropyWithLogitsUnifyMindIR>());
  }

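  // Unify Dropout/DropoutGrad (and their Ext variants) to the Ascend MindIR form.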
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::DropoutExtUnifyMindIR1>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::DropoutGradExtUnifyMindIR>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::DropoutUnifyMindIR1>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::DropoutGradUnifyMindIR>());

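  // Unify communication operators (NeighborExchange, NeighborExchangeV2, AllToAll), then adjust QuantDTypeCast and
  // FSEDecode.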
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::NeighborExchangeUnifyMindIR>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::NeighborExchangeV2UnifyMindIR>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::NeighborExchangeV2GradUnifyMindIR>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::AllToAllUnifyMindIR>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::QuantDTypeCastAdjust>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::FSEDecodeAdjust>());
  // BatchNorm-related fission and fusion passes.
  (*unify_mindir_pm)->AddPass(std::make_shared<BnSplit>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::BatchNormGradUnifyMindIR>());
  (*unify_mindir_pm)->AddPass(std::make_shared<BnGradSplit>());
  (*unify_mindir_pm)->AddPass(std::make_shared<BatchNormGrad2BNInferGrad>());
  (*unify_mindir_pm)->AddPass(std::make_shared<BatchNormGradInferFission>());
  (*unify_mindir_pm)->AddPass(std::make_shared<BatchNorm2BNInfer>());
  // Only renames the primitive name.
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::AscendMindIROpAdapter>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::DropoutGenMaskFusion>());

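  // GE-oriented adaptation passes and additional fission/fusion passes.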
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::LambFissionGe>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::AdjustPrintForGe>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::GetNextForGE>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::SyncBnSplit>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::SyncBnGradSplit>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::AdaptiveMaxPool2DGeFusion>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::AvgPoolGradForGE>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::FlashAttentionFusionV1>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::FlashAttentionFusionV2>());
  if (!is_kbk_mode) {
    (*unify_mindir_pm)->AddPass(std::make_shared<opt::MatmulReduceScatterFusion>());
    (*unify_mindir_pm)->AddPass(std::make_shared<opt::AllGatherMatmulFusion>());
  }
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::CentralizationMindIR>());
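  // Fusion passes that are only registered when internal kernels are enabled.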
#ifdef ENABLE_INTERNAL_KERNELS
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::AddLayernormFusion>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::ShapeReshapeFusion>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::AddRmsNormFusion>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::AddCastRmsNormCastFusion>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::MatMulAllReduceFusion>());
  (*unify_mindir_pm)->AddPass(std::make_shared<opt::InferenceMatmulSplitFusion>());
#endif  // ENABLE_INTERNAL_KERNELS
}

void AscendUnfoldInputsForSpecialNodes(const KernelGraphPtr &kernel_graph) {
  profiler::CollectHostInfo("Ascend", "Graph Optimization", "BackendOptimization_UnfoldInputsForSpecialNodes", 0, 0, 0);
  MS_EXCEPTION_IF_NULL(kernel_graph);
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
#ifdef ENABLE_DUMP_IR
  if (context_ptr->CanDump(kIntroductory)) {
    std::string file_name =
      "hwopt_d_before_unfold_inputs_for_special_nodes_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
    DumpIR(file_name, kernel_graph, true, kWholeStack);
    DumpIRProto(kernel_graph,
                "before_unfold_inputs_for_special_nodes_hwopt_" + std::to_string(kernel_graph->graph_id()));
  }
#endif
  auto optimizer = std::make_shared<opt::GraphOptimizer>();
  auto unfold_inputs_pm = std::make_shared<opt::PassManager>("unfold_inputs_for_special_nodes_pm");
  unfold_inputs_pm->AddPass(std::make_shared<opt::AscendConvertTupleInputToDynamicInput>());

  optimizer->AddPassManager(unfold_inputs_pm);
  (void)optimizer->Optimize(kernel_graph);
  kernel_graph->SetExecOrderByDefault();
#ifdef ENABLE_DUMP_IR
  if (context_ptr->CanDump(kIntroductory)) {
    std::string file_name =
      "hwopt_d_after_unfold_inputs_for_special_nodes_graph_" + std::to_string(kernel_graph->graph_id()) + ".ir";
    DumpIR(file_name, kernel_graph, true, kWholeStack);
  }
#endif
  profiler::CollectHostInfo("Ascend", "Graph Optimization", "BackendOptimization_UnfoldInputsForSpecialNodes", 0, 0, 1);
}
}  // namespace opt
}  // namespace mindspore