/**
 * Copyright 2021-2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "backend/common/graph_kernel/adapter/graph_kernel_optimization.h"

#include <algorithm>
#include <vector>
#include <string>
#include <memory>

#include "mindspore/core/ops/array_ops.h"
#include "ir/func_graph.h"
#include "utils/ms_context.h"
#include "backend/common/graph_kernel/graph_kernel_flags.h"
#include "backend/common/graph_kernel/add_atomic_clean.h"
#include "backend/common/graph_kernel/add_stitch_atomic_clean_gpu.h"
#include "backend/common/graph_kernel/core/arithmetic_simplify.h"
#include "backend/common/graph_kernel/adapter/graph_kernel_cluster_cloud.h"
#include "backend/common/graph_kernel/core/eliminate_redundant_output.h"
#include "backend/common/graph_kernel/insert_pad.h"
#include "backend/common/graph_kernel/adapter/graph_kernel_splitter_with_py.h"
#include "backend/common/graph_kernel/adapter/graph_kernel_expander_cloud.h"
#include "backend/common/graph_kernel/adapter/callback_impl.h"
#include "backend/common/graph_kernel/cast_matmul_fusion.h"
#include "backend/common/graph_kernel/raise_reduction_precision.h"
#include "backend/common/graph_kernel/graph_kernel_cse.h"
#include "backend/common/graph_kernel/core/shape_ops_splitter.h"
#include "backend/common/graph_kernel/value_graph_binder.h"
#include "backend/common/graph_kernel/parallel_fusion.h"
#include "backend/common/graph_kernel/optimize_assign.h"
#include "backend/common/graph_kernel/core/split_umonad.h"
#include "backend/common/graph_kernel/reorder_ops.h"
#include "backend/common/graph_kernel/core/update_state_formatter.h"
#include "backend/common/graph_kernel/axis_normalizer.h"
#include "backend/common/graph_kernel/decrease_compute_precision.h"
#include "backend/common/graph_kernel/decrease_transfer_precision.h"
#include "backend/common/graph_kernel/csr_atomic_add.h"
#include "backend/common/graph_kernel/tsa_atomic_add_to_first_tensor.h"
#include "backend/common/graph_kernel/uss_atomic_add.h"
#include "backend/common/pass/getitem_tuple.h"
#include "backend/common/graph_kernel/core/graph_kernel_pass_manager.h"
#include "backend/common/graph_kernel/core/transform_op_optimizer.h"
#include "backend/common/graph_kernel/rewrite_output_shape.h"
#include "backend/common/graph_kernel/graph_kernel_recompute.h"
#include "backend/common/graph_kernel/reduce_fake_out_mem.h"
#include "backend/common/graph_kernel/depend_elimination.h"
#include "backend/common/graph_kernel/tensor_inplace.h"
#include "backend/common/graph_kernel/floatstatus_fusion.h"
#include "backend/common/graph_kernel/floatstatus_addn_fusion.h"
#include "backend/common/graph_kernel/core/graph_kernel_utils.h"
#include "backend/common/graph_kernel/compact_tensor_liveness.h"
#include "backend/common/graph_kernel/adapter/symbol_engine_builder.h"
#include "backend/common/graph_kernel/kernel_packet/symbol_engine_extender.h"
#include "backend/common/graph_kernel/convert_call_to_prim.h"
#include "backend/common/graph_kernel/core/graph_kernel_op_combiner.h"
#include "backend/common/graph_kernel/set_infershape_functor.h"
#include "backend/common/graph_kernel/recognize_softmax_grad_ext.h"
#include "backend/common/graph_kernel/convert_custom_for_ge.h"
#include "backend/common/graph_kernel/convert_input_and_attr.h"
#include "backend/common/graph_kernel/convert_bfloat16.h"
#include "backend/common/graph_kernel/deal_with_side_effect.h"
#include "backend/common/graph_kernel/fold_updatestate.h"
#include "backend/common/graph_kernel/proactive_fallback_expander.h"
#include "backend/common/graph_kernel/transpose_matmul_fusion.h"
#ifdef ENABLE_AKG
#include "backend/common/graph_kernel/graph_kernel_build.h"
#endif
#include "backend/common/graph_kernel/adapter/split_model_ascend.h"
#include "backend/common/graph_kernel/adapter/split_model_cpu.h"
#include "backend/common/graph_kernel/adapter/split_model_gpu.h"
namespace mindspore::graphkernel {
using opt::CommonSubexpressionElimination;
using opt::GetitemTuple;
using opt::GraphOptimizer;

namespace {
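// Limit passed to ParallelOpFusion via ParallelConfig in the Combine stage below.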
auto constexpr PARALLEL_OPS_LIMIT = 7;
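// Map a bool flag to a pass level: an enabled flag runs the pass at OptLevel_1, otherwise the pass is
// gated behind OptLevel_MAX.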
inline unsigned int GetPassLevelByFlag(bool flag) { return flag ? OptLevel_1 : OptLevel_MAX; }
}  // namespace

void GraphKernelOptimizer::Init() const {
  // Register the split models here to ensure that the correct split model is invoked
  // when MindSpore and Lite are imported in the same process.
  SPLIT_MODEL_REGISTER(kAscendDevice, inner::SplitModelAscend);
  SPLIT_MODEL_REGISTER(kCPUDevice, inner::SplitModelCpu);
  SPLIT_MODEL_REGISTER(kGPUDevice, inner::SplitModelGpu);
}

PassManagerPtr GraphKernelOptimizer::PreProcess() const {
  auto pm = std::make_shared<GraphKernelPassManager>(0, "preprocess");
  // Remove redundant TupleGetItem to enable clustering of ops before and after TupleGetItem
  pm->Add(std::make_shared<GetitemTuple>(), OptLevel_1);

  // Fall back some operations for further expanding or fusing
  pm->Add(std::make_shared<ProactiveFallbackExpander>(), OptLevel_1, is_dvm);

  // Transform Transpose + Matmul into a single Matmul with attributes trans_a/trans_b
  pm->Add(std::make_shared<TransposeMatmulFusion>(), OptLevel_2, is_ascend);

  // Convert input to attr, adapter for dyn-shape
  pm->Add(std::make_shared<ConvertFrontEndToGraphKernel>(), OptLevel_1);

  // Do DependElimination before all passes of graphkernel
  pm->Add(std::make_shared<DependElimination>(), OptLevel_1);

  // Do cse before all passes of graphkernel
  pm->Add(std::make_shared<CommonSubexpressionElimination>("cse1"), OptLevel_1);

  // Save the original output info
  pm->Add(std::make_shared<SaveOutputShape>(), OptLevel_1);

  // Change Assign(p, a, U) to Assign(Depend(p, U), a)
  pm->Add(std::make_shared<SplitAssign>(), OptLevel_1, is_gpu || is_cpu || is_dvm);

  // Spread the MakeTuple input of UpdateState
  pm->Add(std::make_shared<SpreadUpdateState>(), OptLevel_1);

  // Eliminate the common nodes generated in SpreadUpdateState
  pm->Add(std::make_shared<GraphKernelCSE>(), OptLevel_1);

  // Recognize ops that will be fused by GE
  pm->Add(std::make_shared<RecognizeSoftmaxGradExt>(), OptLevel_1, is_ge);
  return pm;
}

PassManagerPtr GraphKernelOptimizer::Cluster() const {
  auto pm = std::make_shared<GraphKernelPassManager>(1, "cluster");

  // Convert IsFinite and its user to FloatStatus
  pm->Add(std::make_shared<FloatStatusFusion>(), OptLevel_2, is_dvm);

  // Expand FloatStatus(AddN)
  pm->Add(std::make_shared<FloatStatusAddNFusion>(), OptLevel_2, is_gpu || is_dvm);

  // Expand complex basic kernels to composite kernels
  pm->Add(std::make_shared<GraphKernelExpanderCloud>(), OptLevel_1);

  // Combine supported parallel ops that share common inputs
  pm->Add(std::make_shared<GraphKernelOpCombiner>(),
          GetPassLevelByFlag(GraphKernelFlags::GetInstance().enable_parallel_op_combine));

  // Cluster basic kernels and composite kernels
  pm->Add(std::make_shared<StaticShapeCluster>(), OptLevel_1);

  // Add Cast for an op's inputs if the input data type is not supported by the op
  pm->Add(std::make_shared<ConvertBFloat16>(), OptLevel_1, is_dvm);

  // Eliminate outputs without external users
  pm->Add(std::make_shared<EliminateRedundantOutput>(), OptLevel_1);
  return pm;
}

PassManagerPtr GraphKernelOptimizer::HighLevelOpt1() const {
  auto pm = std::make_shared<GraphKernelPassManager>(2, "highlevelopt1");

  // Remove redundant Cast(bias, fp16) for Matmul input
  pm->Add(std::make_shared<CastMatmulFusion>(), OptLevel_2, is_ascend);

  // Reorder Cast and Type-insensitive node
  pm->Add(std::make_shared<ReorderOps>(), OptLevel_2, !is_ge);

  // Normalize the Reduce axis
  pm->Add(std::make_shared<AxisNormalizer>(), OptLevel_1);

  // Cast the input of ReduceSum from float16 to float32 for higher precision
  pm->Add(std::make_shared<RaiseReductionPrecision>(), OptLevel_2, !is_ge);

  // Insert PadAkg and UnPadAkg Ops for MatMul
  pm->Add(std::make_shared<InsertPadOps>(), OptLevel_1, is_gpu);

  // Universal arithmetic simplify
  pm->Add(std::make_shared<ArithmeticSimplify>(), OptLevel_2);

  // Common subexpression elimination
  pm->Add(std::make_shared<GraphKernelCSE>(), OptLevel_2);

  // Eliminate unnecessary transform ops
  pm->Add(std::make_shared<TransformOpOptimizer>(), OptLevel_2);
  return pm;
}

PassManagerPtr GraphKernelOptimizer::Split() const {
  auto pm = std::make_shared<GraphKernelPassManager>(3, "split");
  // Make certain nodes redundant so that they are used by only one user,
  // which can avoid unnecessary input-output and get better performance.
  // preprocess for ShapeOpsSplitter
  pm->Add(std::make_shared<ExtendOutputForUpdateState>(), OptLevel_1);
  std::vector<PrimitivePtr> duplicated_ops = {prim::kPrimReshape};
  pm->Add(std::make_shared<ShapeOpsSplitter>(duplicated_ops), OptLevel_1);
  // Use symbol to calculate a more precise edge relation between nodes
  pm->Add(std::make_shared<SymbolEngineBuilder>(false), OptLevel_1, is_dvm);
  // Split kernel according to costmodel
  pm->Add(std::make_shared<GraphKernelSplitterWithPy>(false), OptLevel_1);
  // After Simplify and Splitter, a lot of redundant getitem/maketuple
  // will be exposed, use GetitemTuple Pass to delete them.
  pm->Add(std::make_shared<GetitemTuple>(), OptLevel_1);

  // Eliminate the redundant node that is copied above but not handled by GraphKernelSplitter
  pm->Add(std::make_shared<MergeOutputForUpdateState>(), OptLevel_1);
  pm->Add(std::make_shared<GraphKernelCSE>(), OptLevel_1);
  pm->Add(std::make_shared<EliminateRedundantOutput>(), OptLevel_1);
  return pm;
}

PassManagerPtr GraphKernelOptimizer::HighLevelOpt2() const {
  auto pm = std::make_shared<GraphKernelPassManager>(4, "highlevelopt2");

  auto &flags = GraphKernelFlags::GetInstance();
  // Auto recompute according to local memory burst.
  auto recompute_lv = GetPassLevelByFlag(flags.recompute_increment_threshold > 0 ||
                                         flags.recompute_peak_threshold > 0 || flags.enable_csr_fusion);
  pm->Add(std::make_shared<GraphKernelRecompute>(), recompute_lv);

  // Enable atomic add
  pm->Add(std::make_shared<AtomicCleanInserter>(), OptLevel_2, is_gpu || (is_ascend && !is_ge && !is_dvm));

  // Enable atomic add for stitch nodes.
  auto level = GetPassLevelByFlag(GraphKernelFlags::GetInstance().enable_stitch_fusion);
  pm->Add(std::make_shared<StitchAtomicCleanInserter>(), level, is_gpu);

  // Enable low precision
  auto level_low_precision = GetPassLevelByFlag(GraphKernelFlags::GetInstance().enable_low_precision);
  pm->Add(std::make_shared<DecreaseTransferPrecision>(), level_low_precision);
  pm->Add(std::make_shared<DecreaseComputePrecision>(), level_low_precision, is_ascend);

  // Optimize memory
  auto memory_optimize_level = GetPassLevelByFlag(GraphKernelFlags::GetInstance().enable_auto_tensor_inplace);
  pm->Add(std::make_shared<TensorInplace>(), memory_optimize_level);

  // Enable tsa and uss
  pm->Add(std::make_shared<TsaAtomicAddToFirstTensor>(), OptLevel_1, is_gpu);
  pm->Add(std::make_shared<UssAtomicAdd>(), OptLevel_1, is_gpu);
  pm->Add(std::make_shared<CsrAtomicAdd>(), OptLevel_1, is_gpu);

  // Replace the original output (which is an input of Assign) with overridden parameters
  pm->Add(std::make_shared<OptimizeAssign>(), OptLevel_2);
  pm->Add(std::make_shared<ExtendOutputForUpdateState>(), std::min(recompute_lv, OptLevel_2));
  pm->Add(std::make_shared<MergeOutputForUpdateState>(), std::min(recompute_lv, OptLevel_2));
  pm->Add(std::make_shared<EliminateRedundantOutput>(), std::min(recompute_lv, OptLevel_2));

  return pm;
}

PassManagerPtr GraphKernelOptimizer::Combine() const {
  auto pm = std::make_shared<GraphKernelPassManager>(5, "combine");
  // Enable parallel fusion for gpu device
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  auto target = context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  auto level = GetPassLevelByFlag(GraphKernelFlags::GetInstance().enable_parallel_fusion);
  pm->Add(std::make_shared<FoldUpdateState>(), level, is_gpu || is_ascend);
  // An atomic-add GraphKernel node may be linked directly to UpdateState; it should be spread before parallel fusion!
  pm->Add(std::make_shared<SpreadUpdateState>(), level);
  pm->Add(std::make_shared<ParallelOpFusion>(target, ParallelConfig(PARALLEL_OPS_LIMIT)), level, is_gpu || is_ascend);

  // For memory efficiency, insert UpdateState for ops with no cnode/param inputs to avoid early launching
  pm->Add(std::make_shared<CompactTensorLiveness>(), OptLevel_2);
  return pm;
}

PassManagerPtr GraphKernelOptimizer::Build() const {
  // DVM does not need this stage
  auto pm = std::make_shared<GraphKernelPassManager>(6, "build");
  pm->Add(std::make_shared<ExtendOutputForUpdateState>(), OptLevel_1, !is_dvm);
  // Reduce fake output memory.
  auto only_static_shape_fusion = GetPassLevelByFlag(!GraphKernelFlags::GetInstance().enable_dynamic_shape_fusion);
  pm->Add(std::make_shared<ReduceFakeOutMem>(), only_static_shape_fusion, !is_dvm);
  // Compile graph kernel nodes, and inline nodes if compile failed.
  auto enable_dyn_level = GetPassLevelByFlag(GraphKernelFlags::GetInstance().enable_dynamic_shape_fusion);
  pm->Add(std::make_shared<DynamicShapeCluster>(), enable_dyn_level, is_cpu || is_gpu);
  pm->Add(std::make_shared<SymbolEngineBuilder>(true), enable_dyn_level, is_cpu || is_gpu);
  pm->Add(std::make_shared<GraphKernelSplitterWithPy>(true), enable_dyn_level, is_gpu);
#ifdef ENABLE_AKG
  pm->Add(std::make_shared<GraphKernelBuild>(), OptLevel_1, !is_ge && !is_dvm);
#endif
  pm->Add(std::make_shared<ConvertCustomForGE>(), OptLevel_1, is_ge);
  pm->Add(std::make_shared<GeneratedDependElimination>(), OptLevel_2, is_gpu || (is_ascend && !is_ge && !is_dvm));
  pm->Add(std::make_shared<GetitemTuple>(), OptLevel_1, !is_dvm);
  pm->Add(std::make_shared<MergeOutputForUpdateState>(), OptLevel_1, !is_dvm);
  return pm;
}

PassManagerPtr GraphKernelOptimizer::PostProcess() const {
  auto pm = std::make_shared<GraphKernelPassManager>(7, "postprocess");
  // Make Tuple for the inputs of UpdateState. (the reverse of SpreadUpdateState)
  pm->Add(std::make_shared<ShrinkUpdateState>(), OptLevel_1);

  // Recover the original output info
  pm->Add(std::make_shared<GetitemTuple>(), OptLevel_1);
  pm->Add(std::make_shared<RewriteOutputShape>(), OptLevel_1);

  auto enable_dyn_level = GetPassLevelByFlag(GraphKernelFlags::GetInstance().enable_dynamic_shape_fusion);
  // Add infershape functor for dynamic shape graph kernel
  pm->Add(std::make_shared<SetInferShapeFunctor>(), enable_dyn_level, !is_dvm);

  // Contrary to ConvertFrontEndToGraphKernel pass, adapter for dyn-shape
  pm->Add(std::make_shared<ConvertGraphKernelToFrontEnd>(), OptLevel_1);

  // Add the new tensors to the kernel_graph
  pm->Add(std::make_shared<BindValueToGraph>(), OptLevel_1);

  // Update side effect attr, update kernel graph ref pair (used in device address allocation)
  pm->Add(std::make_shared<DealWithSideEffect>(), OptLevel_1, is_dvm);
  pm->Add(std::make_shared<ConvertCallToPrim>(), OptLevel_1, is_dvm);
  return pm;
}

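// Kernel packet passes: extend graph kernels with their surrounding shape-computation ops via the symbol
// engine, then convert the resulting calls to primitives. Run separately through RunKernelPacket.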
PassManagerPtr GraphKernelOptimizer::KernelPacket() const {
  auto pm = std::make_shared<GraphKernelPassManager>(8, "kernelpacket");
  pm->Add(std::make_shared<packet::SymbolEngineExtender>(), OptLevel_0);
  pm->Add(std::make_shared<ConvertCallToPrim>(), OptLevel_0);
  return pm;
}

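// Run the full graph kernel optimization pipeline:
// preprocess -> cluster -> highlevelopt1 -> split -> highlevelopt2 -> combine -> build -> postprocess.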
void GraphKernelOptimizer::Run(const KernelGraphPtr &kernel_graph) {
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  is_gpu = (context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice);
  is_ascend = (context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice);
  is_cpu = (context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kCPUDevice);
  is_ge = (is_ascend && (context_ptr->backend_policy() == "ge") && kernel_graph->is_graph_run_mode());
  is_dvm = (GraphKernelFlags::GetInstance().kernel_generator == "DVM");
  auto cb = Callback::Instance();
  if (is_ge) {
    Callback::RegImpl(std::make_shared<CallbackImplWithInferShape>());
  }

  auto parent_graph = kernel_graph->parent_graph().lock();
  FuncGraphManagerPtr parent_manager = nullptr;
  if (parent_graph != nullptr && parent_graph->manager() != nullptr) {
    parent_manager = parent_graph->manager();
  }

  Init();

  auto optimizer = std::make_shared<GraphOptimizer>("graph_kernel_optimizer");
  optimizer->AddPassManager(PreProcess());
  optimizer->AddPassManager(Cluster());
  optimizer->AddPassManager(HighLevelOpt1());
  optimizer->AddPassManager(Split());
  optimizer->AddPassManager(HighLevelOpt2());
  optimizer->AddPassManager(Combine());
  optimizer->AddPassManager(Build());
  optimizer->AddPassManager(PostProcess());

  auto mng = GkUtils::GetFuncGraphManager(kernel_graph);
  GkUtils::UpdateFuncGraphManager(mng, kernel_graph);
  (void)optimizer->Optimize(kernel_graph);

  if (parent_graph != nullptr) {
    parent_graph->set_manager(parent_manager);
  }

  if (is_ge) {
    // Need to recover the original callback instance for other subgraph processing
    Callback::RegImpl(cb);
  }
}

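// Run only the kernel packet passes on the graph.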
void GraphKernelOptimizer::RunKernelPacket(const KernelGraphPtr &kernel_graph) {
  auto optimizer = std::make_shared<GraphOptimizer>("graph_kernel_optimizer");
  optimizer->AddPassManager(KernelPacket());
  (void)optimizer->Optimize(kernel_graph);
}

void GraphKernelOptimize(const KernelGraphPtr &kernel_graph) {
  GraphKernelOptimizer graph_kernel_optimizer;
  graph_kernel_optimizer.Run(kernel_graph);
}

void KernelPacketOptimize(const KernelGraphPtr &kernel_graph) {
  GraphKernelOptimizer graph_kernel_optimizer;
  graph_kernel_optimizer.RunKernelPacket(kernel_graph);
}

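// Returns true only if every non-null node in `nodes` is a primitive CNode that is either expandable
// (GraphKernelExpanderCloud) or clusterable (StaticShapeCluster).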
bool GraphKernelSupported(const std::vector<AnfNodePtr> &nodes) {
  static std::vector<PrimitivePtr> supported_nodes;
  if (supported_nodes.empty()) {
    supported_nodes = GraphKernelExpanderCloud::GetExpanderOps();
    auto cluster_nodes = StaticShapeCluster::GetClusterOps();
    (void)std::copy(cluster_nodes.begin(), cluster_nodes.end(), std::back_inserter(supported_nodes));
  }
  for (const auto &node : nodes) {
    if (node != nullptr && !std::any_of(supported_nodes.begin(), supported_nodes.end(),
                                        [&node](const PrimitivePtr &prim) { return IsPrimitiveCNode(node, prim); })) {
      return false;
    }
  }
  return true;
}
}  // namespace mindspore::graphkernel