/**
 * Copyright 2021-2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "backend/common/graph_kernel/adapter/graph_kernel_optimization.h"

#include <algorithm>
#include <vector>
#include <string>
#include <memory>

#include "mindspore/core/ops/array_ops.h"
#include "ir/func_graph.h"
#include "utils/ms_context.h"
#include "backend/common/graph_kernel/graph_kernel_flags.h"
#include "backend/common/graph_kernel/add_atomic_clean.h"
#include "backend/common/graph_kernel/add_stitch_atomic_clean_gpu.h"
#include "backend/common/graph_kernel/core/arithmetic_simplify.h"
#include "backend/common/graph_kernel/adapter/graph_kernel_cluster_cloud.h"
#include "backend/common/graph_kernel/core/eliminate_redundant_output.h"
#include "backend/common/graph_kernel/insert_pad.h"
#include "backend/common/graph_kernel/adapter/graph_kernel_splitter_with_py.h"
#include "backend/common/graph_kernel/adapter/graph_kernel_expander_cloud.h"
#include "backend/common/graph_kernel/adapter/callback_impl.h"
#include "backend/common/graph_kernel/cast_matmul_fusion.h"
#include "backend/common/graph_kernel/raise_reduction_precision.h"
#include "backend/common/graph_kernel/graph_kernel_cse.h"
#include "backend/common/graph_kernel/core/shape_ops_splitter.h"
#include "backend/common/graph_kernel/value_graph_binder.h"
#include "backend/common/graph_kernel/parallel_fusion.h"
#include "backend/common/graph_kernel/optimize_assign.h"
#include "backend/common/graph_kernel/core/split_umonad.h"
#include "backend/common/graph_kernel/reorder_ops.h"
#include "backend/common/graph_kernel/core/update_state_formatter.h"
#include "backend/common/graph_kernel/axis_normalizer.h"
#include "backend/common/graph_kernel/decrease_compute_precision.h"
#include "backend/common/graph_kernel/decrease_transfer_precision.h"
#include "backend/common/graph_kernel/csr_atomic_add.h"
#include "backend/common/graph_kernel/tsa_atomic_add_to_first_tensor.h"
#include "backend/common/graph_kernel/uss_atomic_add.h"
#include "backend/common/pass/getitem_tuple.h"
#include "backend/common/graph_kernel/core/graph_kernel_pass_manager.h"
#include "backend/common/graph_kernel/core/transform_op_optimizer.h"
#include "backend/common/graph_kernel/rewrite_output_shape.h"
#include "backend/common/graph_kernel/graph_kernel_recompute.h"
#include "backend/common/graph_kernel/reduce_fake_out_mem.h"
#include "backend/common/graph_kernel/depend_elimination.h"
#include "backend/common/graph_kernel/tensor_inplace.h"
#include "backend/common/graph_kernel/floatstatus_fusion.h"
#include "backend/common/graph_kernel/floatstatus_addn_fusion.h"
#include "backend/common/graph_kernel/core/graph_kernel_utils.h"
#include "backend/common/graph_kernel/compact_tensor_liveness.h"
#include "backend/common/graph_kernel/adapter/symbol_engine_builder.h"
#include "backend/common/graph_kernel/kernel_packet/symbol_engine_extender.h"
#include "backend/common/graph_kernel/convert_call_to_prim.h"
#include "backend/common/graph_kernel/core/graph_kernel_op_combiner.h"
#include "backend/common/graph_kernel/set_infershape_functor.h"
#include "backend/common/graph_kernel/recognize_softmax_grad_ext.h"
#include "backend/common/graph_kernel/convert_custom_for_ge.h"
#include "backend/common/graph_kernel/convert_input_and_attr.h"
#include "backend/common/graph_kernel/convert_bfloat16.h"
#include "backend/common/graph_kernel/deal_with_side_effect.h"
#include "backend/common/graph_kernel/fold_updatestate.h"
#include "backend/common/graph_kernel/proactive_fallback_expander.h"
#include "backend/common/graph_kernel/transpose_matmul_fusion.h"
#ifdef ENABLE_AKG
#include "backend/common/graph_kernel/graph_kernel_build.h"
#endif
#include "backend/common/graph_kernel/adapter/split_model_ascend.h"
#include "backend/common/graph_kernel/adapter/split_model_cpu.h"
#include "backend/common/graph_kernel/adapter/split_model_gpu.h"
namespace mindspore::graphkernel {
using opt::CommonSubexpressionElimination;
using opt::GetitemTuple;
using opt::GraphOptimizer;

namespace {
auto constexpr PARALLEL_OPS_LIMIT = 7;
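// Map a boolean flag to a pass level: an enabled flag runs the pass at OptLevel_1,
// a disabled flag effectively turns it off (OptLevel_MAX).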
inline unsigned int GetPassLevelByFlag(bool flag) { return flag ? OptLevel_1 : OptLevel_MAX; }
}  // namespace

void GraphKernelOptimizer::Init() const {
  // Register the split models here to ensure that the correct split model is invoked
  // when mindspore and lite are imported in the same process.
  SPLIT_MODEL_REGISTER(kAscendDevice, inner::SplitModelAscend);
  SPLIT_MODEL_REGISTER(kCPUDevice, inner::SplitModelCpu);
  SPLIT_MODEL_REGISTER(kGPUDevice, inner::SplitModelGpu);
}

PassManagerPtr GraphKernelOptimizer::PreProcess() const {
  auto pm = std::make_shared<GraphKernelPassManager>(0, "preprocess");
  // Remove redundant TupleGetItem so that ops before and after TupleGetItem can be clustered
  pm->Add(std::make_shared<GetitemTuple>(), OptLevel_1);

  // Fall back some operations for further expanding or fusing
  pm->Add(std::make_shared<ProactiveFallbackExpander>(), OptLevel_1, is_dvm);

  // Transform Transpose + Matmul to a single Matmul with the attributes trans_a/trans_b
  pm->Add(std::make_shared<TransposeMatmulFusion>(), OptLevel_2, is_ascend);

  // Convert input to attr; adapter for dynamic shape
  pm->Add(std::make_shared<ConvertFrontEndToGraphKernel>(), OptLevel_1);

  // Do DependElimination before all passes of graphkernel
  pm->Add(std::make_shared<DependElimination>(), OptLevel_1);

  // Do cse before all passes of graphkernel
  pm->Add(std::make_shared<CommonSubexpressionElimination>("cse1"), OptLevel_1);

  // Save the original output info
  pm->Add(std::make_shared<SaveOutputShape>(), OptLevel_1);

  // Change Assign(p, a, U) to Assign(Depend(p, U), a)
  pm->Add(std::make_shared<SplitAssign>(), OptLevel_1, is_gpu || is_cpu || is_dvm);

  // Spread the MakeTuple input of UpdateState
  pm->Add(std::make_shared<SpreadUpdateState>(), OptLevel_1);

  // Eliminate the common nodes generated in SpreadUpdateState
  pm->Add(std::make_shared<GraphKernelCSE>(), OptLevel_1);

  // Recognize ops that will be fused by GE
  pm->Add(std::make_shared<RecognizeSoftmaxGradExt>(), OptLevel_1, is_ge);
  return pm;
}

PassManagerPtr GraphKernelOptimizer::Cluster() const {
  auto pm = std::make_shared<GraphKernelPassManager>(1, "cluster");

  // Convert IsFinite and its user to FloatStatus
  pm->Add(std::make_shared<FloatStatusFusion>(), OptLevel_2, is_dvm);

  // Expand FloatStatus(AddN)
  pm->Add(std::make_shared<FloatStatusAddNFusion>(), OptLevel_2, is_gpu || is_dvm);

  // Expand complex basic kernels to composite kernels
  pm->Add(std::make_shared<GraphKernelExpanderCloud>(), OptLevel_1);

  // Combine supported parallel ops that share common inputs
  pm->Add(std::make_shared<GraphKernelOpCombiner>(),
          GetPassLevelByFlag(GraphKernelFlags::GetInstance().enable_parallel_op_combine));

  // Cluster basic kernels and composite kernels
  pm->Add(std::make_shared<StaticShapeCluster>(), OptLevel_1);

  // Add Cast for an op's inputs if the input data type is not supported by the op
  pm->Add(std::make_shared<ConvertBFloat16>(), OptLevel_1, is_dvm);

  // Eliminate the outputs without external users
  pm->Add(std::make_shared<EliminateRedundantOutput>(), OptLevel_1);
  return pm;
}

PassManagerPtr GraphKernelOptimizer::HighLevelOpt1() const {
  auto pm = std::make_shared<GraphKernelPassManager>(2, "highlevelopt1");

  // Remove redundant Cast(bias, fp16) for Matmul input
  pm->Add(std::make_shared<CastMatmulFusion>(), OptLevel_2, is_ascend);

  // Reorder Cast and type-insensitive nodes
  pm->Add(std::make_shared<ReorderOps>(), OptLevel_2, !is_ge);

  // Normalize the Reduce axis
  pm->Add(std::make_shared<AxisNormalizer>(), OptLevel_1);

  // Cast the input of ReduceSum from float16 to float32 for higher precision
  pm->Add(std::make_shared<RaiseReductionPrecision>(), OptLevel_2, !is_ge);

  // Insert PadAkg and UnPadAkg ops for MatMul
  pm->Add(std::make_shared<InsertPadOps>(), OptLevel_1, is_gpu);

  // Universal arithmetic simplification
  pm->Add(std::make_shared<ArithmeticSimplify>(), OptLevel_2);

  // Common subexpression elimination
  pm->Add(std::make_shared<GraphKernelCSE>(), OptLevel_2);

  // Eliminate unnecessary transform ops
  pm->Add(std::make_shared<TransformOpOptimizer>(), OptLevel_2);
  return pm;
}

PassManagerPtr GraphKernelOptimizer::Split() const {
  auto pm = std::make_shared<GraphKernelPassManager>(3, "split");
  // Duplicate certain nodes so that each copy has only one user,
  // which avoids unnecessary inputs/outputs and gives better performance.
  // Preprocess for ShapeOpsSplitter
  pm->Add(std::make_shared<ExtendOutputForUpdateState>(), OptLevel_1);
  std::vector<PrimitivePtr> duplicated_ops = {prim::kPrimReshape};
  pm->Add(std::make_shared<ShapeOpsSplitter>(duplicated_ops), OptLevel_1);
  // Use symbols to calculate a more precise edge relation between nodes
  pm->Add(std::make_shared<SymbolEngineBuilder>(false), OptLevel_1, is_dvm);
  // Split kernels according to the cost model
  pm->Add(std::make_shared<GraphKernelSplitterWithPy>(false), OptLevel_1);
  // After Simplify and Splitter, a lot of redundant getitem/maketuple
  // will be exposed; use the GetitemTuple pass to delete them.
  pm->Add(std::make_shared<GetitemTuple>(), OptLevel_1);

  // Eliminate the redundant nodes that were copied above but not handled by GraphKernelSplitter
  pm->Add(std::make_shared<MergeOutputForUpdateState>(), OptLevel_1);
  pm->Add(std::make_shared<GraphKernelCSE>(), OptLevel_1);
  pm->Add(std::make_shared<EliminateRedundantOutput>(), OptLevel_1);
  return pm;
}

PassManagerPtr GraphKernelOptimizer::HighLevelOpt2() const {
  auto pm = std::make_shared<GraphKernelPassManager>(4, "highlevelopt2");

  auto &flags = GraphKernelFlags::GetInstance();
  // Auto recompute according to local memory burst.
  auto recompute_lv = GetPassLevelByFlag(flags.recompute_increment_threshold > 0 ||
                                         flags.recompute_peak_threshold > 0 || flags.enable_csr_fusion);
  pm->Add(std::make_shared<GraphKernelRecompute>(), recompute_lv);

  // Enable atomic add
  pm->Add(std::make_shared<AtomicCleanInserter>(), OptLevel_2, is_gpu || (is_ascend && !is_ge && !is_dvm));

  // Enable atomic add for stitch nodes.
  auto level = GetPassLevelByFlag(GraphKernelFlags::GetInstance().enable_stitch_fusion);
  pm->Add(std::make_shared<StitchAtomicCleanInserter>(), level, is_gpu);

  // Enable low precision
  auto level_low_precision = GetPassLevelByFlag(GraphKernelFlags::GetInstance().enable_low_precision);
  pm->Add(std::make_shared<DecreaseTransferPrecision>(), level_low_precision);
  pm->Add(std::make_shared<DecreaseComputePrecision>(), level_low_precision, is_ascend);

  // Optimize memory
  auto memory_optimize_level = GetPassLevelByFlag(GraphKernelFlags::GetInstance().enable_auto_tensor_inplace);
  pm->Add(std::make_shared<TensorInplace>(), memory_optimize_level);

  // Enable tsa and uss
  pm->Add(std::make_shared<TsaAtomicAddToFirstTensor>(), OptLevel_1, is_gpu);
  pm->Add(std::make_shared<UssAtomicAdd>(), OptLevel_1, is_gpu);
  pm->Add(std::make_shared<CsrAtomicAdd>(), OptLevel_1, is_gpu);

  // Replace the original output (which is the input of Assign) with the overridden parameters
  pm->Add(std::make_shared<OptimizeAssign>(), OptLevel_2);
  pm->Add(std::make_shared<ExtendOutputForUpdateState>(), std::min(recompute_lv, OptLevel_2));
  pm->Add(std::make_shared<MergeOutputForUpdateState>(), std::min(recompute_lv, OptLevel_2));
  pm->Add(std::make_shared<EliminateRedundantOutput>(), std::min(recompute_lv, OptLevel_2));

  return pm;
}

PassManagerPtr GraphKernelOptimizer::Combine() const {
  auto pm = std::make_shared<GraphKernelPassManager>(5, "combine");
  // Enable parallel fusion for the gpu device
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  auto target = context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  auto level = GetPassLevelByFlag(GraphKernelFlags::GetInstance().enable_parallel_fusion);
  pm->Add(std::make_shared<FoldUpdateState>(), level, is_gpu || is_ascend);
  // An atomic-add GraphKernel node may be linked directly to UpdateState; it should be spread before parallel fusion!
  pm->Add(std::make_shared<SpreadUpdateState>(), level);
  pm->Add(std::make_shared<ParallelOpFusion>(target, ParallelConfig(PARALLEL_OPS_LIMIT)), level, is_gpu || is_ascend);

  // For memory efficiency, insert UpdateState for ops with no cnode/param inputs to avoid early launching
  pm->Add(std::make_shared<CompactTensorLiveness>(), OptLevel_2);
  return pm;
}

PassManagerPtr GraphKernelOptimizer::Build() const {
  // DVM does not need this stage
  auto pm = std::make_shared<GraphKernelPassManager>(6, "build");
  pm->Add(std::make_shared<ExtendOutputForUpdateState>(), OptLevel_1, !is_dvm);
  // Reduce fake output memory.
  auto only_static_shape_fusion = GetPassLevelByFlag(!GraphKernelFlags::GetInstance().enable_dynamic_shape_fusion);
  pm->Add(std::make_shared<ReduceFakeOutMem>(), only_static_shape_fusion, !is_dvm);
  // Compile graph kernel nodes, and inline the nodes whose compilation fails.
  auto enable_dyn_level = GetPassLevelByFlag(GraphKernelFlags::GetInstance().enable_dynamic_shape_fusion);
  pm->Add(std::make_shared<DynamicShapeCluster>(), enable_dyn_level, is_cpu || is_gpu);
  pm->Add(std::make_shared<SymbolEngineBuilder>(true), enable_dyn_level, is_cpu || is_gpu);
  pm->Add(std::make_shared<GraphKernelSplitterWithPy>(true), enable_dyn_level, is_gpu);
#ifdef ENABLE_AKG
  pm->Add(std::make_shared<GraphKernelBuild>(), OptLevel_1, !is_ge && !is_dvm);
#endif
  pm->Add(std::make_shared<ConvertCustomForGE>(), OptLevel_1, is_ge);
  pm->Add(std::make_shared<GeneratedDependElimination>(), OptLevel_2, is_gpu || (is_ascend && !is_ge && !is_dvm));
  pm->Add(std::make_shared<GetitemTuple>(), OptLevel_1, !is_dvm);
  pm->Add(std::make_shared<MergeOutputForUpdateState>(), OptLevel_1, !is_dvm);
  return pm;
}

PassManagerPtr GraphKernelOptimizer::PostProcess() const {
  auto pm = std::make_shared<GraphKernelPassManager>(7, "postprocess");
  // Make a Tuple for the inputs of UpdateState (the reverse of SpreadUpdateState)
  pm->Add(std::make_shared<ShrinkUpdateState>(), OptLevel_1);

  // Recover the original output info
  pm->Add(std::make_shared<GetitemTuple>(), OptLevel_1);
  pm->Add(std::make_shared<RewriteOutputShape>(), OptLevel_1);

  auto enable_dyn_level = GetPassLevelByFlag(GraphKernelFlags::GetInstance().enable_dynamic_shape_fusion);
  // Add an infershape functor for dynamic shape graph kernels
  pm->Add(std::make_shared<SetInferShapeFunctor>(), enable_dyn_level, !is_dvm);

  // The reverse of the ConvertFrontEndToGraphKernel pass; adapter for dynamic shape
  pm->Add(std::make_shared<ConvertGraphKernelToFrontEnd>(), OptLevel_1);

  // Add the new tensors to the kernel_graph
  pm->Add(std::make_shared<BindValueToGraph>(), OptLevel_1);

  // Update the side effect attr and the kernel graph's ref pairs (used in device address allocation)
  pm->Add(std::make_shared<DealWithSideEffect>(), OptLevel_1, is_dvm);
  pm->Add(std::make_shared<ConvertCallToPrim>(), OptLevel_1, is_dvm);
  return pm;
}

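// Standalone pipeline used by RunKernelPacket(): extend the symbol engine over the graph and
// convert the resulting call nodes to primitives.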
PassManagerPtr GraphKernelOptimizer::KernelPacket() const {
  auto pm = std::make_shared<GraphKernelPassManager>(8, "kernelpacket");
  pm->Add(std::make_shared<packet::SymbolEngineExtender>(), OptLevel_0);
  pm->Add(std::make_shared<ConvertCallToPrim>(), OptLevel_0);
  return pm;
}

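// Entry point of the main pipeline: detect the backend (GPU / Ascend / CPU, GE, DVM) and run the
// stages in order: preprocess, cluster, highlevelopt1, split, highlevelopt2, combine, build, postprocess.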
void GraphKernelOptimizer::Run(const KernelGraphPtr &kernel_graph) {
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  is_gpu = (context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice);
  is_ascend = (context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice);
  is_cpu = (context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kCPUDevice);
  is_ge = (is_ascend && (context_ptr->backend_policy() == "ge") && kernel_graph->is_graph_run_mode());
  is_dvm = (GraphKernelFlags::GetInstance().kernel_generator == "DVM");
  auto cb = Callback::Instance();
  if (is_ge) {
    Callback::RegImpl(std::make_shared<CallbackImplWithInferShape>());
  }

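  // Preserve the parent graph's manager so that it can be restored after optimization.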
  auto parent_graph = kernel_graph->parent_graph().lock();
  FuncGraphManagerPtr parent_manager = nullptr;
  if (parent_graph != nullptr && parent_graph->manager() != nullptr) {
    parent_manager = parent_graph->manager();
  }

  Init();

  auto optimizer = std::make_shared<GraphOptimizer>("graph_kernel_optimizer");
  optimizer->AddPassManager(PreProcess());
  optimizer->AddPassManager(Cluster());
  optimizer->AddPassManager(HighLevelOpt1());
  optimizer->AddPassManager(Split());
  optimizer->AddPassManager(HighLevelOpt2());
  optimizer->AddPassManager(Combine());
  optimizer->AddPassManager(Build());
  optimizer->AddPassManager(PostProcess());

  auto mng = GkUtils::GetFuncGraphManager(kernel_graph);
  GkUtils::UpdateFuncGraphManager(mng, kernel_graph);
  (void)optimizer->Optimize(kernel_graph);

  if (parent_graph != nullptr) {
    parent_graph->set_manager(parent_manager);
  }

  if (is_ge) {
    // Recover the original callback instance for other subgraph processing
    Callback::RegImpl(cb);
  }
}

void GraphKernelOptimizer::RunKernelPacket(const KernelGraphPtr &kernel_graph) {
  auto optimizer = std::make_shared<GraphOptimizer>("graph_kernel_optimizer");
  optimizer->AddPassManager(KernelPacket());
  (void)optimizer->Optimize(kernel_graph);
}

void GraphKernelOptimize(const KernelGraphPtr &kernel_graph) {
  GraphKernelOptimizer graph_kernel_optimizer;
  graph_kernel_optimizer.Run(kernel_graph);
}

void KernelPacketOptimize(const KernelGraphPtr &kernel_graph) {
  GraphKernelOptimizer graph_kernel_optimizer;
  graph_kernel_optimizer.RunKernelPacket(kernel_graph);
}

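// Returns true only when every given node matches a primitive that graph kernel can handle,
// i.e. it is either expandable (GraphKernelExpanderCloud) or clusterable (StaticShapeCluster).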
bool GraphKernelSupported(const std::vector<AnfNodePtr> &nodes) {
  static std::vector<PrimitivePtr> supported_nodes;
  if (supported_nodes.empty()) {
    supported_nodes = GraphKernelExpanderCloud::GetExpanderOps();
    auto cluster_nodes = StaticShapeCluster::GetClusterOps();
    (void)std::copy(cluster_nodes.begin(), cluster_nodes.end(), std::back_inserter(supported_nodes));
  }
  for (const auto &node : nodes) {
    if (node != nullptr && !std::any_of(supported_nodes.begin(), supported_nodes.end(),
                                        [&node](const PrimitivePtr &prim) { return IsPrimitiveCNode(node, prim); })) {
      return false;
    }
  }
  return true;
}
}  // namespace mindspore::graphkernel