/**
 * Copyright 2021-2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "plugin/device/cpu/hal/hardware/cpu_device_context.h"
#include <map>
#include <string>
#include "plugin/device/cpu/hal/device/cpu_device_address.h"
#include "plugin/device/cpu/hal/device/cpu_memory_manager.h"
#include "plugin/device/cpu/optimizer/reg_cpu_const_input_to_attr.h"
#include "plugin/device/cpu/optimizer/print_value_type.h"
#include "plugin/device/cpu/hal/hardware/cpu_somas.h"
#include "plugin/device/cpu/hal/device/cpu_hash_table_util.h"
#ifdef ENABLE_AKG
#include "plugin/device/cpu/kernel/akg/akg_cpu_kernel_build.h"
#endif
#include "plugin/factory/ms_factory.h"
#include "plugin/device/cpu/kernel/cpu_kernel.h"
#include "kernel/kernel_build_info.h"
#include "kernel/framework_utils.h"
#include "plugin/device/cpu/hal/device/kernel_select_cpu.h"
#include "utils/trace_base.h"
#include "backend/common/graph_kernel/graph_kernel_flags.h"
#include "include/backend/optimizer/optimizer.h"
#include "include/backend/optimizer/pass_manager.h"
#include "backend/common/optimizer/common_backend_optimization.h"
#include "backend/common/optimizer/dynamic_shape/dynamic_shape_helper.h"
#include "plugin/device/cpu/optimizer/insert_cast_cpu.h"
#include "plugin/device/cpu/optimizer/insert_cast_to_pyexecute.h"
#include "plugin/device/cpu/optimizer/insert_format_transform_op.h"
#include "plugin/device/cpu/optimizer/softmax_grad_fusion.h"
#include "plugin/device/cpu/optimizer/matmul_biasadd_fusion.h"
#include "plugin/device/cpu/optimizer/matmul_biasadd_relu_fusion.h"
#include "backend/common/pass/insert_type_transform_op.h"
#include "backend/common/pass/flatten_value_sequence_in_pyexecute.h"
#include "backend/common/pass/communication_op_fusion.h"
#include "backend/common/pass/replace_node_by_proxy.h"
#include "backend/common/pass/erase_visit_attr.h"
#include "backend/common/pass/add_training_attr.h"
#include "backend/common/pass/insert_tensor_move_for_communication.h"
#include "backend/common/pass/dynamic_sequence_ops_adaptation.h"
#include "backend/common/graph_kernel/adapter/graph_kernel_optimization.h"
#include "backend/common/expander/fallback/expander_fallback.h"
#include "backend/common/graph_kernel/value_graph_binder.h"
#include "include/backend/anf_runtime_algorithm.h"
#include "include/common/utils/anfalgo.h"
#include "plugin/device/cpu/hal/profiler/cpu_profiling.h"
#if defined(__linux__) && defined(WITH_BACKEND)
#include "plugin/device/cpu/hal/hardware/ms_collective_comm_lib.h"
#endif
#ifndef ENABLE_SECURITY
#include "include/backend/debug/data_dump/dump_json_parser.h"
#endif
#ifdef ENABLE_DUMP_IR
#include "include/common/debug/anf_ir_dump.h"
#endif
#include "include/common/profiler.h"
#include "plugin/device/cpu/hal/device/cpu_kernel_task.h"
#include "plugin/device/cpu/hal/device/cpu_device_synchronizer.h"
#include "ops/framework_ops.h"
#include "kernel/oplib/oplib.h"
#include "runtime/device/move_to.h"

namespace mindspore {
namespace device {
namespace cpu {
namespace {
const char kModelNameCPU[] = "CPU";
const char kEventOptimizeGraph[] = "OptimizeGraph";
const char kStageSetKernelInfo[] = "SetKernelInfo";

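// Maps a runtime kernel task type to the matching CPU kernel task implementation.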
runtime::KernelTaskPtr GetTaskByTaskType(const runtime::KernelTaskType &task_type,
                                         const std::shared_ptr<runtime::KernelTaskContext> &task_context) {
  switch (task_type) {
    case runtime::KernelTaskType::kCONTIGUOUS_TASK:
      return std::make_shared<CpuContiguousKernelTask>(task_context);
    case runtime::KernelTaskType::kCOPY_TASK:
      return std::make_shared<CpuCopyWithSliceKernelTask>(task_context);
    default:
      MS_LOG(EXCEPTION) << "KernelTaskType is invalid, task_type:" << task_type;
  }
}
}  // namespace
using mindspore::kernel::KernelBuildInfo;

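// One-time, lock-guarded initialization of the CPU device context: initializes the resource manager,
// parses the dump json config when dump is enabled, and logs when CPU was chosen as the default target.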
void CPUDeviceContext::Initialize() {
#ifdef __APPLE__
  std::lock_guard<SpinLock> spin_lock(init_lock_);
#else
  std::lock_guard<std::mutex> lock(init_mutex_);
#endif
  if (initialized_) {
    return;
  }
  MS_EXCEPTION_IF_NULL(device_res_manager_);
  device_res_manager_->Initialize();
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
#ifndef ENABLE_SECURITY
  if (ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kCPUDevice) {
    // Dump the json config file if dump is enabled.
    uint32_t rank_id = 0;
    auto &json_parser = DumpJsonParser::GetInstance();
    json_parser.Parse();
    json_parser.CopyDumpJsonToDir(rank_id);
    json_parser.CopyMSCfgJsonToDir(rank_id);
  }
#endif
#ifdef __linux__
  if (ms_context->IsDefaultDeviceTarget() && ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kCPUDevice) {
    MS_LOG(INFO)
      << "No device_target set; CPU is used as the default. You can call mindspore.set_context(device_target=\"XXX\") to specify one.";
  }
#endif  // __linux__
  initialized_ = true;
}

void CPUDeviceContext::Destroy() {
  MS_EXCEPTION_IF_NULL(device_res_manager_);
  device_res_manager_->Destroy();
  initialized_ = false;
}

void CPUDeviceResManager::Initialize() {
  mem_manager_ = std::make_shared<CPUMemoryManager>();
  MS_EXCEPTION_IF_NULL(mem_manager_);
}

void CPUDeviceResManager::Destroy() {
  // Release memory.
  if (mem_manager_ != nullptr) {
    mem_manager_->Finalize();
    mem_manager_ = nullptr;
  }
}

void *CPUDeviceResManager::AllocateMemory(size_t size, uint32_t stream_id) const {
  MS_EXCEPTION_IF_NULL(mem_manager_);
  return mem_manager_->MallocMemFromMemPool(size, false, false, stream_id);
}

void CPUDeviceResManager::FreeMemory(void *ptr) const {
  MS_EXCEPTION_IF_NULL(ptr);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  mem_manager_->FreeMemFromMemPool(ptr);
}

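// Frees the addresses in free_addrs back to the memory pool while keeping keep_addrs
// (with the given sizes) alive.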
void CPUDeviceResManager::FreePartMemorys(const std::vector<void *> &free_addrs, const std::vector<void *> &keep_addrs,
                                          const std::vector<size_t> &keep_addr_sizes) const {
  CPUMemoryPool::GetInstance().FreePartTensorMems(free_addrs, keep_addrs, keep_addr_sizes);
}

std::vector<void *> CPUDeviceResManager::AllocateContinuousMemory(const std::vector<size_t> &size_list,
                                                                  uint32_t stream_id) const {
  MS_EXCEPTION_IF_NULL(mem_manager_);
  return mem_manager_->MallocContinuousMemFromMemPool(size_list, stream_id);
}

namespace {
// Create user data content (such as a CPU hash table) and set the user data reference into device_address.
void FillUserData(const UserDataPtr &user_data, DeviceAddress *device_address) {
  MS_EXCEPTION_IF_NULL(user_data);
  MS_EXCEPTION_IF_NULL(device_address);

  // Save a reference to the user data in the device address.
  device_address->set_user_data(user_data);

  const auto &user_data_type = user_data->get<UserDataType>(kUserDataType);
  if (user_data_type == nullptr) {
    return;
  }
  if (*user_data_type == UserDataType::kUserTypeHashTable) {
    auto key_type = user_data->get<TypeId>(kHashTableKeyType);
    auto value_type = user_data->get<TypeId>(kHashTableValueType);
    MS_EXCEPTION_IF_NULL(key_type);
    MS_EXCEPTION_IF_NULL(value_type);
    const auto &iter = cpu_hash_table_funcs.find({*key_type, *value_type});
    if (iter != cpu_hash_table_funcs.end()) {
      // Create the CPU hash table and set it into `user_data`.
      return std::get<kCreateFuncIndex>(iter->second)(user_data);
    } else {
      MS_LOG(EXCEPTION) << "Unsupported hash table type, key type:" << TypeIdLabel(*key_type)
                        << ", value type:" << TypeIdLabel(*value_type);
    }
  } else {
    MS_LOG(EXCEPTION) << "Invalid user data type:" << *user_data_type;
  }
}
}  // namespace

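// Builds a CPU device address from a kernel tensor, defaulting the device name/id from the
// device context and attaching user data (e.g. a CPU hash table) when present.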
DeviceAddressPtr CPUDeviceResManager::CreateDeviceAddress(const KernelTensorPtr &kernel_tensor) const {
  MS_EXCEPTION_IF_NULL(kernel_tensor);
  if (kernel_tensor->device_name().empty()) {
    kernel_tensor->set_device_name(device_context_->device_context_key().device_name_);
    kernel_tensor->set_device_id(device_context_->device_context_key().device_id_);
  }
  auto device_address = std::make_shared<CPUDeviceAddress>(kernel_tensor);

  const auto &user_data = kernel_tensor->user_data();
  if (user_data != nullptr) {
    FillUserData(user_data, device_address.get());
  }
  device_address->set_device_synchronizer(std::make_shared<CPUDeviceSynchronizer>());
  return device_address;
}

void CPUDeviceResManager::MoveTo(const tensor::TensorPtr &src_tensor, const tensor::TensorPtr &dst_tensor,
                                 const std::string &to, bool blocking, bool *return_self) {
  device::MoveTo(src_tensor, dst_tensor, to, blocking, return_self);
}

DeviceAddressPtr CPUDeviceResManager::CreateDeviceAddress(void *ptr, size_t size, const ShapeVector &shape_vector,
                                                          const Format &format, TypeId type_id,
                                                          const std::string &device_name, uint32_t device_id,
                                                          uint32_t stream_id) const {
  return std::make_shared<CPUDeviceAddress>(ptr, size, shape_vector, format, type_id, device_name, device_id,
                                            stream_id);
}

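// Graph optimization entry point. Single-op graphs take a lightweight path (kernel selection plus
// cast insertion); whole graphs additionally run MindIR fusion passes, hardware optimization, and
// optional graph-kernel fusion.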
void CPUKernelExecutor::OptimizeGraph(const FuncGraphPtr &graph) const {
  MS_EXCEPTION_IF_NULL(graph);
  auto kernel_graph = graph->cast<KernelGraphPtr>();
  MS_EXCEPTION_IF_NULL(kernel_graph);
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  auto enable_lazy_inline = ms_context->CellReuseLevel() != CellReuseLevel::kNoCellReuse;
  if (enable_lazy_inline) {
    MS_LOG(EXCEPTION) << "CPU does not support the lazy_inline feature, "
                      << "please do not mark @lazy_inline in the cell's __init__ func.";
  }
  if (kernel_graph->is_from_single_op()) {
    SetOperatorInfo(kernel_graph);
    SingleOpGraphOptimize(kernel_graph);
    UpdateKernelRefInfo(kernel_graph);
  } else {
    // The passes in this function must run before operator selection: SetOperatorInfo().
    OptimizeMindIR(kernel_graph);
    // Update the graph's dynamic shape attribute.
    opt::AddDynamicShapeAttrPass(kernel_graph);

    SetOperatorInfo(kernel_graph);
    // SetOperatorInfo may generate new nodes, so the kernel object types need to be set again.
    kernel_graph->SetKernelObjectTypesForUnrealNodes();
#ifdef ENABLE_DUMP_IR
    if (ms_context->CanDump(kIntroductory)) {
      DumpIR("hwopt_comm_after_kernel_select_" + graph->ToString() + ".ir", graph, true);
    }
#endif

    OptimizeGraphImpl(kernel_graph);

    // Run final optimization.
    opt::CommonFinalOptimization(kernel_graph);

    // Run graph kernel fusion optimization.
    if (graphkernel::GraphKernelFlags::GetInstance().IsEnableGraphKernel()) {
      graphkernel::GraphKernelOptimize(kernel_graph);
      kernel_graph->SetExecOrderByDefault();
    }
  }
}

void CPUKernelExecutor::UpdateKernelRefInfo(const KernelGraphPtr &graph) const {
  MS_EXCEPTION_IF_NULL(graph);
  const std::vector<CNodePtr> &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    MS_EXCEPTION_IF_NULL(kernel);
    const std::string &op_name = common::AnfAlgo::GetCNodeName(kernel);
    if (IsPrimitiveCNode(kernel, prim::kPrimCustom) &&
        mindspore::kernel::OpLib::FindOp(op_name, kernel::OpImplyType::kImplyCPU) == nullptr) {
      MS_LOG(DEBUG) << "No operator information found for Custom operator [" << op_name << "]";
      return;
    }

    auto kernel_attr_list = kernel::NativeCpuKernelMod::GetCpuSupportedList(op_name);
    if (kernel_attr_list.empty()) {
      MS_LOG(DEBUG) << "kernel_attr_list is empty";
      return;
    }

    auto kernel_info = dynamic_cast<device::KernelInfo *>(kernel->kernel_info());
    MS_EXCEPTION_IF_NULL(kernel_info);
    kernel_info->set_ref_map(kernel_attr_list[0].GetAllOutInRef(), kernel_attr_list[0].GetOutInRefMap());
  }
}

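// MindIR-level fusion passes that must run before kernel selection.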
void CPUKernelExecutor::OptimizeMindIR(const KernelGraphPtr &graph) const {
  MS_EXCEPTION_IF_NULL(graph);
  auto optimizer = std::make_shared<opt::GraphOptimizer>();
  auto pm = std::make_shared<opt::PassManager>();
  pm->AddPass(std::make_shared<opt::SoftmaxGradFusionCpu>("softmax_grad_fusion_cpu"));
  // Match MatMul+BiasAdd+ReLU first; if there is no match, then match MatMul+BiasAdd.
  pm->AddPass(std::make_shared<opt::MatMulBiasAddReluFusionCPU>("matmul_biasadd_relu_fusion_cpu"));
  pm->AddPass(std::make_shared<opt::DynamicSequenceOpsAdaptation>());
  optimizer->AddPassManager(pm);
  (void)optimizer->Optimize(graph);
  graph->SetExecOrderByDefault();
}

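// Hardware-level optimization pipeline run after kernel selection: type and format
// transforms, cast insertion, and communication-related passes.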
void CPUKernelExecutor::OptimizeGraphImpl(const KernelGraphPtr &graph) const {
  MS_EXCEPTION_IF_NULL(graph);
  auto optimizer = std::make_shared<opt::GraphOptimizer>();
  auto pm = std::make_shared<opt::PassManager>();
  pm->AddPass(std::make_shared<opt::InsertTypeTransformOp>("insert_type_transform_op"));
  pm->AddPass(std::make_shared<opt::FlattenValueSequenceInPyExecute>("flatten_value_sequence_in_pyexecute"));
  pm->AddPass(std::make_shared<opt::InsertFormatTransformOpCPU>("insert_format_transform_op_cpu"));
  pm->AddPass(std::make_shared<opt::AllReduceFusion>());
  pm->AddPass(std::make_shared<opt::InsertCastCPU>("insert_cast"));
  pm->AddPass(std::make_shared<opt::EraseVisitAttr>());
  pm->AddPass(std::make_shared<opt::InsertTensorMoveForCommunication>());
  pm->AddPass(std::make_shared<opt::AddTrainingAttr>());
  pm->AddPass(std::make_shared<opt::PrintValueType>("print_value_type"));
  pm->AddPass(std::make_shared<opt::InsertCastToPyExecute>("insert_cast_for_pyexecute"));
  optimizer->AddPassManager(pm);
  (void)optimizer->Optimize(graph);
  graph->SetExecOrderByDefault();
}

void CPUKernelExecutor::SingleOpGraphOptimize(const KernelGraphPtr &graph) const {
  MS_EXCEPTION_IF_NULL(graph);
  auto optimizer = std::make_shared<opt::GraphOptimizer>();
  auto pm = std::make_shared<opt::PassManager>();
  pm->AddPass(std::make_shared<opt::InsertCastCPU>("insert_cast"));
  optimizer->AddPassManager(pm);
  (void)optimizer->Optimize(graph);
  graph->SetExecOrderByDefault();
}

namespace {
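// Control ops executed in the backend skip normal kernel selection; build a default
// kernel build info from the inferred input/output types instead.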
void SetControlOpInfo(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  std::vector<std::string> inputs_format;
  std::vector<TypeId> inputs_type;
  size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
  for (size_t input_index = 0; input_index < input_num; ++input_index) {
    (void)inputs_format.emplace_back(kOpFormat_DEFAULT);
    inputs_type.push_back(common::AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, input_index));
  }
  std::vector<std::string> outputs_format;
  std::vector<TypeId> outputs_type;
  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  for (size_t output_index = 0; output_index < output_num; ++output_index) {
    (void)outputs_format.emplace_back(kOpFormat_DEFAULT);
    outputs_type.push_back(common::AnfAlgo::GetOutputInferDataType(kernel_node, output_index));
  }

  auto builder = std::make_shared<KernelBuildInfo::KernelBuildInfoBuilder>();
  builder->SetInputsFormat(inputs_format);
  builder->SetInputsDeviceType(inputs_type);
  builder->SetOutputsFormat(outputs_format);
  builder->SetOutputsDeviceType(outputs_type);

  AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), kernel_node.get());
}

// Before creating a kernel, check whether the node has completed operator selection. If not, perform
// operator selection to set the kernel info.
void SetKernelInfoBeforeCreateKernel(const std::vector<CNodePtr> &nodes) {
  // Check whether the node has completed operator selection.
  for (const auto &node : nodes) {
    if (AnfAlgo::GetSelectKernelBuildInfo(node) != nullptr) {
      continue;
    }

    // Kernel selection process for non-control ops.
    if (!common::AnfAlgo::IsBpropCutOpExecInBackend(node)) {
      auto [msg, etype] = SetKernelInfoWithMsg(node);
      if (!msg.empty()) {
        MS_EXCEPTION(etype) << "#umsg#Kernel select failed:#umsg#" << msg;
      }
    } else {
      // Kernel selection process for control ops.
      SetControlOpInfo(node);
    }
  }
}
}  // namespace

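// Selects kernels for every node in the execution order. If selection fails for a node,
// try to expand it into basic ops and select again; only raise when the expansion also fails.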
void CPUKernelExecutor::SetOperatorInfo(const KernelGraphPtr &graph) const {
  MS_EXCEPTION_IF_NULL(graph);
  (void)profiler::CollectHostInfo(kModelNameCPU, kEventOptimizeGraph, kStageSetKernelInfo, 1, 0, 0);
  bool do_expand = false;
  auto mng = graph->manager();
  if (mng == nullptr) {
    mng = Manage(graph, true);
    MS_EXCEPTION_IF_NULL(mng);
    graph->set_manager(mng);
  }
  auto &node_list = graph->execution_order();
  for (auto &node : node_list) {
    if (!common::AnfAlgo::IsBpropCutOpExecInBackend(node)) {
      auto [msg, etype] = SetKernelInfoWithMsg(node);
      if (msg.empty()) {
        continue;
      }
      auto f = [](const CNodePtr &n) {
        auto res = SetKernelInfoWithMsg(n);
        return res.first.empty();
      };
      auto expand_ret = expander::TryExpandCNode(node, f);
      if (!expand_ret) {
        constexpr auto recursive_level = 2;
        MS_EXCEPTION(etype) << "#umsg#Kernel select failed:#umsg#" << msg
                            << "\nnode: " << node->DebugString(recursive_level);
      }
      MS_LOG(INFO) << msg << " But the node was expanded successfully.";
      do_expand = true;
    } else {
      SetControlOpInfo(node);
    }
  }
  if (do_expand) {
    (void)graphkernel::BindValueToGraph().Run(graph);
    graph->SetExecOrderByDefault();
  }
  (void)profiler::CollectHostInfo(kModelNameCPU, kEventOptimizeGraph, kStageSetKernelInfo, 1, 0, 1);
}

kernel::KernelModPtr CPUKernelExecutor::CreateKernelMod(const std::string &op_name) const {
  return kernel::Factory<kernel::NativeCpuKernelMod>::Instance().Create(op_name);
}

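// Creates kernel mods for the given nodes: AKG kernels are collected and built in parallel
// at the end, while native CPU kernels are created from the factory, initialized, and
// resized when their shapes are already known.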
void CPUKernelExecutor::CreateKernel(const std::vector<CNodePtr> &nodes) const {
  SetKernelInfoBeforeCreateKernel(nodes);

  kernel::KernelMeta *bin_map = kernel::KernelMeta::GetInstance();
  std::vector<AnfNodePtr> akg_nodes;
  for (const auto &node : nodes) {
    MS_EXCEPTION_IF_NULL(node);
    if (common::AnfAlgo::IsBpropCutOpExecInBackend(node)) {
      continue;
    }
    if (session::AnfRuntimeAlgorithm::GetKernelType(node) == KernelType::AKG_KERNEL) {
      if (!bin_map->initialized()) {
        bin_map->Initialize();
      }
      akg_nodes.push_back(node);
      continue;
    }
    std::string kernel_name = common::AnfAlgo::GetCNodeName(node);

    std::shared_ptr<kernel::NativeCpuKernelMod> cpu_kernel =
      kernel::Factory<kernel::NativeCpuKernelMod>::Instance().Create(kernel_name);

    if (cpu_kernel == nullptr) {
      MS_LOG(INTERNAL_EXCEPTION) << "#dmsg#Kernel build failed:#dmsg#Build cpu operator[" << node->fullname_with_scope()
                                 << "] failed";
    }

    auto kernel_attrs = cpu_kernel->GetOpSupport();
    kernel::SetCpuRefMapToKernelInfo(node, kernel_attrs);
    auto thread_pool = kernel::GetActorMgrInnerThreadPool();
    cpu_kernel->SetThreadPool(thread_pool);
    std::vector<KernelTensor *> input_kernel_tensors = AnfAlgo::GetOrCreateAllInputKernelTensors(node);
    std::vector<KernelTensor *> output_kernel_tensors = AnfAlgo::GetOrCreateAllOutputKernelTensors(node);
    auto ret = cpu_kernel->Init(common::AnfAlgo::GetCNodePrimitive(node), input_kernel_tensors, output_kernel_tensors);
    if (!ret) {
      MS_LOG(EXCEPTION) << trace::DumpSourceLines(node);
    }
    if (kernel::CheckResizeCondition(node)) {
      if (cpu_kernel->Resize(input_kernel_tensors, output_kernel_tensors) == kernel::KRET_RESIZE_FAILED) {
        MS_LOG(INTERNAL_EXCEPTION) << "#dmsg#Kernel build failed:#dmsg#CPU kernel op [" << node->fullname_with_scope()
                                   << "] resize failed.";
      }
    }

    AnfAlgo::SetKernelMod(cpu_kernel, node.get());
  }
#ifdef ENABLE_AKG
  kernel::AkgCpuKernelBuilder akg_cpu_kernel_builder;
  (void)akg_cpu_kernel_builder.SingleOpParallelBuild(akg_nodes);
#endif
}

void CPUKernelExecutor::PreprocessBeforeRun(const FuncGraphPtr &graph) const {
  MS_EXCEPTION_IF_NULL(graph);
  auto kernel_graph = graph->cast<KernelGraphPtr>();
  MS_EXCEPTION_IF_NULL(kernel_graph);
  if (!kernel_graph->is_from_single_op()) {
    // Remove this reorder after the PS feature finishes adapting push/pull in auto_monad.
    auto execution_order = kernel_graph->execution_order();
    common::AnfAlgo::ReorderPosteriorExecList(NOT_NULL(&execution_order));
    kernel_graph->set_execution_order(execution_order);
  }
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  // Apply somas (static memory reuse) when the memory optimize level is above O0.
  if (ms_context->get_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL) != kOptimizeO0) {
    auto somas = std::make_shared<CPUSomas>();
    bool ret = somas->Assign(kernel_graph);
    if (ret) {
      MS_LOG(INFO) << "Somas allocate success for graph " << kernel_graph->graph_id()
                   << " somas size: " << kernel_graph->somas_whole_block_size();
    } else if (somas->IsSupportSomas(*kernel_graph)) {
      MS_LOG(WARNING) << "Somas allocate failed for graph " << kernel_graph->graph_id();
    }
  }
  MS_LOG(INFO) << "Status record: end preprocess before run graph. graph id: " << kernel_graph->graph_id();
}

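// Launches a single kernel, routing through the profiler path when op-time profiling is enabled.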
bool CPUKernelExecutor::LaunchKernel(const CNodePtr &kernel, const std::vector<KernelTensor *> &inputs,
                                     const std::vector<KernelTensor *> &workspace,
                                     const std::vector<KernelTensor *> &outputs, KernelMod *kernel_mod,
                                     void * /* stream */) const {
  MS_EXCEPTION_IF_NULL(kernel);

#ifndef ENABLE_SECURITY
  const auto &profiler_inst = profiler::cpu::CPUProfiler::GetInstance();
  MS_EXCEPTION_IF_NULL(profiler_inst);
  if (profiler_inst->GetEnableFlag() && profiler_inst->GetOpTimeFlag()) {
    auto ret = LaunchKernelWithProfiling(kernel, inputs, workspace, outputs, kernel_mod);
    return ret;
  }
#endif
  auto ret = DoLaunchKernel(kernel, inputs, workspace, outputs, kernel_mod);
  return ret;
}

bool CPUKernelExecutor::ExecuteKernelTask(const runtime::KernelTaskType &task_type,
                                          const device::DeviceAddressPtrList &input_addr_list,
                                          const device::DeviceAddressPtrList &output_addr_list,
                                          const size_t &stream_id) const {
  auto task_context =
    std::make_shared<runtime::KernelTaskContext>(device_context_, input_addr_list, output_addr_list, nullptr);
  auto task = GetTaskByTaskType(task_type, task_context);
  MS_EXCEPTION_IF_NULL(task);

  auto ret = task->RunWithRet();
  if (!ret) {
    MS_LOG(EXCEPTION) << "Exec task failed, task_type:" << task_type;
  }
  return ret;
}

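// Loads the collective communication library: the MPI-based library when MPI is in use,
// otherwise the built-in MS collective library (Linux backend only).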
bool CPUDeviceResManager::LoadCollectiveCommLib() {
  bool using_mpi = common::UseMPI();
  if (using_mpi) {
    std::string mpi_comm_lib_name = "libmpi_collective.so";
    auto loader = std::make_shared<CollectiveCommLibLoader>(mpi_comm_lib_name);
    MS_EXCEPTION_IF_NULL(loader);
    if (!loader->Initialize()) {
      MS_LOG(EXCEPTION) << "Failed to load mpi collective library.";
    }

    void *collective_comm_lib_handle = loader->collective_comm_lib_ptr();
    MS_EXCEPTION_IF_NULL(collective_comm_lib_handle);

    auto instance_func = DlsymFuncObj(communication_lib_instance, collective_comm_lib_handle);
    collective_comm_lib_ = instance_func();
    MS_EXCEPTION_IF_NULL(collective_comm_lib_);
  } else {
#if defined(__linux__) && defined(WITH_BACKEND)
    collective_comm_lib_ = &MsCollectiveCommLib::GetInstance();
    MS_EXCEPTION_IF_NULL(collective_comm_lib_);
#endif
  }
  return true;
}

bool CPUKernelExecutor::LaunchKernelWithProfiling(const CNodePtr &kernel, const std::vector<KernelTensor *> &inputs,
                                                  const std::vector<KernelTensor *> &workspace,
                                                  const std::vector<KernelTensor *> &outputs,
                                                  KernelMod *kernel_mod) const {
  MS_EXCEPTION_IF_NULL(kernel);

  auto profiler_inst = profiler::cpu::CPUProfiler::GetInstance();
  MS_EXCEPTION_IF_NULL(profiler_inst);

  uint32_t pid = IntToUint(getpid());
  // CPU supports multi-threaded profiling with MindRT.
  profiler_inst->OpDataProducerBeginParallel(kernel->fullname_with_scope(), pid);
  bool ret = DoLaunchKernel(kernel, inputs, workspace, outputs, kernel_mod);
  profiler_inst->OpDataProducerEndParallel(kernel->fullname_with_scope());
  profiler_inst->RecordFrameWorkInfo(kernel);
  return ret;
}

bool CPUKernelExecutor::DoLaunchKernel(const CNodePtr &kernel, const std::vector<KernelTensor *> &inputs,
                                       const std::vector<KernelTensor *> &workspace,
                                       const std::vector<KernelTensor *> &outputs, KernelMod *kernel_mod) const {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  uint64_t start_time = 0;
  PROFILER_START(start_time);
  auto ret = kernel_mod->Launch(inputs, workspace, outputs, nullptr);
  PROFILER_END(start_time, runtime::ProfilerModule::kKernel, runtime::ProfilerEvent::kKernelLaunch,
               kernel->fullname_with_scope(), false);
  return ret;
}

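// For nodes that backed off to CPU from another device, verify that the selected kernel build
// info strictly matches a supported CPU kernel attr, then mark the node as a CPU kernel and
// rebuild it.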
void CPUKernelExecutor::RebuildKernelSelectBackoffOp(const std::vector<CNodePtr> &nodes) const {
  for (auto &node : nodes) {
    MS_EXCEPTION_IF_NULL(node);
    if (!AnfAlgo::IsKernelSelectBackoffOp(node)) {
      continue;
    }
    auto [failure_info, failure_type] = AnfAlgo::GetKernelSelectBackoffInfo(node);
    if (IsVmapNotSupported(node)) {
      MS_EXCEPTION(failure_type) << "#umsg#Kernel select failed:#umsg#" << failure_info;
    }

    // Check whether the kernel build info strictly matches the supported kernel attrs.
    const auto &kernel_build_info = AnfAlgo::GetSelectKernelBuildInfo(node);
    MS_EXCEPTION_IF_NULL(kernel_build_info);
    const auto &kernel_attr = kernel::GetKernelAttrFromBuildInfo(kernel_build_info);
    const auto &supported_kernel_attrs =
      kernel::NativeCpuKernelMod::GetCpuSupportedList(common::AnfAlgo::GetCNodeName(node));
    const auto &match_result = kernel::MatchKernelAttrStrict(kernel_attr, supported_kernel_attrs);
    auto attr_info = kernel::FetchPrintInfoByKernelAttr(kernel_attr);
    if (!match_result.first) {
      MS_LOG(INFO) << "Backoff and rebuild kernel on CPU failed for node: " << node->fullname_with_scope()
                   << ", node attr: " << attr_info;
      MS_EXCEPTION(failure_type) << "#umsg#Kernel select failed:#umsg#" << failure_info;
    } else {
      // Set the CPU flag.
      common::AnfAlgo::SetNodeAttr(kAttrPrimitiveTarget, MakeValue(kCPUDevice), node);
      kernel_build_info->set_kernel_type(CPU_KERNEL);
      kernel_build_info->set_processor(kernel::Processor::CPU);
      MS_LOG(INFO) << "Backoff and rebuild kernel on CPU successfully for node: " << node->fullname_with_scope()
                   << ", node attr: " << attr_info;
    }

    CreateKernel({node});
  }
}

MS_REGISTER_DEVICE(kCPUDevice, CPUDeviceContext);
#ifdef WITH_BACKEND
MSCONTEXT_REGISTER_INIT_FUNC(kCPUDevice, [](MsContext *ctx) -> void {
  MS_EXCEPTION_IF_NULL(ctx);
  if (ctx->backend_policy() != "ms") {
    (void)ctx->set_backend_policy("ms");
  }
});
#endif

// Register functions to _c_expression so the Python hal module can call CPU device interfaces.
void PybindCPUStatelessFunc(py::module *m) { MS_EXCEPTION_IF_NULL(m); }
REGISTER_DEV_STATELESS_FUNC_CB(kCPUDevice, PybindCPUStatelessFunc);
}  // namespace cpu
}  // namespace device
}  // namespace mindspore