/**
 * Copyright 2021-2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "plugin/device/cpu/hal/hardware/cpu_device_context.h"
#include <map>
#include <string>
#include "plugin/device/cpu/hal/device/cpu_device_address.h"
#include "plugin/device/cpu/hal/device/cpu_memory_manager.h"
#include "plugin/device/cpu/optimizer/reg_cpu_const_input_to_attr.h"
#include "plugin/device/cpu/optimizer/print_value_type.h"
#include "plugin/device/cpu/hal/hardware/cpu_somas.h"
#include "plugin/device/cpu/hal/device/cpu_hash_table_util.h"
#ifdef ENABLE_AKG
#include "plugin/device/cpu/kernel/akg/akg_cpu_kernel_build.h"
#endif
#include "plugin/factory/ms_factory.h"
#include "plugin/device/cpu/kernel/cpu_kernel.h"
#include "kernel/kernel_build_info.h"
#include "kernel/framework_utils.h"
#include "plugin/device/cpu/hal/device/kernel_select_cpu.h"
#include "utils/trace_base.h"
#include "backend/common/graph_kernel/graph_kernel_flags.h"
#include "include/backend/optimizer/optimizer.h"
#include "include/backend/optimizer/pass_manager.h"
#include "backend/common/optimizer/common_backend_optimization.h"
#include "backend/common/optimizer/dynamic_shape/dynamic_shape_helper.h"
#include "plugin/device/cpu/optimizer/insert_cast_cpu.h"
#include "plugin/device/cpu/optimizer/insert_cast_to_pyexecute.h"
#include "plugin/device/cpu/optimizer/insert_format_transform_op.h"
#include "plugin/device/cpu/optimizer/softmax_grad_fusion.h"
#include "plugin/device/cpu/optimizer/matmul_biasadd_fusion.h"
#include "plugin/device/cpu/optimizer/matmul_biasadd_relu_fusion.h"
#include "backend/common/pass/insert_type_transform_op.h"
#include "backend/common/pass/flatten_value_sequence_in_pyexecute.h"
#include "backend/common/pass/communication_op_fusion.h"
#include "backend/common/pass/replace_node_by_proxy.h"
#include "backend/common/pass/erase_visit_attr.h"
#include "backend/common/pass/add_training_attr.h"
#include "backend/common/pass/insert_tensor_move_for_communication.h"
#include "backend/common/pass/dynamic_sequence_ops_adaptation.h"
#include "backend/common/graph_kernel/adapter/graph_kernel_optimization.h"
#include "backend/common/expander/fallback/expander_fallback.h"
#include "backend/common/graph_kernel/value_graph_binder.h"
#include "include/backend/anf_runtime_algorithm.h"
#include "include/common/utils/anfalgo.h"
#include "plugin/device/cpu/hal/profiler/cpu_profiling.h"
#if defined(__linux__) && defined(WITH_BACKEND)
#include "plugin/device/cpu/hal/hardware/ms_collective_comm_lib.h"
#endif
#ifndef ENABLE_SECURITY
#include "include/backend/debug/data_dump/dump_json_parser.h"
#endif
#ifdef ENABLE_DUMP_IR
#include "include/common/debug/anf_ir_dump.h"
#endif
#include "include/common/profiler.h"
#include "plugin/device/cpu/hal/device/cpu_kernel_task.h"
#include "plugin/device/cpu/hal/device/cpu_device_synchronizer.h"
#include "ops/framework_ops.h"
#include "kernel/oplib/oplib.h"
#include "runtime/device/move_to.h"

namespace mindspore {
namespace device {
namespace cpu {
namespace {
const char kModelNameCPU[] = "CPU";
const char kEventOptimizeGraph[] = "OptimizeGraph";
const char kStageSetKernelInfo[] = "SetKernelInfo";

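// Create the CPU kernel task that matches the given task type (contiguous or copy-with-slice);
// any other task type is rejected with an exception.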
runtime::KernelTaskPtr GetTaskByTaskType(const runtime::KernelTaskType &task_type,
                                         const std::shared_ptr<runtime::KernelTaskContext> &task_context) {
  switch (task_type) {
    case runtime::KernelTaskType::kCONTIGUOUS_TASK:
      return std::make_shared<CpuContiguousKernelTask>(task_context);
    case runtime::KernelTaskType::kCOPY_TASK:
      return std::make_shared<CpuCopyWithSliceKernelTask>(task_context);
    default:
      MS_LOG(EXCEPTION) << "KernelTaskType is invalid, task_type:" << task_type;
  }
}
}  // namespace
using mindspore::kernel::KernelBuildInfo;

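// Initialize the CPU device context: set up the resource manager and, when dump is enabled and the device
// target is CPU, parse and copy the dump json configs. Guarded by a lock so repeated calls are no-ops.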
void CPUDeviceContext::Initialize() {
#ifdef __APPLE__
  std::lock_guard<SpinLock> spin_lock(init_lock_);
#else
  std::lock_guard<std::mutex> lock(init_mutex_);
#endif
  if (initialized_) {
    return;
  }
  MS_EXCEPTION_IF_NULL(device_res_manager_);
  device_res_manager_->Initialize();
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
#ifndef ENABLE_SECURITY
  if (ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kCPUDevice) {
    // Dump json config file if dump is enabled.
    uint32_t rank_id = 0;
    auto &json_parser = DumpJsonParser::GetInstance();
    json_parser.Parse();
    json_parser.CopyDumpJsonToDir(rank_id);
    json_parser.CopyMSCfgJsonToDir(rank_id);
  }
#endif
#ifdef __linux__
  if (ms_context->IsDefaultDeviceTarget() && ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kCPUDevice) {
    MS_LOG(INFO)
      << "No device_target set, set CPU as default. You can call mindspore.set_context(device_target=\"XXX\")";
  }
#endif  // __linux__
  initialized_ = true;
}

void CPUDeviceContext::Destroy() {
  MS_EXCEPTION_IF_NULL(device_res_manager_);
  device_res_manager_->Destroy();
  initialized_ = false;
}

void CPUDeviceResManager::Initialize() {
  mem_manager_ = std::make_shared<CPUMemoryManager>();
  MS_EXCEPTION_IF_NULL(mem_manager_);
}

void CPUDeviceResManager::Destroy() {
  // Release memory.
  if (mem_manager_ != nullptr) {
    mem_manager_->Finalize();
    mem_manager_ = nullptr;
  }
}

void *CPUDeviceResManager::AllocateMemory(size_t size, uint32_t stream_id) const {
  MS_EXCEPTION_IF_NULL(mem_manager_);
  return mem_manager_->MallocMemFromMemPool(size, false, false, stream_id);
}

void CPUDeviceResManager::FreeMemory(void *ptr) const {
  MS_EXCEPTION_IF_NULL(ptr);
  MS_EXCEPTION_IF_NULL(mem_manager_);
  mem_manager_->FreeMemFromMemPool(ptr);
}

void CPUDeviceResManager::FreePartMemorys(const std::vector<void *> &free_addrs, const std::vector<void *> &keep_addrs,
                                          const std::vector<size_t> &keep_addr_sizes) const {
  CPUMemoryPool::GetInstance().FreePartTensorMems(free_addrs, keep_addrs, keep_addr_sizes);
}

std::vector<void *> CPUDeviceResManager::AllocateContinuousMemory(const std::vector<size_t> &size_list,
                                                                  uint32_t stream_id) const {
  MS_EXCEPTION_IF_NULL(mem_manager_);
  return mem_manager_->MallocContinuousMemFromMemPool(size_list, stream_id);
}

namespace {
// Create user data content (such as a CPU hash table) and set the user data reference into device_address.
void FillUserData(const UserDataPtr &user_data, DeviceAddress *device_address) {
  MS_EXCEPTION_IF_NULL(user_data);
  MS_EXCEPTION_IF_NULL(device_address);

  // Save reference of user data in device address.
  device_address->set_user_data(user_data);

  const auto &user_data_type = user_data->get<UserDataType>(kUserDataType);
  if (user_data_type == nullptr) {
    return;
  }
  if (*user_data_type == UserDataType::kUserTypeHashTable) {
    auto key_type = user_data->get<TypeId>(kHashTableKeyType);
    auto value_type = user_data->get<TypeId>(kHashTableValueType);
    MS_EXCEPTION_IF_NULL(key_type);
    MS_EXCEPTION_IF_NULL(value_type);
    const auto &iter = cpu_hash_table_funcs.find({*key_type, *value_type});
    if (iter != cpu_hash_table_funcs.end()) {
      // Create CPU hash table and set into `user_data`.
      return std::get<kCreateFuncIndex>(iter->second)(user_data);
    } else {
      MS_LOG(EXCEPTION) << "Unsupported hash table type, key type:" << TypeIdLabel(*key_type)
                        << ", value type:" << TypeIdLabel(*value_type);
    }
  } else {
    MS_LOG(EXCEPTION) << "Invalid user data type:" << *user_data_type;
  }
}
}  // namespace

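// Build a CPUDeviceAddress for the given kernel tensor, filling in the device name/id when they are empty and
// attaching any user data (e.g. a CPU hash table) and a CPU device synchronizer.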
DeviceAddressPtr CPUDeviceResManager::CreateDeviceAddress(const KernelTensorPtr &kernel_tensor) const {
  MS_EXCEPTION_IF_NULL(kernel_tensor);
  if (kernel_tensor->device_name().empty()) {
    kernel_tensor->set_device_name(device_context_->device_context_key().device_name_);
    kernel_tensor->set_device_id(device_context_->device_context_key().device_id_);
  }
  auto device_address = std::make_shared<CPUDeviceAddress>(kernel_tensor);

  const auto &user_data = kernel_tensor->user_data();
  if (user_data != nullptr) {
    FillUserData(user_data, device_address.get());
  }
  device_address->set_device_synchronizer(std::make_shared<CPUDeviceSynchronizer>());
  return device_address;
}

void CPUDeviceResManager::MoveTo(const tensor::TensorPtr &src_tensor, const tensor::TensorPtr &dst_tensor,
                                 const std::string &to, bool blocking, bool *return_self) {
  device::MoveTo(src_tensor, dst_tensor, to, blocking, return_self);
}

DeviceAddressPtr CPUDeviceResManager::CreateDeviceAddress(void *ptr, size_t size, const ShapeVector &shape_vector,
                                                          const Format &format, TypeId type_id,
                                                          const std::string &device_name, uint32_t device_id,
                                                          uint32_t stream_id) const {
  return std::make_shared<CPUDeviceAddress>(ptr, size, shape_vector, format, type_id, device_name, device_id,
                                            stream_id);
}

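// Top-level graph optimization entry for the CPU backend. Single-op graphs only get kernel selection, a
// cast-insertion pass and a ref-map update; whole graphs additionally run the MindIR passes, the dynamic-shape
// attr update, the common backend optimizations and, when enabled, graph kernel fusion.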
void CPUKernelExecutor::OptimizeGraph(const FuncGraphPtr &graph) const {
  MS_EXCEPTION_IF_NULL(graph);
  auto kernel_graph = graph->cast<KernelGraphPtr>();
  MS_EXCEPTION_IF_NULL(kernel_graph);
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  auto enable_lazy_inline = ms_context->CellReuseLevel() != CellReuseLevel::kNoCellReuse;
  if (enable_lazy_inline) {
    MS_LOG(EXCEPTION) << "CPU does not support the lazy_inline feature, "
                      << "please do not mark @lazy_inline in cell's __init__ func.";
  }
  if (kernel_graph->is_from_single_op()) {
    SetOperatorInfo(kernel_graph);
    SingleOpGraphOptimize(kernel_graph);
    UpdateKernelRefInfo(kernel_graph);
  } else {
    // The passes in this function must run before operator selection: SetOperatorInfo().
    OptimizeMindIR(kernel_graph);
    // Update the graph dynamic shape attr.
    opt::AddDynamicShapeAttrPass(kernel_graph);

    SetOperatorInfo(kernel_graph);
    // SetOperatorInfo may generate new nodes, so the kernel object types need to be set again.
    kernel_graph->SetKernelObjectTypesForUnrealNodes();
#ifdef ENABLE_DUMP_IR
    if (ms_context->CanDump(kIntroductory)) {
      DumpIR("hwopt_comm_after_kernel_select_" + graph->ToString() + ".ir", graph, true);
    }
#endif

    OptimizeGraphImpl(kernel_graph);

    // Run final optimization.
    opt::CommonFinalOptimization(kernel_graph);

    // Run graph kernel fusion optimization.
    if (graphkernel::GraphKernelFlags::GetInstance().IsEnableGraphKernel()) {
      graphkernel::GraphKernelOptimize(kernel_graph);
      kernel_graph->SetExecOrderByDefault();
    }
  }
}

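// Propagate the ref map (output-to-input aliasing info) from each kernel's supported attr list into its
// KernelInfo; bail out early for Custom operators without registered operator information.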
void CPUKernelExecutor::UpdateKernelRefInfo(const KernelGraphPtr &graph) const {
  MS_EXCEPTION_IF_NULL(graph);
  const std::vector<CNodePtr> &kernels = graph->execution_order();
  for (const auto &kernel : kernels) {
    MS_EXCEPTION_IF_NULL(kernel);
    const std::string &op_name = common::AnfAlgo::GetCNodeName(kernel);
    if (IsPrimitiveCNode(kernel, prim::kPrimCustom) &&
        mindspore::kernel::OpLib::FindOp(op_name, kernel::OpImplyType::kImplyCPU) == nullptr) {
      MS_LOG(DEBUG) << "Not find operator information for Custom operator [" << op_name << "]";
      return;
    }

    auto kernel_attr_list = kernel::NativeCpuKernelMod::GetCpuSupportedList(op_name);
    if (kernel_attr_list.empty()) {
      MS_LOG(DEBUG) << "kernel_attr_list is empty";
      return;
    }

    auto kernel_info = dynamic_cast<device::KernelInfo *>(kernel->kernel_info());
    MS_EXCEPTION_IF_NULL(kernel_info);
    kernel_info->set_ref_map(kernel_attr_list[0].GetAllOutInRef(), kernel_attr_list[0].GetOutInRefMap());
  }
}

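// IR-level fusion passes that run before operator selection: softmax-grad fusion, MatMul+BiasAdd+ReLU fusion and
// dynamic sequence op adaptation.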
void CPUKernelExecutor::OptimizeMindIR(const KernelGraphPtr &graph) const {
  MS_EXCEPTION_IF_NULL(graph);
  auto optimizer = std::make_shared<opt::GraphOptimizer>();
  auto pm = std::make_shared<opt::PassManager>();
  pm->AddPass(std::make_shared<opt::SoftmaxGradFusionCpu>("softmax_grad_fusion_cpu"));
  // Match MatMul+BiasAdd+ReLU first, if no match, then match MatMul+BiasAdd
  pm->AddPass(std::make_shared<opt::MatMulBiasAddReluFusionCPU>("matmul_biasadd_relu_fusion_cpu"));
  pm->AddPass(std::make_shared<opt::DynamicSequenceOpsAdaptation>());
  optimizer->AddPassManager(pm);
  (void)optimizer->Optimize(graph);
  graph->SetExecOrderByDefault();
}

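// Backend passes that run after operator selection: type/format transform insertion, AllReduce fusion, cast
// insertion, tensor moves for communication ops, training attrs and PyExecute-related fixups.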
void CPUKernelExecutor::OptimizeGraphImpl(const KernelGraphPtr &graph) const {
  MS_EXCEPTION_IF_NULL(graph);
  auto optimizer = std::make_shared<opt::GraphOptimizer>();
  auto pm = std::make_shared<opt::PassManager>();
  pm->AddPass(std::make_shared<opt::InsertTypeTransformOp>("insert_type_transform_op"));
  pm->AddPass(std::make_shared<opt::FlattenValueSequenceInPyExecute>("flatten_value_sequence_in_pyexecute"));
  pm->AddPass(std::make_shared<opt::InsertFormatTransformOpCPU>("insert_format_transform_op_cpu"));
  pm->AddPass(std::make_shared<opt::AllReduceFusion>());
  pm->AddPass(std::make_shared<opt::InsertCastCPU>("insert_cast"));
  pm->AddPass(std::make_shared<opt::EraseVisitAttr>());
  pm->AddPass(std::make_shared<opt::InsertTensorMoveForCommunication>());
  pm->AddPass(std::make_shared<opt::AddTrainingAttr>());
  pm->AddPass(std::make_shared<opt::PrintValueType>("print_value_type"));
  pm->AddPass(std::make_shared<opt::InsertCastToPyExecute>("insert_cast_for_pyexecute"));
  optimizer->AddPassManager(pm);
  (void)optimizer->Optimize(graph);
  graph->SetExecOrderByDefault();
}

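// Minimal optimization for single-op graphs: only cast insertion is required.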
void CPUKernelExecutor::SingleOpGraphOptimize(const KernelGraphPtr &graph) const {
  MS_EXCEPTION_IF_NULL(graph);
  auto optimizer = std::make_shared<opt::GraphOptimizer>();
  auto pm = std::make_shared<opt::PassManager>();
  pm->AddPass(std::make_shared<opt::InsertCastCPU>("insert_cast"));
  optimizer->AddPassManager(pm);
  (void)optimizer->Optimize(graph);
  graph->SetExecOrderByDefault();
}

namespace {
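// Control ops (e.g. bprop-cut ops executed in the backend) do not go through kernel selection, so build their
// kernel build info directly from the inferred input/output types with the default format.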
void SetControlOpInfo(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  std::vector<std::string> inputs_format;
  std::vector<TypeId> inputs_type;
  size_t input_num = common::AnfAlgo::GetInputTensorNum(kernel_node);
  for (size_t input_index = 0; input_index < input_num; ++input_index) {
    (void)inputs_format.emplace_back(kOpFormat_DEFAULT);
    inputs_type.push_back(common::AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, input_index));
  }
  std::vector<std::string> outputs_format;
  std::vector<TypeId> outputs_type;
  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  for (size_t output_index = 0; output_index < output_num; ++output_index) {
    (void)outputs_format.emplace_back(kOpFormat_DEFAULT);
    outputs_type.push_back(common::AnfAlgo::GetOutputInferDataType(kernel_node, output_index));
  }

  auto builder = std::make_shared<KernelBuildInfo::KernelBuildInfoBuilder>();
  builder->SetInputsFormat(inputs_format);
  builder->SetInputsDeviceType(inputs_type);
  builder->SetOutputsFormat(outputs_format);
  builder->SetOutputsDeviceType(outputs_type);

  AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), kernel_node.get());
}

// Before creating the kernel, check whether the node has completed operator selection. If not, operator
// selection needs to be performed to set the kernel info.
void SetKernelInfoBeforeCreateKernel(const std::vector<CNodePtr> &nodes) {
  // Check whether the node has completed operator selection.
  for (const auto &node : nodes) {
    if (AnfAlgo::GetSelectKernelBuildInfo(node) != nullptr) {
      continue;
    }

    // Kernel selection process for non control op.
    if (!common::AnfAlgo::IsBpropCutOpExecInBackend(node)) {
      auto [msg, etype] = SetKernelInfoWithMsg(node);
      if (!msg.empty()) {
        MS_EXCEPTION(etype) << "#umsg#Kernel select failed:#umsg#" << msg;
      }
    } else {
      // Kernel selection process for control op.
      SetControlOpInfo(node);
    }
  }
}
}  // namespace

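// Select a kernel implementation for every node in the execution order. If selection fails for a node, try to
// expand it into basic ops via the expander fallback and only raise when the expansion also fails. When any node
// was expanded, rebind values to the graph and refresh the execution order.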
void CPUKernelExecutor::SetOperatorInfo(const KernelGraphPtr &graph) const {
  MS_EXCEPTION_IF_NULL(graph);
  (void)profiler::CollectHostInfo(kModelNameCPU, kEventOptimizeGraph, kStageSetKernelInfo, 1, 0, 0);
  bool do_expand = false;
  auto mng = graph->manager();
  if (mng == nullptr) {
    mng = Manage(graph, true);
    MS_EXCEPTION_IF_NULL(mng);
    graph->set_manager(mng);
  }
  auto &node_list = graph->execution_order();
  for (auto &node : node_list) {
    if (!common::AnfAlgo::IsBpropCutOpExecInBackend(node)) {
      auto [msg, etype] = SetKernelInfoWithMsg(node);
      if (msg.empty()) {
        continue;
      }
      auto f = [](const CNodePtr &n) {
        auto res = SetKernelInfoWithMsg(n);
        return res.first.empty();
      };
      auto expand_ret = expander::TryExpandCNode(node, f);
      if (!expand_ret) {
        constexpr auto recursive_level = 2;
        MS_EXCEPTION(etype) << "#umsg#Kernel select failed:#umsg#" << msg
                            << "\nnode: " << node->DebugString(recursive_level);
      }
      MS_LOG(INFO) << msg << " but expand success.";
      do_expand = true;
    } else {
      SetControlOpInfo(node);
    }
  }
  if (do_expand) {
    (void)graphkernel::BindValueToGraph().Run(graph);
    graph->SetExecOrderByDefault();
  }
  (void)profiler::CollectHostInfo(kModelNameCPU, kEventOptimizeGraph, kStageSetKernelInfo, 1, 0, 1);
}

kernel::KernelModPtr CPUKernelExecutor::CreateKernelMod(const std::string &op_name) const {
  return kernel::Factory<kernel::NativeCpuKernelMod>::Instance().Create(op_name);
}

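// Instantiate kernel mods for the given nodes: bprop-cut ops are skipped, AKG nodes are collected and built in
// parallel at the end (when ENABLE_AKG is defined), and native CPU kernels are created from the factory,
// initialized, and resized when their resize condition is met.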
void CPUKernelExecutor::CreateKernel(const std::vector<CNodePtr> &nodes) const {
  SetKernelInfoBeforeCreateKernel(nodes);

  kernel::KernelMeta *bin_map = kernel::KernelMeta::GetInstance();
  std::vector<AnfNodePtr> akg_nodes;
  for (const auto &node : nodes) {
    MS_EXCEPTION_IF_NULL(node);
    if (common::AnfAlgo::IsBpropCutOpExecInBackend(node)) {
      continue;
    }
    if (session::AnfRuntimeAlgorithm::GetKernelType(node) == KernelType::AKG_KERNEL) {
      if (!bin_map->initialized()) {
        bin_map->Initialize();
      }
      akg_nodes.push_back(node);
      continue;
    }
    std::string kernel_name = common::AnfAlgo::GetCNodeName(node);

    std::shared_ptr<kernel::NativeCpuKernelMod> cpu_kernel =
      kernel::Factory<kernel::NativeCpuKernelMod>::Instance().Create(kernel_name);

    if (cpu_kernel == nullptr) {
      MS_LOG(INTERNAL_EXCEPTION) << "#dmsg#Kernel build failed:#dmsg#Build cpu operator[" << node->fullname_with_scope()
                                 << "] failed";
    }

    auto kernel_attrs = cpu_kernel->GetOpSupport();
    kernel::SetCpuRefMapToKernelInfo(node, kernel_attrs);
    auto thread_pool = kernel::GetActorMgrInnerThreadPool();
    cpu_kernel->SetThreadPool(thread_pool);
    std::vector<KernelTensor *> input_kernel_tensors = AnfAlgo::GetOrCreateAllInputKernelTensors(node);
    std::vector<KernelTensor *> output_kernel_tensors = AnfAlgo::GetOrCreateAllOutputKernelTensors(node);
    auto ret = cpu_kernel->Init(common::AnfAlgo::GetCNodePrimitive(node), input_kernel_tensors, output_kernel_tensors);
    if (!ret) {
      MS_LOG(EXCEPTION) << trace::DumpSourceLines(node);
    }
    if (kernel::CheckResizeCondition(node)) {
      if (cpu_kernel->Resize(input_kernel_tensors, output_kernel_tensors) == kernel::KRET_RESIZE_FAILED) {
        MS_LOG(INTERNAL_EXCEPTION) << "#dmsg#Kernel build failed:#dmsg#CPU kernel op [" << node->fullname_with_scope()
                                   << "] resize failed.";
      }
    }

    AnfAlgo::SetKernelMod(cpu_kernel, node.get());
  }
#ifdef ENABLE_AKG
  kernel::AkgCpuKernelBuilder akg_cpu_kernel_builder;
  (void)akg_cpu_kernel_builder.SingleOpParallelBuild(akg_nodes);
#endif
}

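// Last preparation step before graph execution: reorder the execution list for whole graphs and, when the memory
// optimize level is above O0, run SOMAS static memory planning for the graph.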
void CPUKernelExecutor::PreprocessBeforeRun(const FuncGraphPtr &graph) const {
  MS_EXCEPTION_IF_NULL(graph);
  auto kernel_graph = graph->cast<KernelGraphPtr>();
  MS_EXCEPTION_IF_NULL(kernel_graph);
  if (!kernel_graph->is_from_single_op()) {
    // Remove this reorder after the PS feature finishes adapting push/pull in auto_monad.
    auto execution_order = kernel_graph->execution_order();
    common::AnfAlgo::ReorderPosteriorExecList(NOT_NULL(&execution_order));
    kernel_graph->set_execution_order(execution_order);
  }
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  // somas
  if (ms_context->get_param<int>(MS_CTX_MEMORY_OPTIMIZE_LEVEL) != kOptimizeO0) {
    auto somas = std::make_shared<CPUSomas>();
    bool ret = somas->Assign(kernel_graph);
    if (ret) {
      MS_LOG(INFO) << "Somas allocate success for graph " << kernel_graph->graph_id()
                   << " somas size: " << kernel_graph->somas_whole_block_size();
    } else if (somas->IsSupportSomas(*kernel_graph)) {
      MS_LOG(WARNING) << "Somas allocate failed for graph " << kernel_graph->graph_id();
    }
  }
  MS_LOG(INFO) << "Status record: end preprocess before run graph. graph id: " << kernel_graph->graph_id();
}

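// Launch a kernel synchronously on the CPU. When CPU op-time profiling is enabled (and this is not a security
// build), the launch is wrapped with op-level profiling; the stream argument is unused on CPU.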
bool CPUKernelExecutor::LaunchKernel(const CNodePtr &kernel, const std::vector<KernelTensor *> &inputs,
                                     const std::vector<KernelTensor *> &workspace,
                                     const std::vector<KernelTensor *> &outputs, KernelMod *kernel_mod,
                                     void * /* stream */) const {
  MS_EXCEPTION_IF_NULL(kernel);

#ifndef ENABLE_SECURITY
  const auto &profiler_inst = profiler::cpu::CPUProfiler::GetInstance();
  MS_EXCEPTION_IF_NULL(profiler_inst);
  if (profiler_inst->GetEnableFlag() && profiler_inst->GetOpTimeFlag()) {
    auto ret = LaunchKernelWithProfiling(kernel, inputs, workspace, outputs, kernel_mod);
    return ret;
  }
#endif
  auto ret = DoLaunchKernel(kernel, inputs, workspace, outputs, kernel_mod);
  return ret;
}

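// Run a device-side kernel task (contiguous or copy-with-slice) built from the given input/output addresses;
// CPU has no real stream, so a null stream is passed to the task context.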
bool CPUKernelExecutor::ExecuteKernelTask(const runtime::KernelTaskType &task_type,
                                          const device::DeviceAddressPtrList &input_addr_list,
                                          const device::DeviceAddressPtrList &output_addr_list,
                                          const size_t &stream_id) const {
  auto task_context =
    std::make_shared<runtime::KernelTaskContext>(device_context_, input_addr_list, output_addr_list, nullptr);
  auto task = GetTaskByTaskType(task_type, task_context);
  MS_EXCEPTION_IF_NULL(task);

  auto ret = task->RunWithRet();
  if (!ret) {
    MS_LOG(EXCEPTION) << "Exec task failed, task_type:" << task_type;
  }
  return ret;
}

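// Load the collective communication library: the MPI-based library (libmpi_collective.so) when MPI is in use,
// otherwise the built-in MsCollectiveCommLib on Linux backends.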
bool CPUDeviceResManager::LoadCollectiveCommLib() {
  bool using_mpi = common::UseMPI();
  if (using_mpi) {
    std::string mpi_comm_lib_name = "libmpi_collective.so";
    auto loader = std::make_shared<CollectiveCommLibLoader>(mpi_comm_lib_name);
    MS_EXCEPTION_IF_NULL(loader);
    if (!loader->Initialize()) {
      MS_LOG(EXCEPTION) << "Failed to load mpi collective library.";
    }

    void *collective_comm_lib_handle = loader->collective_comm_lib_ptr();
    MS_EXCEPTION_IF_NULL(collective_comm_lib_handle);

    auto instance_func = DlsymFuncObj(communication_lib_instance, collective_comm_lib_handle);
    collective_comm_lib_ = instance_func();
    MS_EXCEPTION_IF_NULL(collective_comm_lib_);
  } else {
#if defined(__linux__) && defined(WITH_BACKEND)
    collective_comm_lib_ = &MsCollectiveCommLib::GetInstance();
    MS_EXCEPTION_IF_NULL(collective_comm_lib_);
#endif
  }
  return true;
}

bool CPUKernelExecutor::LaunchKernelWithProfiling(const CNodePtr &kernel, const std::vector<KernelTensor *> &inputs,
                                                  const std::vector<KernelTensor *> &workspace,
                                                  const std::vector<KernelTensor *> &outputs,
                                                  KernelMod *kernel_mod) const {
  MS_EXCEPTION_IF_NULL(kernel);

  auto profiler_inst = profiler::cpu::CPUProfiler::GetInstance();
  MS_EXCEPTION_IF_NULL(profiler_inst);

  uint32_t pid = IntToUint(getpid());
  // CPU profiling supports multi-threaded execution with MindRT.
  profiler_inst->OpDataProducerBeginParallel(kernel->fullname_with_scope(), pid);
  bool ret = DoLaunchKernel(kernel, inputs, workspace, outputs, kernel_mod);
  profiler_inst->OpDataProducerEndParallel(kernel->fullname_with_scope());
  profiler_inst->RecordFrameWorkInfo(kernel);
  return ret;
}

bool CPUKernelExecutor::DoLaunchKernel(const CNodePtr &kernel, const std::vector<KernelTensor *> &inputs,
                                       const std::vector<KernelTensor *> &workspace,
                                       const std::vector<KernelTensor *> &outputs, KernelMod *kernel_mod) const {
  MS_EXCEPTION_IF_NULL(kernel);
  MS_EXCEPTION_IF_NULL(kernel_mod);
  uint64_t start_time = 0;
  PROFILER_START(start_time);
  auto ret = kernel_mod->Launch(inputs, workspace, outputs, nullptr);
  PROFILER_END(start_time, runtime::ProfilerModule::kKernel, runtime::ProfilerEvent::kKernelLaunch,
               kernel->fullname_with_scope(), false);
  return ret;
}

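// For nodes whose kernel selection was backed off to the CPU, re-check whether a CPU kernel strictly matches the
// selected build info; if it does, retarget the node to CPU and rebuild its kernel, otherwise raise the original
// selection failure.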
void CPUKernelExecutor::RebuildKernelSelectBackoffOp(const std::vector<CNodePtr> &nodes) const {
  for (auto &node : nodes) {
    MS_EXCEPTION_IF_NULL(node);
    if (!AnfAlgo::IsKernelSelectBackoffOp(node)) {
      continue;
    }
    auto [failure_info, failure_type] = AnfAlgo::GetKernelSelectBackoffInfo(node);
    if (IsVmapNotSupported(node)) {
      MS_EXCEPTION(failure_type) << "#umsg#Kernel select failed:#umsg#" << failure_info;
    }

    // Judge whether the kernel build info strictly matches the supported kernel attrs.
    const auto &kernel_build_info = AnfAlgo::GetSelectKernelBuildInfo(node);
    MS_EXCEPTION_IF_NULL(kernel_build_info);
    const auto &kernel_attr = kernel::GetKernelAttrFromBuildInfo(kernel_build_info);
    const auto &supported_kernel_attrs =
      kernel::NativeCpuKernelMod::GetCpuSupportedList(common::AnfAlgo::GetCNodeName(node));
    const auto &match_result = kernel::MatchKernelAttrStrict(kernel_attr, supported_kernel_attrs);
    auto attr_info = kernel::FetchPrintInfoByKernelAttr(kernel_attr);
    if (!match_result.first) {
      MS_LOG(INFO) << "Backoff and rebuild kernel on CPU failed for node: " << node->fullname_with_scope()
                   << ", node attr: " << attr_info;
      MS_EXCEPTION(failure_type) << "#umsg#Kernel select failed:#umsg#" << failure_info;
    } else {
      // Set the CPU flag.
      common::AnfAlgo::SetNodeAttr(kAttrPrimitiveTarget, MakeValue(kCPUDevice), node);
      kernel_build_info->set_kernel_type(CPU_KERNEL);
      kernel_build_info->set_processor(kernel::Processor::CPU);
      MS_LOG(INFO) << "Backoff and rebuild kernel on CPU successfully for node: " << node->fullname_with_scope()
                   << ", node attr: " << attr_info;
    }

    CreateKernel({node});
  }
}

MS_REGISTER_DEVICE(kCPUDevice, CPUDeviceContext);
#ifdef WITH_BACKEND
MSCONTEXT_REGISTER_INIT_FUNC(kCPUDevice, [](MsContext *ctx) -> void {
  MS_EXCEPTION_IF_NULL(ctx);
  if (ctx->backend_policy() != "ms") {
    (void)ctx->set_backend_policy("ms");
  }
});
#endif

// Register functions to _c_expression so the Python hal module can call CPU device interfaces.
void PybindCPUStatelessFunc(py::module *m) { MS_EXCEPTION_IF_NULL(m); }
REGISTER_DEV_STATELESS_FUNC_CB(kCPUDevice, PybindCPUStatelessFunc);
}  // namespace cpu
}  // namespace device
}  // namespace mindspore