/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_ACTOR_KERNEL_ACTOR_H_
#define MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_ACTOR_KERNEL_ACTOR_H_

#include <vector>
#include <string>
#include <memory>
#include <utility>
#include <unordered_map>
#include "runtime/framework/actor/actor_common.h"
#include "runtime/framework/actor/debug_aware_actor.h"
#include "runtime/hardware/device_context.h"
#include "runtime/framework/device_tensor_store.h"
#include "backend/kernel_compiler/kernel.h"
#include "ir/anf.h"
#include "ir/tensor.h"

namespace mindspore {
namespace runtime {
using mindspore::device::DeviceContext;
using mindspore::device::KernelInfo;
using mindspore::kernel::Address;
using mindspore::kernel::KernelLaunchInfo;
using mindspore::tensor::TensorPtr;

// The kernel actor is used to receive the device tensors and control info to launch the kernel.
// The processing flow is RunOpData/RunOpControl -> CheckRunningCondition -> SendMemoryAllocReq
// -> OnMemoryAllocFinish -> LaunchKernel -> SendMemoryFreeReq -> SendOutput.
class KernelActor : public DebugAwareActor {
 public:
  // name:               actor name used by the actor framework.
  // kernel:             the CNode this actor launches.
  // device_context:     the device this kernel runs on; stored into the base-class device_contexts_ list.
  // memory_manager_aid: actor id of the memory manager that serves alloc/free requests.
  // debug_aid:          actor id of the debug actor (may be nullptr when debugging is off).
  // recorder_aid:       actor id of the recorder actor (may be nullptr when recording is off).
  // strategy:           pipeline (asynchronous) or step (synchronous) execution.
  KernelActor(const std::string &name, const CNodePtr &kernel, const DeviceContext *device_context,
              const AID &memory_manager_aid, const AID *debug_aid, const AID *recorder_aid,
              GraphExecutionStrategy strategy)
      : DebugAwareActor(name, KernelTransformType::kKernelActor, recorder_aid, memory_manager_aid, debug_aid),
        kernel_(kernel),
        kernel_info_(nullptr),
        is_dynamic_shape_(false),
        real_input_num_(0),
        strategy_(strategy) {
    // A kernel actor uses a single device context; it is appended to the base-class container.
    (void)device_contexts_.emplace_back(device_context);
  }
  ~KernelActor() override = default;

  void Init() override;

  // Runs when the kernel actor receives input data.
  void RunOpData(OpData<DeviceTensor> *const input_data, OpContext<DeviceTensor> *const context) override;
  // Runs when the kernel actor receives an input control.
  void RunOpControl(AID *const input_control, OpContext<DeviceTensor> *const context) override;
  // Runs when the kernel actor receives an input control together with input tensors; used in step mode.
  void RunOpControlWithInputTensor(AID *const input_control, OpContext<DeviceTensor> *const context,
                                   const std::vector<TensorPtr> *input_tensors);

  // The memory related operation interface.
  void SendMemoryAllocReq(OpContext<DeviceTensor> *const context) override;
  void SendMemoryFreeReq(OpContext<DeviceTensor> *const context) override;
  // The callback after memory alloc finished.
  void OnMemoryAllocFinish(OpContext<DeviceTensor> *const context) override;

  // The debug related operation interface.
  void SendDebugReq(OpContext<DeviceTensor> *const context) override;
  // The callback after debug finished.
  void OnDebugFinish(OpContext<DeviceTensor> *const context) override;

 private:
  friend class GraphScheduler;

  // Fetch the device tensors for launch.
  void FetchInputDeviceTensor(OpContext<DeviceTensor> *const context);
  void FetchOutputDeviceTensor();
  void CopyInputDeviceTensor(const OpData<DeviceTensor> *input_data, OpContext<DeviceTensor> *const context);
  // In step mode, push the input tensors which contain valid device address into input_device_tensors_ directly.
  void PushInputDeviceTensor(const std::vector<TensorPtr> *input_tensors);

  // The processing before kernel launch: update the info of kernel launch.
  void PreLaunchKernel(OpContext<DeviceTensor> *const context);
  // The processing after kernel launch: 1.erase input, 2.free memory, 3.send output.
  void PostLaunchKernel(OpContext<DeviceTensor> *const context);

  // Send output data and output controls when kernel launch finishes.
  void SendOutput(OpContext<DeviceTensor> *const context) const;

  // The info of kernel.
  CNodePtr kernel_;
  // Non-owning; set to nullptr here and presumably filled in Init() — TODO confirm against the .cc file.
  KernelInfo *kernel_info_;
  bool is_dynamic_shape_;

  // The real input number of kernel launch.
  size_t real_input_num_;

  // The execution strategy of kernel actor.
  // In pipeline mode, kernel actor executes asynchronously.
  // In step mode, kernel actor executes synchronously.
  // The in-class default is overridden by the constructor's init list.
  GraphExecutionStrategy strategy_{GraphExecutionStrategy::kPipeline};

  // The device tensors for launch (non-owning raw pointers).
  std::vector<DeviceTensor *> input_device_tensors_;
  std::vector<DeviceTensor *> output_device_tensors_;
  std::vector<DeviceTensor *> workspace_device_tensors_;
  // The received input device type may be different from the device context type in the control flow and host device
  // scenarios, so it needs to be copied from the input device type to the device context type.
  std::vector<DeviceTensorPtr> copy_input_device_tensors_;

  // The device tensors for memory alloc and free.
  // output + workspace
  std::vector<DeviceTensor *> memory_alloc_list_;
  // input + output + workspace
  std::vector<DeviceTensor *> memory_free_list_;
  // The device tensor of external reference is not the real data of this kernel, but need add to the memory_free_list_.
  std::vector<DeviceTensor *> external_reference_tensors_;

  // The kernel launch info is fetched by the device tensors.
  KernelLaunchInfo launch_info_;

  // Cache unique output data by output index to modify the output data effectively.
  std::vector<std::vector<OpDataUniquePtr<DeviceTensor>>> output_data_by_output_index_;
  // The output_data_ corresponds to the output_data_arrows_ one by one.
  std::vector<OpData<DeviceTensor> *> output_data_;
};

using KernelActorPtr = std::shared_ptr<KernelActor>;
}  // namespace runtime
}  // namespace mindspore

#endif  // MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_ACTOR_KERNEL_ACTOR_H_