/**
 * Copyright 2021-2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_ACTOR_KERNEL_ACTOR_H_
#define MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_ACTOR_KERNEL_ACTOR_H_

#include <vector>
#include <set>
#include <string>
#include <memory>
#include <utility>
#include "utils/hash_map.h"
#include "runtime/graph_scheduler/actor/actor_common.h"
#include "runtime/graph_scheduler/actor/debug_aware_actor.h"
#include "runtime/graph_scheduler/actor/kernel_async_launch_actor.h"
#include "runtime/graph_scheduler/actor/kernel_async_infer_actor.h"
#include "runtime/graph_scheduler/actor/kernel_async_resize_actor.h"
#include "runtime/hardware/device_context.h"
#include "runtime/graph_scheduler/device_tensor_store.h"
#include "kernel/kernel.h"
#include "ir/anf.h"
#include "ir/tensor.h"

namespace mindspore {
namespace runtime {
using mindspore::device::DeviceContext;
using mindspore::device::KernelInfo;
using mindspore::kernel::Address;
using mindspore::kernel::KernelLaunchAddr;
using mindspore::kernel::KernelMod;
using mindspore::kernel::KernelTensor;
using mindspore::kernel::KernelTensorPtr;
using mindspore::session::SomasInfo;
using mindspore::tensor::TensorPtr;

class SuperKernelActor;

struct InputDataInfo {
  InputDataInfo(const std::string &format, const ShapeVector &shape, size_t size, TypeId type_id)
      : format_(format), shape_(shape), size_(size), type_id_(type_id) {}
  std::string format_;
  ShapeVector shape_;
  size_t size_;
  TypeId type_id_;
};

// The kernel actor is used to receive the device tensors and control info to launch the kernel.
// The processing flow is RunOpData/RunOpControl -> CheckRunningCondition -> SendMemoryAllocReq
// -> OnMemoryAllocFinish -> LaunchKernel -> SendMemoryFreeReq -> SendOutput.
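// A minimal sketch of that flow from the scheduler's point of view (illustrative only; error
// handling is omitted, and only the method names come from this class and its bases):
//
//   actor->RunOpData(input_data, context);   // a device tensor arrives from an upstream actor
//   actor->RunOpControl(from_aid, context);  // a control arrow arrives
//   // Once CheckRunningCondition(context) is satisfied, the actor sends SendMemoryAllocReq to the
//   // memory manager, launches the kernel in OnMemoryAllocFinish/LaunchKernel, then issues
//   // SendMemoryFreeReq and forwards its outputs via SendOutput.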
class KernelActor : public DebugAwareActor {
 public:
  KernelActor(const std::string &name, const CNodePtr &kernel, const DeviceContext *device_context,
              const AID &memory_manager_aid, const AID *debug_aid, const AID *recorder_aid,
              GraphExecutionStrategy strategy, const std::set<size_t> &modifiable_ref_input_indexes,
              const std::set<size_t> &modifiable_ref_output_indexes,
              const KernelTransformType &type = KernelTransformType::kKernelActor)
      : DebugAwareActor(name, type, recorder_aid, memory_manager_aid, debug_aid, nullptr),
        kernel_(kernel),
        is_dynamic_value_(false),
        is_dynamic_type_(false),
        has_dynamic_(false),
        enable_async_infer_(false),
        kernel_info_(nullptr),
        kernel_mod_(nullptr),
        somas_info_(nullptr),
        real_input_num_(0),
        strategy_(strategy),
        modifiable_ref_input_indexes_(modifiable_ref_input_indexes),
        modifiable_ref_output_indexes_(modifiable_ref_output_indexes),
        is_launch_skipped_(false),
        inputs_continuous_memory_(false) {
    (void)device_contexts_.emplace_back(device_context);
    is_dynamic_shape_ = common::AnfAlgo::IsDynamicShape(kernel_) || common::AnfAlgo::IsDynamicSequence(kernel_);

    kernel_async_infer_aid_ = KernelAsyncInferActor::GetInstance()->GetAID();
    kernel_async_resize_aid_ = KernelAsyncResizeActor::GetInstance()->GetAID();
    kernel_async_launch_aid_ = KernelAsyncLaunchActor::GetInstance()->GetAID();
  }

  ~KernelActor() override = default;

  // The memory related operation interface.
  void SendMemoryAllocReq(OpContext<DeviceTensor> *const context) override;
  void SendMemoryFreeReq(OpContext<DeviceTensor> *const context) override;
  // The callback after memory alloc finished.
  void OnMemoryAllocFinish(OpContext<DeviceTensor> *const context) override;

  const CNodePtr &kernel() const { return kernel_; }
  const std::set<size_t> &modifiable_ref_input_indexes() const { return modifiable_ref_input_indexes_; }
  const std::set<size_t> &modifiable_ref_output_indexes() const { return modifiable_ref_output_indexes_; }
  bool is_dynamic_shape() const { return is_dynamic_shape_; }
  bool is_launch_skipped() const { return is_launch_skipped_; }
  bool inputs_continuous_memory() const { return inputs_continuous_memory_; }
  SomasInfo *somas_info() const { return somas_info_; }
  const std::set<size_t> &somas_graph_output_indexes() const { return somas_graph_output_indexes_; }

  void set_enable_async_infer(bool enable_async_infer) { enable_async_infer_ = enable_async_infer; }

  // Actually infer the shape and update the kernel tensor shapes.
  void ExecuteInferShapeTask(OpContext<DeviceTensor> *const context);
  // Actually resize the kernel mod and write the new sizes into the output and workspace kernel tensors.
  void ExecuteResizeKernelModTask(OpContext<DeviceTensor> *const context);
  // Actually launch the kernel, with memory allocation and free.
  void ExecuteLaunchKernelTask(OpContext<DeviceTensor> *const context);
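  // Division of labor of the runtime pipeline when async execution is enabled (a sketch; how the
  // async actors call back into this class is an assumption based on the AIDs cached in the
  // constructor):
  //
  //   kernel_async_infer_aid_   -> KernelAsyncInferActor   -> runs ExecuteInferShapeTask(context)
  //   kernel_async_resize_aid_  -> KernelAsyncResizeActor  -> runs ExecuteResizeKernelModTask(context)
  //   kernel_async_launch_aid_  -> KernelAsyncLaunchActor  -> runs ExecuteLaunchKernelTask(context)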
  void set_stream_send_actor(KernelActor *stream_send_actor) { stream_send_actor_ = stream_send_actor; }

  void SetInputDeviceTensor(DeviceTensor *input_device_tensor, size_t input_index);

  // Set the memory address for the tensors which use the somas.
  void SetSomasMemory(OpContext<DeviceTensor> *const context) const;

  bool skip_launch_shape_related_op() const { return skip_launch_shape_related_op_; }
  void set_skip_launch_shape_related_op(bool skip_launch_shape_related_op) {
    skip_launch_shape_related_op_ = skip_launch_shape_related_op;
  }

 protected:
  void Init() override;
  void Run(OpContext<DeviceTensor> *const context) override;
  void SendRecorderInfo(OpContext<DeviceTensor> *const context) const override;

  // Do the kernel launching in this method, between 'PreLaunchKernel' and 'PostLaunchKernel'.
  virtual bool LaunchKernel(OpContext<DeviceTensor> *const context, bool is_skip_launch = false);
  // Execute the kernel actor's multi-stream procedure to ensure memory safety before the kernel launch.
  virtual void ProcessMultiStreamBeforeKernelLaunch(OpContext<DeviceTensor> *const context);
  // Execute the kernel actor's multi-stream procedure to ensure memory safety after the kernel launch.
  virtual void ProcessMultiStreamAfterKernelLaunch(OpContext<DeviceTensor> *const context);

  // Execute infer shape, resize and kernel launch through the runtime pipeline, which is executed by
  // KernelAsyncInferActor, KernelAsyncResizeActor and KernelAsyncLaunchActor.
  void RunWithMultiPipeline(OpContext<DeviceTensor> *const context);
  // Launch the kernel asynchronously in KernelAsyncLaunchActor.
  void RunWithAsyncLaunchKernel(OpContext<DeviceTensor> *const context);

  // Infer the shape (and type) and resize the kernel mod.
  void InferAndResize(OpContext<DeviceTensor> *const context);

  // Re-infer the shape and type, and resize, before the kernel launch in dynamic scenarios.
  void InferShapeAndType();

  // Re-infer the shape and resize before the kernel launch in dynamic scenarios.
  void InferShape();

  void ResizeKernelMod();
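  // For a dynamic-shape kernel, the per-step order is therefore (a sketch; pipeline dispatch and
  // synchronization details omitted):
  //
  //   InferShapeAndType();  // refresh the shapes/types on the input kernel tensors
  //   ResizeKernelMod();    // let kernel_mod_ recompute the output and workspace sizes
  //   // ...allocate the output/workspace memory, then LaunchKernel(context).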
  // Update input_device_tensors_ by the input op data.
  void UpdateInputDeviceTensor(const OpData<DeviceTensor> *input_data, OpContext<DeviceTensor> *const context);

  // Record the output and workspace memory pointers and sizes to optimize the memory allocate/free performance in
  // the next step. Note: only used in the inference case.
  void TraceDynamicMemory();

  // The info of kernel.
  CNodePtr kernel_;
  bool is_dynamic_shape_;
  bool is_dynamic_value_;
  bool is_dynamic_type_;
  bool has_dynamic_;
  // Whether to asynchronously infer the shape and resize the kernel mod via KernelAsyncInferActor and
  // KernelAsyncResizeActor.
  bool enable_async_infer_;
  AID kernel_async_infer_aid_;
  AID kernel_async_resize_aid_;
  AID kernel_async_launch_aid_;
  KernelInfo *kernel_info_;
  KernelMod *kernel_mod_;

  // The device tensors for launch.
  std::vector<DeviceTensor *> input_device_tensors_;
  std::vector<DeviceTensor *> output_device_tensors_;
  std::vector<DeviceTensor *> workspace_device_tensors_;

  std::vector<DeviceTensor *> max_ref_cnt_output_list_;

  // The input kernel tensors for infer shape.
  std::vector<abstract::AbstractBasePtr> input_kernel_tensors_for_infer_;
  // The kernel tensors for resize and launch.
  std::vector<KernelTensor *> input_kernel_tensors_;
  std::vector<KernelTensor *> output_kernel_tensors_;
  std::vector<KernelTensor *> workspace_kernel_tensors_;

  // The received input device type and format may differ from those of the formal parameter in control flow
  // scenarios, so the input data needs to be copied into the real data that the kernel launch needs.
  std::vector<DeviceTensorPtr> copy_input_device_tensors_;
  // The real data info that the kernel launch needs, used to check the consistency of the received input data.
  std::vector<std::shared_ptr<InputDataInfo>> real_input_data_infos_;

  // The device tensors for memory alloc and free.
  // output + workspace
  std::vector<DeviceTensor *> memory_alloc_list_;
  // input + output + workspace
  std::vector<DeviceTensor *> memory_free_list_;
  // The shape-depend input list.
  std::vector<bool> depend_shape_input_list_;
  // The device tensors of external references are not the real data of this kernel, but they need to be added to
  // memory_free_list_.
  std::vector<DeviceTensor *> external_reference_tensors_;
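  // The two lists above mirror the memory requests in the processing flow (a simplified view; that the
  // inputs arrive already allocated from upstream actors is an assumption):
  //
  //   SendMemoryAllocReq(context);  // allocates memory_alloc_list_: outputs + workspace
  //   SendMemoryFreeReq(context);   // releases memory_free_list_: inputs + outputs + workspace,
  //                                 // including the external_reference_tensors_.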
  // The information used for the integration of dynamic and static memory.
  SomasInfo *somas_info_;
  // The graph output node and index use the somas info.
  std::set<size_t> somas_graph_output_indexes_;
  // Task id on stream, used for events.
  std::shared_ptr<int64_t> task_id_on_stream_ = std::make_shared<int64_t>(0L);
  // Send actor reference; points to the send actor when the current actor is a recv actor.
  KernelActor *stream_send_actor_{nullptr};
  // Flag for the stream recv actor.
  bool is_stream_recv_actor_{false};
  // Flag indicating whether the current actor is multi-stream safe, which is generated at compile time.
  bool is_multi_stream_safe_{false};

 private:
  friend class GraphScheduler;
  friend class ControlNodeScheduler;
  friend class InlineControlFlowScheduler;
  friend class SchedulerHelper;
#ifdef ENABLE_RPC_ACTOR
  friend class RpcNodeScheduler;
#endif
  friend class SuperKernelActor;

  // Init the device tensors and kernel launch info.
  void InitInputInfo();
  void InitOutputInfo();
  void InitWorkspaceInfo();
  void InitShapeDependInfo();

  // Fetch the device tensors for launch.
  void FetchInputDeviceTensor(OpContext<DeviceTensor> *const context);
  void FetchOutputDeviceTensor(OpContext<DeviceTensor> *const context);
  void FetchWorkspaceDeviceTensor();
  // A copy is needed when the data type or format of the real and formal parameters is inconsistent.
  void CopyInputDeviceTensor(const OpData<DeviceTensor> *input_data, OpContext<DeviceTensor> *const context);

  // The processing before kernel launch: update the info of the kernel launch.
  void PreLaunchKernel(OpContext<DeviceTensor> *const context);
  // The processing after kernel launch: 1. erase the input, 2. free the memory, 3. send the output.
  void PostLaunchKernel(OpContext<DeviceTensor> *const context);
  // Refresh the dynamic device tensor stores for which a copy has been triggered.
  void RefreshDeviceTensorCopyStore(OpContext<DeviceTensor> *const context);

  void *GetSomasDevicePtr(size_t offset) const;

  // Record the mem info, because the async send may free the device info.
  void SetMemInfoForDebugAndRdr();

  // The real input number of the kernel launch.
  size_t real_input_num_;

  // The execution strategy of the kernel actor.
  // In pipeline mode, the kernel actor executes asynchronously.
  // In step mode, the kernel actor executes synchronously.
  GraphExecutionStrategy strategy_{GraphExecutionStrategy::kPipeline};

  // Record the modifiable ref indexes, used to refresh the ref data that is modified during running.
  std::set<size_t> modifiable_ref_input_indexes_;
  std::set<size_t> modifiable_ref_output_indexes_;

  // Whether the kernel launch is skipped.
  bool is_launch_skipped_;

  // Recorded mem info.
  KernelLaunchAddr mem_info_;

  // The input addresses ignored by the kernel launch.
  std::vector<size_t> launch_ignored_inputs_;

  // Whether the inputs need continuous memory, used to check the legitimacy of the inputs.
  bool inputs_continuous_memory_;

  // The stream resource of the KernelActor used to launch the kernel.
  void *stream_{nullptr};

  bool is_multi_stream_process_skipped_{false};
  std::vector<std::pair<uint32_t, void *>> cross_stream_addresses_;

  // Flag for skipping the launch of shape-related operators, such as RealMakeTuple.
  // RealMakeTuple --> ShapeCalc pattern: if ShapeCalc is not value-depend on one input RealMakeTuple op, the launch
  // of that RealMakeTuple can be skipped.
  bool skip_launch_shape_related_op_{false};

  bool is_output_kernel_{false};
};

using KernelActorPtr = std::shared_ptr<KernelActor>;
}  // namespace runtime
}  // namespace mindspore

#endif  // MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_ACTOR_KERNEL_ACTOR_H_