• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2021-2023 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_ACTOR_KERNEL_ACTOR_H_
18 #define MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_ACTOR_KERNEL_ACTOR_H_
19 
20 #include <vector>
21 #include <set>
22 #include <string>
23 #include <memory>
24 #include <utility>
25 #include "utils/hash_map.h"
26 #include "runtime/graph_scheduler/actor/actor_common.h"
27 #include "runtime/graph_scheduler/actor/debug_aware_actor.h"
28 #include "runtime/graph_scheduler/actor/kernel_async_launch_actor.h"
29 #include "runtime/graph_scheduler/actor/kernel_async_infer_actor.h"
30 #include "runtime/graph_scheduler/actor/kernel_async_resize_actor.h"
31 #include "runtime/hardware/device_context.h"
32 #include "runtime/graph_scheduler/device_tensor_store.h"
33 #include "kernel/kernel.h"
34 #include "ir/anf.h"
35 #include "ir/tensor.h"
36 
37 namespace mindspore {
38 namespace runtime {
39 using mindspore::device::DeviceContext;
40 using mindspore::device::KernelInfo;
41 using mindspore::kernel::Address;
42 using mindspore::kernel::KernelLaunchAddr;
43 using mindspore::kernel::KernelMod;
44 using mindspore::kernel::KernelTensor;
45 using mindspore::kernel::KernelTensorPtr;
46 using mindspore::session::SomasInfo;
47 using mindspore::tensor::TensorPtr;
48 
49 class SuperKernelActor;
50 
51 struct InputDataInfo {
InputDataInfoInputDataInfo52   InputDataInfo(const std::string &format, const ShapeVector &shape, size_t size, TypeId type_id)
53       : format_(format), shape_(shape), size_(size), type_id_(type_id) {}
54   std::string format_;
55   ShapeVector shape_;
56   size_t size_;
57   TypeId type_id_;
58 };
59 
60 // The kernel actor is used to receive the device tensors and control info to luanch kernel.
61 // The processing flow is RunOpData/RunOpControl -> CheckRunningCondition -> SendMemoryAllocReq
62 // -> OnMemoryAllocFinish -> LaunchKernel -> SendMemoryFreeReq -> SendOutput.
63 class KernelActor : public DebugAwareActor {
64  public:
65   KernelActor(const std::string &name, const CNodePtr &kernel, const DeviceContext *device_context,
66               const AID &memory_manager_aid, const AID *debug_aid, const AID *recorder_aid,
67               GraphExecutionStrategy strategy, const std::set<size_t> &modifiable_ref_input_indexes,
68               const std::set<size_t> &modifiable_ref_output_indexes,
69               const KernelTransformType &type = KernelTransformType::kKernelActor)
DebugAwareActor(name,type,recorder_aid,memory_manager_aid,debug_aid,nullptr)70       : DebugAwareActor(name, type, recorder_aid, memory_manager_aid, debug_aid, nullptr),
71         kernel_(kernel),
72         is_dynamic_value_(false),
73         is_dynamic_type_(false),
74         has_dynamic_(false),
75         enable_async_infer_(false),
76         kernel_info_(nullptr),
77         kernel_mod_(nullptr),
78         somas_info_(nullptr),
79         real_input_num_(0),
80         strategy_(strategy),
81         modifiable_ref_input_indexes_(modifiable_ref_input_indexes),
82         modifiable_ref_output_indexes_(modifiable_ref_output_indexes),
83         is_launch_skipped_(false),
84         inputs_continuous_memory_(false) {
85     (void)device_contexts_.emplace_back(device_context);
86     is_dynamic_shape_ = common::AnfAlgo::IsDynamicShape(kernel_) || common::AnfAlgo::IsDynamicSequence(kernel_);
87 
88     kernel_async_infer_aid_ = KernelAsyncInferActor::GetInstance()->GetAID();
89     kernel_async_resize_aid_ = KernelAsyncResizeActor::GetInstance()->GetAID();
90     kernel_async_launch_aid_ = KernelAsyncLaunchActor::GetInstance()->GetAID();
91   }
92 
93   ~KernelActor() override = default;
94 
95   // The memory related operation interface.
96   void SendMemoryAllocReq(OpContext<DeviceTensor> *const context) override;
97   void SendMemoryFreeReq(OpContext<DeviceTensor> *const context) override;
98   // The callback after memory alloc finished.
99   void OnMemoryAllocFinish(OpContext<DeviceTensor> *const context) override;
100 
kernel()101   const CNodePtr &kernel() const { return kernel_; }
modifiable_ref_input_indexes()102   const std::set<size_t> &modifiable_ref_input_indexes() const { return modifiable_ref_input_indexes_; }
modifiable_ref_output_indexes()103   const std::set<size_t> &modifiable_ref_output_indexes() const { return modifiable_ref_output_indexes_; }
is_dynamic_shape()104   bool is_dynamic_shape() const { return is_dynamic_shape_; }
is_launch_skipped()105   bool is_launch_skipped() const { return is_launch_skipped_; }
inputs_continuous_memory()106   bool inputs_continuous_memory() const { return inputs_continuous_memory_; }
somas_info()107   SomasInfo *somas_info() const { return somas_info_; }
somas_graph_output_indexes()108   const std::set<size_t> &somas_graph_output_indexes() const { return somas_graph_output_indexes_; }
109 
set_enable_async_infer(bool enable_async_infer)110   void set_enable_async_infer(bool enable_async_infer) { enable_async_infer_ = enable_async_infer; }
111 
112   // Really do infer shape and update kernel tensor shape.
113   void ExecuteInferShapeTask(OpContext<DeviceTensor> *const context);
114   // Really do resize kernel mod and update new size into output and workspace kernel tensors.
115   void ExecuteResizeKernelModTask(OpContext<DeviceTensor> *const context);
116   // Really do launch kernel with memory allocate and free.
117   void ExecuteLaunchKernelTask(OpContext<DeviceTensor> *const context);
118 
set_stream_send_actor(KernelActor * stream_send_actor)119   void set_stream_send_actor(KernelActor *stream_send_actor) { stream_send_actor_ = stream_send_actor; }
120 
121   void SetInputDeviceTensor(DeviceTensor *input_device_tensor, size_t input_index);
122 
123   // Set the memory address for the tensors which use the somas.
124   void SetSomasMemory(OpContext<DeviceTensor> *const context) const;
125 
skip_launch_shape_related_op()126   bool skip_launch_shape_related_op() const { return skip_launch_shape_related_op_; }
set_skip_launch_shape_related_op(bool skip_launch_shape_related_op)127   void set_skip_launch_shape_related_op(bool skip_launch_shape_related_op) {
128     skip_launch_shape_related_op_ = skip_launch_shape_related_op;
129   }
130 
131  protected:
132   void Init() override;
133   void Run(OpContext<DeviceTensor> *const context) override;
134   void SendRecorderInfo(OpContext<DeviceTensor> *const context) const override;
135 
136   // Do kernel launching in this method after 'PreLaunchKernel' and 'PostLaunchKernel'.
137   virtual bool LaunchKernel(OpContext<DeviceTensor> *const context, bool is_skip_launch = false);
138   // Execute kernel actor multi stream produre to make sure safety of memory before kernel launch.
139   virtual void ProcessMultiStreamBeforeKernelLaunch(OpContext<DeviceTensor> *const context);
140   // Execute kernel actor multi stream produre to make sure safety of memory after kernel launch.
141   virtual void ProcessMultiStreamAfterKernelLaunch(OpContext<DeviceTensor> *const context);
142 
143   // Execute infer shape, resize and launch kernel by runtime pipeline which executes by KernelAsyncInferActor,
144   // KernelAsyncResizeActor and KernelAsyncLaunchActor.
145   void RunWithMultiPipeline(OpContext<DeviceTensor> *const context);
146   // Execute launch kernel asynchronously in KernelAsyncLaunchActor.
147   void RunWithAsyncLaunchKernel(OpContext<DeviceTensor> *const context);
148 
149   // Infer shape(and type) and resize kernel mod.
150   void InferAndResize(OpContext<DeviceTensor> *const context);
151 
152   // Re-Infer shape, type and resize before kernel launch in dynamic scenarios.
153   void InferShapeAndType();
154 
155   // Re-InferShape and resize before kernel launch in dynamic scenarios.
156   void InferShape();
157 
158   void ResizeKernelMod();
159 
160   // Update input_device_tensors by input op data.
161   void UpdateInputDeviceTensor(const OpData<DeviceTensor> *input_data, OpContext<DeviceTensor> *const context);
162 
163   // Record the output and workspace memory pointer and size to optimize memory allocate/free performance in next step.
164   // Note: only use in inference case.
165   void TraceDynamicMemory();
166 
167   // The info of kernel.
168   CNodePtr kernel_;
169   bool is_dynamic_shape_;
170   bool is_dynamic_value_;
171   bool is_dynamic_type_;
172   bool has_dynamic_;
173   // Whether enable asynchronously infer shape and resize kernel mod by KernelInferActor and KernelResizeActor.
174   bool enable_async_infer_;
175   AID kernel_async_infer_aid_;
176   AID kernel_async_resize_aid_;
177   AID kernel_async_launch_aid_;
178   KernelInfo *kernel_info_;
179   KernelMod *kernel_mod_;
180 
181   // The device tensors for launch.
182   std::vector<DeviceTensor *> input_device_tensors_;
183   std::vector<DeviceTensor *> output_device_tensors_;
184   std::vector<DeviceTensor *> workspace_device_tensors_;
185 
186   std::vector<DeviceTensor *> max_ref_cnt_output_list_;
187 
188   // The input kernel tensors for infer shape.
189   std::vector<abstract::AbstractBasePtr> input_kernel_tensors_for_infer_;
190   // The kernel tensors for resize and launch.
191   std::vector<KernelTensor *> input_kernel_tensors_;
192   std::vector<KernelTensor *> output_kernel_tensors_;
193   std::vector<KernelTensor *> workspace_kernel_tensors_;
194 
195   // The received input device type and format may be different from the formal parameter in the control flow
196   // scenarios, so it needs to be copied from the input data to real data that kernel launch needs.
197   std::vector<DeviceTensorPtr> copy_input_device_tensors_;
198   // Real data info that kernel launch needs, used to check the consistency of received input data.
199   std::vector<std::shared_ptr<InputDataInfo>> real_input_data_infos_;
200 
201   // The device tensors for memory alloc and free.
202   // output + workspace
203   std::vector<DeviceTensor *> memory_alloc_list_;
204   // input + output + workspace
205   std::vector<DeviceTensor *> memory_free_list_;
206   // depend shape input list
207   std::vector<bool> depend_shape_input_list_;
208   // The device tensor of external reference is not the real data of this kernel, but need add to the
209   // memory_free_list_.
210   std::vector<DeviceTensor *> external_reference_tensors_;
211 
212   // The information used for integration of dynamic and static memory.
213   SomasInfo *somas_info_;
214   // The graph output node and index use somas info.
215   std::set<size_t> somas_graph_output_indexes_;
216   // Task id on stream, use for events.
217   std::shared_ptr<int64_t> task_id_on_stream_ = std::make_shared<int64_t>(0L);
218   // Send actor ref, point to the send actor when current actor is recv actor.
219   KernelActor *stream_send_actor_{nullptr};
220   // Flag for stream recv actor.
221   bool is_stream_recv_actor_{false};
222   // Flag for indicating if current actor is multi-thread safe, which was generate at compile time.
223   bool is_multi_stream_safe_{false};
224 
225  private:
226   friend class GraphScheduler;
227   friend class ControlNodeScheduler;
228   friend class InlineControlFlowScheduler;
229   friend class SchedulerHelper;
230 #ifdef ENABLE_RPC_ACTOR
231   friend class RpcNodeScheduler;
232 #endif
233   friend class SuperKernelActor;
234 
235   // Init the device tensors and kernel launch info.
236   void InitInputInfo();
237   void InitOutputInfo();
238   void InitWorkspaceInfo();
239   void InitShapeDependInfo();
240 
241   // Fetch the device tensor for launch.
242   void FetchInputDeviceTensor(OpContext<DeviceTensor> *const context);
243   void FetchOutputDeviceTensor(OpContext<DeviceTensor> *const context);
244   void FetchWorkspaceDeviceTensor();
245   // Need copy when the data type or format between real parameters and formal parameters are inconsistent.
246   void CopyInputDeviceTensor(const OpData<DeviceTensor> *input_data, OpContext<DeviceTensor> *const context);
247 
248   // The processing before kernel launch: update the info of kernel launch.
249   void PreLaunchKernel(OpContext<DeviceTensor> *const context);
250   // The processing after kernel launch: 1.erase input, 2.free memory, 3.send output.
251   void PostLaunchKernel(OpContext<DeviceTensor> *const context);
252   // Back refresh the dynamic device tensor stores that have been triggered copy.
253   void RefreshDeviceTensorCopyStore(OpContext<DeviceTensor> *const context);
254 
255   void *GetSomasDevicePtr(size_t offset) const;
256 
257   // Record mem info, because async send may free device info.
258   void SetMemInfoForDebugAndRdr();
259 
260   // The real input number of kernel launch.
261   size_t real_input_num_;
262 
263   // The execution strategy of kernel actor.
264   // In pipeline mode, kernel actor executes asynchronously.
265   // In step mode, kernel actor executes synchronously.
266   GraphExecutionStrategy strategy_{GraphExecutionStrategy::kPipeline};
267 
268   // Record the modifiable ref indexes. Used to refresh the ref data which are modified in the running.
269   std::set<size_t> modifiable_ref_input_indexes_;
270   std::set<size_t> modifiable_ref_output_indexes_;
271 
272   // Whether skip the kernel launch.
273   bool is_launch_skipped_;
274 
275   // Recoreded mem info.
276   KernelLaunchAddr mem_info_;
277 
278   // The ignore input addresses when the kernel launch.
279   std::vector<size_t> launch_ignored_inputs_;
280 
281   // Whether the inputs need continuous memory, used to check the inputs legitimacy.
282   bool inputs_continuous_memory_;
283 
284   // The stream resource of the KernelActor to launch kernel.
285   void *stream_{nullptr};
286 
287   bool is_multi_stream_process_skipped_{false};
288   std::vector<std::pair<uint32_t, void *>> cross_stream_addresses_;
289 
290   // Flag for skipping launch shape related operator, such as RealMakeTuple.
291   // RealMakeTuple --> ShapeCalc pattern: if ShapeCalc is not value depend for one input RealMakeTuple op, we can skip
292   // launch this RealMakeTuple.
293   bool skip_launch_shape_related_op_{false};
294 
295   bool is_output_kernel_{false};
296 };
297 
298 using KernelActorPtr = std::shared_ptr<KernelActor>;
299 }  // namespace runtime
300 }  // namespace mindspore
301 
302 #endif  // MINDSPORE_CCSRC_RUNTIME_FRAMEWORK_ACTOR_KERNEL_ACTOR_H_
303