/**
 * Copyright 2023-2024 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_ACLNN_KERNEL_MOD_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_ACLNN_KERNEL_MOD_H_
#include <vector>
#include <memory>
#include <map>
#include <string>
#include <tuple>
#include <unordered_set>
#include <utility>
#include <functional>
#include <mutex>
#include <type_traits>
#include "ops/base_operator.h"
#include "ops/op_def.h"
#include "kernel/kernel.h"
#include "plugin/factory/ms_factory.h"
#include "include/common/utils/utils.h"
#include "include/backend/mem_reuse/mem_tracker.h"
#include "runtime/pynative/op_runtime_info.h"
#include "transform/acl_ir/acl_convert.h"
#include "transform/acl_ir/op_api_exec.h"
#include "transform/acl_ir/op_api_util.h"
#include "plugin/device/ascend/hal/device/ascend_memory_manager.h"

namespace mindspore {
namespace kernel {
using aclTensor = transform::aclTensor;
using aclOpExecutor = transform::aclOpExecutor;
using CallBackFunc = std::function<void()>;
using OpApiUtil = transform::OpApiUtil;
using AclUtil = transform::AclUtil;

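// DEFINE_GET_WORKSPACE_FOR_RESIZE() injects the cached-executor helpers into a kernel class:
// - GetWorkspaceForResize(args...) hashes the op type and arguments; on a cache miss it generates an
//   executor via GEN_EXECUTOR_CUST to obtain the workspace size, otherwise it reuses the cached executor
//   via GEN_EXECUTOR_BOOST. In dynamic mode the hash is simply reset and no workspace is precomputed.
// - RunOp / RunOpSync launch the op asynchronously or synchronously. In dynamic mode the workspace is
//   allocated from (and returned to) the Ascend memory pool on the spot; otherwise it is taken from the
//   framework-provided workspace tensors.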
#define DEFINE_GET_WORKSPACE_FOR_RESIZE()  \
  template <typename... Args>  \
  void GetWorkspaceForResize(const Args &... args) {  \
    if (AclnnKernelMod::is_dynamic_) {  \
      hash_id_ = 0;  \
      return;  \
    }  \
    hash_id_ = transform::CalcOpApiHash(op_type_, args...);  \
    if (cache_hash_.count(hash_id_) == 0) {  \
      const bool use_huge_pages = false;  \
      auto return_value = GEN_EXECUTOR_CUST(op_type_, use_huge_pages, args...);  \
      UpdateWorkspace(return_value);  \
    } else {  \
      auto return_value = GEN_EXECUTOR_BOOST(op_type_, hash_id_, args...);  \
      UpdateWorkspace(return_value);  \
    }  \
  }  \
  \
  void RunOp(void *stream_ptr, const std::vector<KernelTensor *> &workspace) {  \
    if (workspace_size_list_.empty()) {  \
      RUN_OP_API_ASYNC(op_type_, nullptr, 0, executor_, stream_ptr, release_func_);  \
    } else {  \
      if (is_dynamic_) {  \
        void *device_addr = device::ascend::AscendMemoryPool::GetInstance().AllocTensorMem(workspace_size_list_[0]);  \
        device::tracker::CALL_MEMORY_TRACKER_WITH_FILE(  \
          UpdateDevicePtrInfo, device_addr, device::tracker::MemType::kWorkSpace, "AclnnWorkspace_" + op_type_);  \
        RUN_OP_API_ASYNC(op_type_, device_addr, workspace_size_list_[0], executor_, stream_ptr, release_func_);  \
        device::ascend::AscendMemoryPool::GetInstance().FreeTensorMem(device_addr);  \
      } else {  \
        if (workspace.empty()) {  \
          MS_LOG(EXCEPTION) << "Failed to allocate workspace tensor!";  \
        }  \
        auto workspace_tensor = workspace[0];  \
        if (workspace_tensor->size() != workspace_size_list_[0]) {  \
          MS_LOG(EXCEPTION) << "Please check 'GetWorkSpaceInfo' and 'Launch' func. Expected workspace size is "  \
                            << workspace_size_list_[0] << ", but got " << workspace_tensor->size();  \
        }  \
        RUN_OP_API_ASYNC(op_type_, workspace_tensor->device_ptr(), workspace_size_list_[0], executor_, stream_ptr,  \
                         release_func_);  \
      }  \
    }  \
  }  \
  \
  void RunOpSync(void *stream_ptr, const std::vector<KernelTensor *> &workspace) {  \
    if (workspace_size_list_.empty()) {  \
      RUN_OP_API_SYNC(op_type_, nullptr, 0, executor_, stream_ptr);  \
    } else {  \
      if (is_dynamic_) {  \
        void *device_addr = device::ascend::AscendMemoryPool::GetInstance().AllocTensorMem(workspace_size_list_[0]);  \
        device::tracker::CALL_MEMORY_TRACKER_WITH_FILE(  \
          UpdateDevicePtrInfo, device_addr, device::tracker::MemType::kWorkSpace, "AclnnWorkspace_" + op_type_);  \
        RUN_OP_API_SYNC(op_type_, device_addr, workspace_size_list_[0], executor_, stream_ptr);  \
        device::ascend::AscendMemoryPool::GetInstance().FreeTensorMem(device_addr);  \
      } else {  \
        if (workspace.empty()) {  \
          MS_LOG(EXCEPTION) << "Failed to allocate workspace tensor!";  \
        }  \
        auto workspace_tensor = workspace[0];  \
        if (workspace_tensor->size() != workspace_size_list_[0]) {  \
          MS_LOG(EXCEPTION) << "Please check 'GetWorkSpaceInfo' and 'Launch' func. Expected workspace size is "  \
                            << workspace_size_list_[0] << ", but got " << workspace_tensor->size();  \
        }  \
        RUN_OP_API_SYNC(op_type_, workspace_tensor->device_ptr(), workspace_size_list_[0], executor_, stream_ptr);  \
      }  \
    }  \
  }

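// Placeholder for an absent aclnn input: for tensor-type slots it builds an empty KernelTensor with
// shape {0} and the requested dtype, and owns the allocation for its own lifetime.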
class EmptyKernelTensor {
 public:
  EmptyKernelTensor() { tensor_ = new KernelTensor(); }
  EmptyKernelTensor(TypeId type_id, TypeId dtype_id) {
    if (type_id == kObjectTypeTensorType) {
      tensor_ = new KernelTensor();
      auto tensor_shape = std::make_shared<abstract::TensorShape>();
      tensor_shape->SetShapeVector({0});
      tensor_->SetType(std::make_shared<TensorType>(TypeIdToType(dtype_id)));
      tensor_->SetShape(tensor_shape);
    }
  }
  ~EmptyKernelTensor() { delete tensor_; }
  KernelTensor *get() const { return tensor_; }

 private:
  // Initialized to nullptr so the destructor is safe when the two-argument
  // constructor is called with a non-tensor type id.
  KernelTensor *tensor_{nullptr};
};

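// Base class for kernels implemented on top of aclnn op api calls. Subclasses (typically generated by
// REGISTER_ACLNN_CLASS below) query the workspace size in GetWorkSpaceInfo, then regenerate the executor
// and launch the op in Launch. Executor generation is memoized through hash_id_ / cache_hash_.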
class AclnnKernelMod : public KernelMod {
 public:
  explicit AclnnKernelMod(std::string &&op_type) : op_type_(std::move(op_type)) {}
  ~AclnnKernelMod() = default;

  bool Init(const std::vector<KernelTensor *> &inputs, const std::vector<KernelTensor *> &outputs);
  int Resize(const std::vector<KernelTensor *> &inputs, const std::vector<KernelTensor *> &outputs);

  virtual void GetWorkSpaceInfo(const std::vector<KernelTensor *> &inputs,
                                const std::vector<KernelTensor *> &outputs) {}
  virtual bool Launch(const std::vector<KernelTensor *> &inputs, const std::vector<KernelTensor *> &workspace,
                      const std::vector<KernelTensor *> &outputs, void *stream_ptr);

  void ResetDeivceAddress(const std::vector<KernelTensor *> &inputs, const std::vector<KernelTensor *> &outputs) {}

  std::vector<size_t> GetLaunchIgnoredInputAddressIdx() const override;
  bool IsNeedUpdateOutputShapeAndSize() override { return false; }
  std::vector<KernelAttr> GetOpSupport() override { MS_LOG(EXCEPTION) << "This interface is not supported in aclnn."; }

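  // The tuples consumed below come from the GEN_EXECUTOR_* macros: index 0 holds the workspace size,
  // index 1 the executor, and index 2 the release callback; the boosted five-element variant additionally
  // carries the hash id at kHashIdIndex (3) and a cache-hit flag at index 4.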
  template <typename... Args>
  void UpdateWorkspace(const std::tuple<Args...> &args) {
    auto real_workspace_size = static_cast<size_t>(std::get<0>(args));
    if (real_workspace_size != 0) {
      std::vector<size_t> workspace_size_list = {real_workspace_size};
      SetWorkspaceSizeList(workspace_size_list);
    }

    constexpr size_t kBoostGeneratorSize = 5;
    if constexpr (std::tuple_size_v<std::tuple<Args...>> == kBoostGeneratorSize) {
      hash_id_ = std::get<kHashIdIndex>(args);
    }
  }

  template <typename... Args>
  void ParseGenExecutor(const std::tuple<Args...> &args) {
    if (is_dynamic_) {
      workspace_size_list_.clear();
      size_t size = std::get<0>(args);
      if (size != 0) {
        (void)workspace_size_list_.emplace_back(size);
      }
    }

    executor_ = std::get<1>(args);
    if (executor_ == nullptr) {
      MS_LOG(INTERNAL_EXCEPTION) << "The generated aclnn executor is null, please check the op api generation!";
    }
    release_func_ = std::get<2>(args);

    constexpr size_t kBoostGeneratorSize = 5;
    if constexpr (std::tuple_size_v<std::tuple<Args...>> == kBoostGeneratorSize) {
      hash_id_ = std::get<kHashIdIndex>(args);
      if (cache_hash_.count(hash_id_) != 0) {
        return;
      }
      constexpr size_t kHitIndex = 4;
      if (std::get<kHitIndex>(args)) {
        cache_hash_.insert(hash_id_);
      }
    }
  }

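  // Switches aclnn kernels into (or out of) dynamic mode. Note that is_dynamic_ is static, so the flag
  // is shared by every aclnn kernel in the process.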
  void SetDynamic(bool is_dynamic) {
    std::lock_guard<std::mutex> lock(mtx_);
    is_dynamic_ = is_dynamic;
  }

 protected:
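  // GetKernelTuple<N> concatenates the given vectors and packs their first N elements into a std::tuple,
  // so the GEN_EXECUTOR_* macros can be invoked through std::apply with one argument per tensor.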
  template <size_t N, std::size_t... Is>
  auto GetTupleFrontImpl(const std::vector<KernelTensor *> &vecs, std::index_sequence<Is...>) {
    return std::make_tuple(vecs[Is]...);
  }

  template <size_t N>
  auto GetTupleFront(const std::vector<KernelTensor *> &vecs) {
    return GetTupleFrontImpl<N>(vecs, std::make_index_sequence<N>());
  }

  template <typename T, typename... Vecs>
  std::vector<T> ConcatVecs(const std::vector<T> &vec, const Vecs &... vecs) {
    std::vector<T> result = vec;
    (result.insert(result.end(), vecs.begin(), vecs.end()), ...);
    return result;
  }

  template <typename T, typename... Vecs>
  std::vector<T> ConcatVecs(const Vecs &... vecs) {
    static_assert((std::is_same_v<T, typename Vecs::value_type> && ...), "All vectors must have the same type!");
    std::vector<T> result;
    (result.insert(result.end(), vecs.begin(), vecs.end()), ...);
    return result;
  }

  template <size_t N, typename... Ts>
  auto GetKernelTuple(const std::vector<Ts> &... vecs) {
    const auto &new_vec = ConcatVecs(vecs...);
    if (new_vec.size() != N) {
      MS_LOG(EXCEPTION) << op_type_ << "'s configured input and output size must be " << N << ", but got "
                        << new_vec.size();
    }
    const auto &result = GetTupleFront<N>(new_vec);
    return result;
  }

  aclOpExecutor *executor_{nullptr};
  CallBackFunc release_func_{nullptr};
  std::string op_type_;
  uint64_t hash_id_{0};                      // hash of op type and arguments, used as the executor cache key
  std::unordered_set<uint64_t> cache_hash_;  // hash ids whose executors have been cached
  static bool is_dynamic_;                   // process-wide dynamic-workspace flag, see SetDynamic()
  std::mutex mtx_;

  static constexpr size_t kWsSizeIndex = 0;
  static constexpr size_t kHashIdIndex = 3;
};

using AclnnKernelModPtr = std::shared_ptr<AclnnKernelMod>;
using AclnnKernelModPtrList = std::vector<AclnnKernelModPtr>;

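// REGISTER_ACLNN_CLASS(TYPE) generates Aclnn##TYPE##KernelMod<N>, where N is the total number of input
// and output tensors passed to the op api. The generated GetWorkSpaceInfo and Launch reuse the cached
// executor machinery defined above.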
#define REGISTER_ACLNN_CLASS(TYPE)  \
  template <size_t N>  \
  class Aclnn##TYPE##KernelMod : public AclnnKernelMod {  \
   public:  \
    explicit Aclnn##TYPE##KernelMod(std::string &&op_type) : AclnnKernelMod(std::move(op_type)) {}  \
    ~Aclnn##TYPE##KernelMod() = default;  \
    void GetWorkSpaceInfo(const std::vector<KernelTensor *> &inputs,  \
                          const std::vector<KernelTensor *> &outputs) override {  \
      const auto &res_tuple = this->GetKernelTuple<N>(inputs, outputs);  \
      std::apply(  \
        [this](const auto &... args) {  \
          if (AclnnKernelMod::is_dynamic_) {  \
            hash_id_ = 0;  \
            return;  \
          }  \
          hash_id_ = transform::CalcOpApiHash(op_type_, args...);  \
          if (cache_hash_.count(hash_id_) == 0) {  \
            const bool use_huge_pages = false;  \
            auto return_value = GEN_EXECUTOR_CUST(op_type_, use_huge_pages, args...);  \
            UpdateWorkspace(return_value);  \
          } else {  \
            auto return_value = GEN_EXECUTOR_BOOST(op_type_, hash_id_, args...);  \
            UpdateWorkspace(return_value);  \
          }  \
        },  \
        res_tuple);  \
    }  \
    bool Launch(const std::vector<KernelTensor *> &inputs, const std::vector<KernelTensor *> &workspace,  \
                const std::vector<KernelTensor *> &outputs, void *stream_ptr) override {  \
      this->ParseGenExecutor(GenExecutor(inputs, outputs));  \
      RunOp(stream_ptr, workspace);  \
      return true;  \
    }  \
  \
   private:  \
    template <typename... Ts>  \
    auto GenExecutor(const std::vector<Ts> &... vecs) {  \
      const auto &op_type = this->op_type_;  \
      const auto &hash_id = this->hash_id_;  \
      const auto &res_tuple = this->GetKernelTuple<N>(vecs...);  \
      auto executor_info = std::apply(  \
        [&op_type, &hash_id](const auto &... args) { return GEN_EXECUTOR_BOOST(op_type, hash_id, args...); },  \
        res_tuple);  \
      return executor_info;  \
    }  \
  \
    void RunOp(void *stream_ptr, const std::vector<KernelTensor *> &workspace) {  \
      if (workspace_size_list_.empty()) {  \
        RUN_OP_API_ASYNC(op_type_, nullptr, 0, executor_, stream_ptr, release_func_);  \
      } else {  \
        if (is_dynamic_) {  \
          void *device_addr = device::ascend::AscendMemoryPool::GetInstance().AllocTensorMem(workspace_size_list_[0]);  \
          device::tracker::CALL_MEMORY_TRACKER_WITH_FILE(  \
            UpdateDevicePtrInfo, device_addr, device::tracker::MemType::kWorkSpace, "AclnnWorkspace_" + op_type_);  \
          RUN_OP_API_ASYNC(op_type_, device_addr, workspace_size_list_[0], executor_, stream_ptr, release_func_);  \
          device::ascend::AscendMemoryPool::GetInstance().FreeTensorMem(device_addr);  \
        } else {  \
          if (workspace.empty()) {  \
            MS_LOG(EXCEPTION) << "Failed to allocate workspace tensor!";  \
          }  \
          auto workspace_tensor = workspace[0];  \
          if (workspace_tensor->size() != workspace_size_list_[0]) {  \
            MS_LOG(EXCEPTION) << "Please check 'GetWorkSpaceInfo' and 'Launch' func. Expected workspace size is "  \
                              << workspace_size_list_[0] << ", but got " << workspace_tensor->size();  \
          }  \
          RUN_OP_API_ASYNC(op_type_, workspace_tensor->device_ptr(), workspace_size_list_[0], executor_, stream_ptr,  \
                           release_func_);  \
        }  \
      }  \
    }  \
  };

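// A hypothetical registration for an op "Foo" backed by the op api "aclnnFoo", taking two inputs and
// producing one output (N = 3), would look like:
//   MS_ACLNN_COMMON_KERNEL_FACTORY_REG(Foo, aclnnFoo, 3)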
#define MS_ACLNN_KERNEL_FACTORY_REG(NAME, DERIVE_CLASS) MS_KERNEL_FACTORY_REG(AclnnKernelMod, NAME, DERIVE_CLASS)
#define MS_ACLNN_COMMON_KERNEL_FACTORY_REG(NAME, TYPE, N)                     \
  REGISTER_ACLNN_CLASS(NAME)                                                  \
  static const KernelRegistrar<AclnnKernelMod> g_##NAME##_AclnnKernelMod_reg( \
    #NAME, []() { return std::make_shared<Aclnn##NAME##KernelMod<N>>(#TYPE); });
}  // namespace kernel
}  // namespace mindspore

#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_ACLNN_KERNEL_MOD_H_