/**
 * Copyright 2023-2024 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_ACLNN_KERNEL_MOD_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_ACLNN_KERNEL_MOD_H_
#include <vector>
#include <memory>
#include <map>
#include <string>
#include <tuple>
#include <unordered_set>
#include <utility>
#include "ops/base_operator.h"
#include "ops/op_def.h"
#include "kernel/kernel.h"
#include "plugin/factory/ms_factory.h"
#include "include/common/utils/utils.h"
#include "include/backend/mem_reuse/mem_tracker.h"
#include "runtime/pynative/op_runtime_info.h"
#include "transform/acl_ir/acl_convert.h"
#include "transform/acl_ir/op_api_exec.h"
#include "transform/acl_ir/op_api_util.h"
#include "plugin/device/ascend/hal/device/ascend_memory_manager.h"

namespace mindspore {
namespace kernel {
using aclTensor = transform::aclTensor;
using aclOpExecutor = transform::aclOpExecutor;
using CallBackFunc = std::function<void()>;
using OpApiUtil = transform::OpApiUtil;
using AclUtil = transform::AclUtil;

// Injects GetWorkspaceForResize/RunOp/RunOpSync helpers into an AclnnKernelMod subclass: the workspace size is
// generated at resize time (cached by op-api hash), and the op is launched with either a pooled workspace
// (dynamic mode) or the pre-allocated workspace tensor.
#define DEFINE_GET_WORKSPACE_FOR_RESIZE() \
  template <typename... Args> \
  void GetWorkspaceForResize(const Args &... args) { \
    if (AclnnKernelMod::is_dynamic_) { \
      hash_id_ = 0; \
      return; \
    } \
    hash_id_ = transform::CalcOpApiHash(op_type_, args...); \
    if (cache_hash_.count(hash_id_) == 0) { \
      const bool use_huge_pages = false; \
      auto return_value = GEN_EXECUTOR_CUST(op_type_, use_huge_pages, args...); \
      UpdateWorkspace(return_value); \
    } else { \
      auto return_value = GEN_EXECUTOR_BOOST(op_type_, hash_id_, args...); \
      UpdateWorkspace(return_value); \
    } \
  } \
 \
  void RunOp(void *stream_ptr, const std::vector<KernelTensor *> &workspace) { \
    if (workspace_size_list_.empty()) { \
      RUN_OP_API_ASYNC(op_type_, nullptr, 0, executor_, stream_ptr, release_func_); \
    } else { \
      if (is_dynamic_) { \
        void *device_addr = device::ascend::AscendMemoryPool::GetInstance().AllocTensorMem(workspace_size_list_[0]); \
        device::tracker::CALL_MEMORY_TRACKER_WITH_FILE( \
          UpdateDevicePtrInfo, device_addr, device::tracker::MemType::kWorkSpace, "AclnnWorkspace_" + op_type_); \
        RUN_OP_API_ASYNC(op_type_, device_addr, workspace_size_list_[0], executor_, stream_ptr, release_func_); \
        device::ascend::AscendMemoryPool::GetInstance().FreeTensorMem(device_addr); \
      } else { \
        if (workspace.empty()) { \
          MS_LOG(EXCEPTION) << "Failed to allocate workspace tensor!"; \
        } \
        auto workspace_tensor = workspace[0]; \
        if (workspace_tensor->size() != workspace_size_list_[0]) { \
          MS_LOG(EXCEPTION) << "Please check 'GetWorkSpaceInfo' and 'Launch' func. Expected workspace size is " \
                            << workspace_size_list_[0] << ", but got " << workspace_tensor->size(); \
        } \
        RUN_OP_API_ASYNC(op_type_, workspace_tensor->device_ptr(), workspace_size_list_[0], executor_, stream_ptr, \
                         release_func_); \
      } \
    } \
  } \
 \
  void RunOpSync(void *stream_ptr, const std::vector<KernelTensor *> &workspace) { \
    if (workspace_size_list_.empty()) { \
      RUN_OP_API_SYNC(op_type_, nullptr, 0, executor_, stream_ptr); \
    } else { \
      if (is_dynamic_) { \
        void *device_addr = device::ascend::AscendMemoryPool::GetInstance().AllocTensorMem(workspace_size_list_[0]); \
        device::tracker::CALL_MEMORY_TRACKER_WITH_FILE( \
          UpdateDevicePtrInfo, device_addr, device::tracker::MemType::kWorkSpace, "AclnnWorkspace_" + op_type_); \
        RUN_OP_API_SYNC(op_type_, device_addr, workspace_size_list_[0], executor_, stream_ptr); \
        device::ascend::AscendMemoryPool::GetInstance().FreeTensorMem(device_addr); \
      } else { \
        if (workspace.empty()) { \
          MS_LOG(EXCEPTION) << "Failed to allocate workspace tensor!"; \
        } \
        auto workspace_tensor = workspace[0]; \
        if (workspace_tensor->size() != workspace_size_list_[0]) { \
          MS_LOG(EXCEPTION) << "Please check 'GetWorkSpaceInfo' and 'Launch' func. Expected workspace size is " \
                            << workspace_size_list_[0] << ", but got " << workspace_tensor->size(); \
        } \
        RUN_OP_API_SYNC(op_type_, workspace_tensor->device_ptr(), workspace_size_list_[0], executor_, stream_ptr); \
      } \
    } \
  }
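
// Illustrative only: a kernel class that expands this macro calls, e.g.,
//   GetWorkspaceForResize(inputs[0], outputs[0]);
// from its GetWorkSpaceInfo() override (a single input and output are assumed here); the computed size
// reaches workspace_size_list_ through UpdateWorkspace()/SetWorkspaceSizeList().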

// RAII holder for a placeholder KernelTensor (shape {0} when a tensor type is requested).
class EmptyKernelTensor {
 public:
  EmptyKernelTensor() { tensor_ = new KernelTensor(); }
  EmptyKernelTensor(TypeId type_id, TypeId dtype_id) {
    if (type_id == kObjectTypeTensorType) {
      tensor_ = new KernelTensor();
      auto tensor_shape = std::make_shared<abstract::TensorShape>();
      tensor_shape->SetShapeVector({0});
      tensor_->SetType(std::make_shared<TensorType>(TypeIdToType(dtype_id)));
      tensor_->SetShape(tensor_shape);
    }
  }
  ~EmptyKernelTensor() { delete tensor_; }
  KernelTensor *get() const { return tensor_; }

 private:
  // Initialized to nullptr so the destructor stays safe when the two-argument constructor
  // does not allocate (i.e. when type_id is not kObjectTypeTensorType).
  KernelTensor *tensor_{nullptr};
};
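
// Illustrative only (kNumberTypeFloat32 is just an example dtype): a placeholder built this way can be
// passed where a KernelTensor pointer is expected but no real tensor is available:
//   EmptyKernelTensor placeholder(kObjectTypeTensorType, kNumberTypeFloat32);
//   KernelTensor *empty_arg = placeholder.get();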

// Base class for aclnn (op-api) kernel mods: caches generated executors by op-api hash, tracks the
// required workspace size, and launches through RUN_OP_API_ASYNC / RUN_OP_API_SYNC.
class AclnnKernelMod : public KernelMod {
 public:
  explicit AclnnKernelMod(std::string &&op_type) : op_type_(std::move(op_type)) {}
  ~AclnnKernelMod() = default;

  bool Init(const std::vector<KernelTensor *> &inputs, const std::vector<KernelTensor *> &outputs);
  int Resize(const std::vector<KernelTensor *> &inputs, const std::vector<KernelTensor *> &outputs);

  virtual void GetWorkSpaceInfo(const std::vector<KernelTensor *> &inputs,
                                const std::vector<KernelTensor *> &outputs) {}
  virtual bool Launch(const std::vector<KernelTensor *> &inputs, const std::vector<KernelTensor *> &workspace,
                      const std::vector<KernelTensor *> &outputs, void *stream_ptr);

  void ResetDeivceAddress(const std::vector<KernelTensor *> &inputs, const std::vector<KernelTensor *> &outputs) {}

  std::vector<size_t> GetLaunchIgnoredInputAddressIdx() const override;
  bool IsNeedUpdateOutputShapeAndSize() override { return false; }
  std::vector<KernelAttr> GetOpSupport() override {
    MS_LOG(EXCEPTION) << "This interface is not supported in aclnn.";
  }

  template <typename... Args>
  void UpdateWorkspace(const std::tuple<Args...> &args) {
    auto real_workspace_size = static_cast<size_t>(std::get<0>(args));
    if (real_workspace_size != 0) {
      std::vector<size_t> workspace_size_list = {real_workspace_size};
      SetWorkspaceSizeList(workspace_size_list);
    }

    constexpr size_t kBoostGeneratorSize = 5;
    if constexpr (std::tuple_size_v<std::tuple<Args...>> == kBoostGeneratorSize) {
      hash_id_ = std::get<kHashIdIndex>(args);
    }
  }

  template <typename... Args>
  void ParseGenExecutor(const std::tuple<Args...> &args) {
    if (is_dynamic_) {
      workspace_size_list_.clear();
      size_t size = std::get<0>(args);
      if (size != 0) {
        (void)workspace_size_list_.emplace_back(size);
      }
    }

    executor_ = std::get<1>(args);
    if (executor_ == nullptr) {
      MS_LOG(INTERNAL_EXCEPTION) << "Please check op api's executor generation!";
    }
    release_func_ = std::get<2>(args);

    constexpr size_t kBoostGeneratorSize = 5;
    if constexpr (std::tuple_size_v<std::tuple<Args...>> == kBoostGeneratorSize) {
      hash_id_ = std::get<kHashIdIndex>(args);
      if (cache_hash_.count(hash_id_) != 0) {
        return;
      }
      constexpr size_t kHitIndex = 4;
      if (std::get<kHitIndex>(args)) {
        cache_hash_.insert(hash_id_);
      }
    }
  }

  void SetDynamic(bool is_dynamic) {
    std::lock_guard<std::mutex> lock(mtx_);
    is_dynamic_ = is_dynamic;
  }

 protected:
  template <size_t N, std::size_t... Is>
  auto GetTupleFrontImpl(const std::vector<KernelTensor *> &vecs, std::index_sequence<Is...>) {
    return std::make_tuple(vecs[Is]...);
  }

  template <size_t N>
  auto GetTupleFront(const std::vector<KernelTensor *> &vecs) {
    return GetTupleFrontImpl<N>(vecs, std::make_index_sequence<N>());
  }

  template <typename T, typename... Vecs>
  std::vector<T> ConcatVecs(const std::vector<T> &vec, const Vecs &... vecs) {
    std::vector<T> result = vec;
    (result.insert(result.end(), vecs.begin(), vecs.end()), ...);
    return result;
  }

  template <typename T, typename... Vecs>
  std::vector<T> ConcatVecs(const Vecs &... vecs) {
    static_assert((std::is_same_v<T, typename Vecs::value_type> && ...), "All vectors must have the same type!");
    std::vector<T> result;
    (result.insert(result.end(), vecs.begin(), vecs.end()), ...);
    return result;
  }

  // Concatenates the given kernel tensor vectors and packs exactly N elements into a tuple.
  template <size_t N, typename... Ts>
  auto GetKernelTuple(const std::vector<Ts> &... vecs) {
    const auto &new_vec = ConcatVecs(vecs...);
    if (new_vec.size() != N) {
      MS_LOG(EXCEPTION) << op_type_ << " is configured with " << N << " inputs/outputs, but got "
                        << new_vec.size();
    }
    const auto &result = GetTupleFront<N>(new_vec);
    return result;
  }

  aclOpExecutor *executor_{nullptr};
  CallBackFunc release_func_{nullptr};
  std::string op_type_;
  uint64_t hash_id_{0};
  std::unordered_set<uint64_t> cache_hash_;
  static bool is_dynamic_;
  std::mutex mtx_;

  static constexpr size_t kWsSizeIndex = 0;
  static constexpr size_t kHashIdIndex = 3;
};

using AclnnKernelModPtr = std::shared_ptr<AclnnKernelMod>;
using AclnnKernelModPtrList = std::vector<AclnnKernelModPtr>;
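
// Illustrative sketch of a hand-written kernel built on AclnnKernelMod; "Foo" and "aclnnFoo" are
// hypothetical names and a single input/output is assumed (the generated class below covers the case
// where every input/output is forwarded in order):
//
//   class FooAclnn : public AclnnKernelMod {
//    public:
//     FooAclnn() : AclnnKernelMod("aclnnFoo") {}
//     void GetWorkSpaceInfo(const std::vector<KernelTensor *> &inputs,
//                           const std::vector<KernelTensor *> &outputs) override {
//       GetWorkspaceForResize(inputs[0], outputs[0]);
//     }
//     bool Launch(const std::vector<KernelTensor *> &inputs, const std::vector<KernelTensor *> &workspace,
//                 const std::vector<KernelTensor *> &outputs, void *stream_ptr) override {
//       ParseGenExecutor(GEN_EXECUTOR_BOOST(op_type_, hash_id_, inputs[0], outputs[0]));
//       RunOp(stream_ptr, workspace);
//       return true;
//     }
//     DEFINE_GET_WORKSPACE_FOR_RESIZE()
//   };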

// Generates an Aclnn<TYPE>KernelMod subclass that packs its N input/output kernel tensors into the
// op-api call: GetWorkSpaceInfo() caches the executor and workspace size, Launch() runs the op.
#define REGISTER_ACLNN_CLASS(TYPE) \
  template <size_t N> \
  class Aclnn##TYPE##KernelMod : public AclnnKernelMod { \
   public: \
    explicit Aclnn##TYPE##KernelMod(std::string &&op_type) : AclnnKernelMod(std::move(op_type)) {} \
    ~Aclnn##TYPE##KernelMod() = default; \
    void GetWorkSpaceInfo(const std::vector<KernelTensor *> &inputs, \
                          const std::vector<KernelTensor *> &outputs) override { \
      const auto &res_tuple = this->GetKernelTuple<N>(inputs, outputs); \
      std::apply( \
        [this](const auto &... args) { \
          if (AclnnKernelMod::is_dynamic_) { \
            hash_id_ = 0; \
            return; \
          } \
          hash_id_ = transform::CalcOpApiHash(op_type_, args...); \
          if (cache_hash_.count(hash_id_) == 0) { \
            const bool use_huge_pages = false; \
            auto return_value = GEN_EXECUTOR_CUST(op_type_, use_huge_pages, args...); \
            UpdateWorkspace(return_value); \
          } else { \
            auto return_value = GEN_EXECUTOR_BOOST(op_type_, hash_id_, args...); \
            UpdateWorkspace(return_value); \
          } \
        }, \
        res_tuple); \
    } \
    bool Launch(const std::vector<KernelTensor *> &inputs, const std::vector<KernelTensor *> &workspace, \
                const std::vector<KernelTensor *> &outputs, void *stream_ptr) override { \
      this->ParseGenExecutor(GenExecutor(inputs, outputs)); \
      RunOp(stream_ptr, workspace); \
      return true; \
    } \
 \
   private: \
    template <typename... Ts> \
    auto GenExecutor(const std::vector<Ts> &... vecs) { \
      const auto &op_type = this->op_type_; \
      const auto &hash_id = this->hash_id_; \
      const auto &res_tuple = this->GetKernelTuple<N>(vecs...); \
      auto executor_info = std::apply( \
        [&op_type, &hash_id](const auto &... args) { return GEN_EXECUTOR_BOOST(op_type, hash_id, args...); }, \
        res_tuple); \
      return executor_info; \
    } \
 \
    void RunOp(void *stream_ptr, const std::vector<KernelTensor *> &workspace) { \
      if (workspace_size_list_.empty()) { \
        RUN_OP_API_ASYNC(op_type_, nullptr, 0, executor_, stream_ptr, release_func_); \
      } else { \
        if (is_dynamic_) { \
          void *device_addr = \
            device::ascend::AscendMemoryPool::GetInstance().AllocTensorMem(workspace_size_list_[0]); \
          device::tracker::CALL_MEMORY_TRACKER_WITH_FILE( \
            UpdateDevicePtrInfo, device_addr, device::tracker::MemType::kWorkSpace, "AclnnWorkspace_" + op_type_); \
          RUN_OP_API_ASYNC(op_type_, device_addr, workspace_size_list_[0], executor_, stream_ptr, release_func_); \
          device::ascend::AscendMemoryPool::GetInstance().FreeTensorMem(device_addr); \
        } else { \
          if (workspace.empty()) { \
            MS_LOG(EXCEPTION) << "Failed to allocate workspace tensor!"; \
          } \
          auto workspace_tensor = workspace[0]; \
          if (workspace_tensor->size() != workspace_size_list_[0]) { \
            MS_LOG(EXCEPTION) << "Please check 'GetWorkSpaceInfo' and 'Launch' func. Expected workspace size is " \
                              << workspace_size_list_[0] << ", but got " << workspace_tensor->size(); \
          } \
          RUN_OP_API_ASYNC(op_type_, workspace_tensor->device_ptr(), workspace_size_list_[0], executor_, \
                           stream_ptr, release_func_); \
        } \
      } \
    } \
  };

#define MS_ACLNN_KERNEL_FACTORY_REG(NAME, DERIVE_CLASS) MS_KERNEL_FACTORY_REG(AclnnKernelMod, NAME, DERIVE_CLASS)
#define MS_ACLNN_COMMON_KERNEL_FACTORY_REG(NAME, TYPE, N) \
  REGISTER_ACLNN_CLASS(NAME) \
  static const KernelRegistrar<AclnnKernelMod> g_##NAME##_AclnnKernelMod_reg( \
    #NAME, []() { return std::make_shared<Aclnn##NAME##KernelMod<N>>(#TYPE); });
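
// Illustrative registrations only ("Foo", "FooAclnn", "aclnnFoo" and the arity 3 are hypothetical):
//   MS_ACLNN_KERNEL_FACTORY_REG(Foo, FooAclnn);            // hand-written kernel mod
//   MS_ACLNN_COMMON_KERNEL_FACTORY_REG(Foo, aclnnFoo, 3);  // generated kernel, 2 inputs + 1 output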
}  // namespace kernel
}  // namespace mindspore

#endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_ACLNN_KERNEL_MOD_H_