diff --git a/cmake/package_lite.cmake b/cmake/package_lite.cmake
index 2254c2a7..f15724f1 100644
--- a/cmake/package_lite.cmake
+++ b/cmake/package_lite.cmake
@@ -474,7 +474,7 @@ if(PLATFORM_ARM64)
                 COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h" PATTERN "ops*" EXCLUDE)
         install(DIRECTORY ${TOP_DIR}/include/c_api/ DESTINATION ${RUNTIME_INC_DIR}/c_api
                 COMPONENT ${RUNTIME_COMPONENT_NAME} FILES_MATCHING PATTERN "*.h")
-        if(ANDROID_NDK_TOOLCHAIN_INCLUDED OR MSLITE_ENABLE_CONVERTER OR TARGET_HIMIX)
+        if(ANDROID_NDK_TOOLCHAIN_INCLUDED OR MSLITE_ENABLE_CONVERTER OR TARGET_HIMIX OR TARGET_OHOS)
            __install_micro_wrapper()
        endif()
        if(MSLITE_ENABLE_RUNTIME_GLOG)
diff --git a/mindspore/ccsrc/backend/common/optimizer/pass.h b/mindspore/ccsrc/backend/common/optimizer/pass.h
new file mode 100644
index 00000000..8d396164
--- /dev/null
+++ b/mindspore/ccsrc/backend/common/optimizer/pass.h
@@ -0,0 +1,48 @@
+/**
+ * Copyright 2023 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_COMMON_PASS_H_
+#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_COMMON_PASS_H_
+#include <memory>
+#include <string>
+#include "ir/anf.h"
+#include "mindspore/core/ops/array_ops.h"
+#include "mindspore/core/ops/lite_ops.h"
+#include "utils/trace_base.h"
+
+namespace mindspore {
+namespace opt {
+class CacheManager;
+using CacheManagerPtr = std::shared_ptr<CacheManager>;
+
+// @brief ANF Graph level optimization base pass
+class Pass {
+ public:
+  explicit Pass(const std::string &name = "pass") : name_(name) {}
+  virtual ~Pass() = default;
+  virtual bool Run(const FuncGraphPtr &func_graph) = 0;
+  const std::string &name() const { return name_; }
+  void SetCacheManager(const CacheManagerPtr &cm) { cache_manager_ = cm; }
+  const CacheManagerPtr &GetCacheManager() const { return cache_manager_; }
+
+ private:
+  const std::string name_;
+  CacheManagerPtr cache_manager_;
+};
+using PassPtr = std::shared_ptr<Pass>;
+} // namespace opt
+} // namespace mindspore
+
+#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_COMMON_PASS_H_
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_cpu_kernel.cc b/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_cpu_kernel.cc
index 55bbddac..378ef00c 100644
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_cpu_kernel.cc
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_cpu_kernel.cc
@@ -60,6 +60,8 @@ bool LstmCpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vec
   hidden_size_ = kernel_ptr->get_hidden_size();
   num_layers_ = kernel_ptr->get_num_layers();
   has_bias_ = kernel_ptr->get_has_bias();
+  proj_size_ = kernel_ptr->get_proj_size();
+  real_hidden_size_ = proj_size_ > 0 ?
proj_size_ : hidden_size_; constexpr int kBidirectional = 2; num_directions_ = 1; if (bidirectional_) { @@ -73,14 +75,20 @@ bool LstmCpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vec MS_LOG(EXCEPTION) << "Layers must be lower than 100!"; } + weight_size_ = 0; + weight_h_size_ = 0; + weight_r_size_ = 0; for (int i = 0; i < num_layers_; ++i) { weight_size_ += gate_size * (i == 0 ? input_size_ : hidden_size_ * num_directions_); - weight_h_size_ += gate_size * hidden_size_; + weight_h_size_ += gate_size * real_hidden_size_; + weight_r_size_ += hidden_size_ * proj_size_; } weight_size_ = weight_size_ * num_directions_; weight_h_size_ = weight_h_size_ * num_directions_; + weight_r_size_ = weight_r_size_ * num_directions_; weights_dims_ = {num_layers_, num_directions_, input_size_, kGateNum, hidden_size_}; - weights_h_dims_ = {num_layers_, num_directions_, hidden_size_, kGateNum, hidden_size_}; + weights_h_dims_ = {num_layers_, num_directions_, real_hidden_size_, kGateNum, hidden_size_}; + weights_r_dims_ = {num_layers_, num_directions_, hidden_size_, proj_size_}; bias_dims_ = {num_layers_, num_directions_, kGateNum, hidden_size_}; is_training_ = base_operator->HasAttr(kAttrIsTraining) ? GetValue(base_operator->GetAttr(kAttrIsTraining)) : true; @@ -110,10 +118,10 @@ int LstmCpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::ve direction = dnnl::rnn_direction::bidirectional_concat; } dim src_dims = {seq_len_, batch_size_, input_size_}; - dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; + dim src_h_dims = {num_layers_, num_directions_, batch_size_, real_hidden_size_}; dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; - dim dst_dims = {seq_len_, batch_size_, static_cast(hidden_size_) * num_directions_}; - dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; + dim dst_dims = {seq_len_, batch_size_, real_hidden_size_ * num_directions_}; + dim dst_h_dims = {num_layers_, num_directions_, batch_size_, real_hidden_size_}; dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc); dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc); @@ -126,13 +134,16 @@ int LstmCpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::ve auto prop_kind = is_training_ ? dnnl::prop_kind::forward_training : dnnl::prop_kind::forward_inference; auto weights_desc = formatted_md(weights_dims_, tag::any); auto weights_h_desc = formatted_md(weights_h_dims_, tag::any); - auto desc = - CreatePrimitive(prop_kind, direction, src_desc, src_h_desc, src_c_desc, weights_desc, - weights_h_desc, bias_desc, dst_desc, dst_h_desc, dst_c_desc); + auto weights_r_desc = proj_size_ > 0 ? 
formatted_md(weights_r_dims_, tag::any) : dnnl::memory::desc(); + auto peephole_desc = dnnl::memory::desc(); + auto desc = CreatePrimitive(prop_kind, direction, src_desc, src_h_desc, src_c_desc, + weights_desc, weights_h_desc, peephole_desc, weights_r_desc, + bias_desc, dst_desc, dst_h_desc, dst_c_desc); prim_desc_ = CreateDesc(*desc, engine_); primitive_ = CreatePrimitive(prim_desc_); auto weights_layer = GetWeightsLayerDesc(prim_desc_); auto weights_iter = GetWeightsIterDesc(prim_desc_); + auto weights_proj = GetWeightsProjectionDesc(prim_desc_); bias_desc_ = GetBiasDesc(prim_desc_); if (is_training_) { auto wksp_desc = GetWorkspaceDesc(prim_desc_); @@ -144,6 +155,7 @@ int LstmCpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::ve AddArgument(DNNL_ARG_SRC_ITER_C, src_c_desc); AddArgument(DNNL_ARG_WEIGHTS_LAYER, weights_layer); AddArgument(DNNL_ARG_WEIGHTS_ITER, weights_iter); + AddArgument(DNNL_ARG_WEIGHTS_PROJECTION, weights_proj); AddArgument(DNNL_ARG_BIAS, bias_desc); AddArgument(DNNL_ARG_DST_LAYER, dst_desc); AddArgument(DNNL_ARG_DST_ITER, dst_h_desc); @@ -151,10 +163,13 @@ int LstmCpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::ve auto weights_dims_desc = CreateDesc(weights_dims_, dt::f32, tag::ldgoi); auto weights_h_dims_desc = CreateDesc(weights_h_dims_, dt::f32, tag::ldgoi); + auto weights_r_dims_desc = CreateDesc(weights_r_dims_, dt::f32, tag::ldoi); user_weights_memory_ = CreateDesc(weights_dims_desc, engine_); user_weights_h_memory_ = CreateDesc(weights_h_dims_desc, engine_); + user_weights_r_memory_ = CreateDesc(weights_r_dims_desc, engine_); weights_memory_ = CreateDesc(weights_layer, engine_); weights_h_memory_ = CreateDesc(weights_iter, engine_); + weights_r_memory_ = CreateDesc(weights_proj, engine_); bias_memory_ = CreateDesc(bias_desc_, engine_); InitOutputSize(outputs); @@ -163,13 +178,20 @@ int LstmCpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::ve bool LstmCpuKernelMod::Launch(const std::vector &inputs, const std::vector &, const std::vector &outputs) { + size_t offset = 0; SetDataHandle(user_weights_memory_, inputs[kInputWeightIndex]->addr); - SetDataHandle(user_weights_h_memory_, reinterpret_cast(inputs[kInputWeightIndex]->addr) + weight_size_); + offset += weight_size_; + SetDataHandle(user_weights_h_memory_, reinterpret_cast(inputs[kInputWeightIndex]->addr) + offset); + offset += weight_h_size_; Reorder(&user_weights_memory_, &weights_memory_); Reorder(&user_weights_h_memory_, &weights_h_memory_); + if (proj_size_ > 0) { + SetDataHandle(user_weights_r_memory_, reinterpret_cast(inputs[kInputWeightIndex]->addr) + offset); + Reorder(&user_weights_r_memory_, &weights_r_memory_); + offset += weight_r_size_; + } if (has_bias_) { - SetDataHandle(bias_memory_, - reinterpret_cast(inputs[kInputWeightIndex]->addr) + weight_size_ + weight_h_size_); + SetDataHandle(bias_memory_, reinterpret_cast(inputs[kInputWeightIndex]->addr) + offset); } else { auto size = GetSize(bias_desc_); if (memset_s(GetDataHandle(bias_memory_), size, 0, size) != EOK) { @@ -182,6 +204,7 @@ bool LstmCpuKernelMod::Launch(const std::vector &inputs, con SetArgumentHandle(DNNL_ARG_SRC_ITER_C, inputs[kInputCIndex]->addr); SetArgumentHandle(DNNL_ARG_WEIGHTS_LAYER, GetDataHandle(weights_memory_)); SetArgumentHandle(DNNL_ARG_WEIGHTS_ITER, GetDataHandle(weights_h_memory_)); + SetArgumentHandle(DNNL_ARG_WEIGHTS_PROJECTION, GetDataHandle(weights_r_memory_)); SetArgumentHandle(DNNL_ARG_BIAS, GetDataHandle(bias_memory_)); 
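For reference, the Launch() change above reads every LSTM parameter out of one flat weight tensor with a single running offset: layer weights first, then recurrent weights, then (only when proj_size_ > 0) the projection weights, and finally the bias. A minimal standalone sketch of that layout, not taken from the patch, using hypothetical names that simply mirror the kernel members weight_size_, weight_h_size_ and weight_r_size_ (offsets are in elements of the weight tensor):

#include <cstdint>

// Hypothetical illustration of the flat weight buffer consumed by Launch(), in order.
struct LstmWeightLayout {
  int64_t w_ih_offset{0};    // layer weights, weight_size_ elements
  int64_t w_hh_offset{0};    // recurrent (iter) weights, weight_h_size_ elements
  int64_t w_proj_offset{0};  // projection weights, weight_r_size_ elements, only when proj_size_ > 0
  int64_t bias_offset{0};    // bias region, used only when has_bias_ is true
};

LstmWeightLayout ComputeLstmWeightLayout(int64_t weight_size, int64_t weight_h_size,
                                         int64_t weight_r_size, bool has_projection) {
  LstmWeightLayout layout;
  int64_t offset = 0;
  layout.w_ih_offset = offset;
  offset += weight_size;        // DNNL_ARG_WEIGHTS_LAYER starts at 0
  layout.w_hh_offset = offset;
  offset += weight_h_size;      // DNNL_ARG_WEIGHTS_ITER follows immediately
  if (has_projection) {
    layout.w_proj_offset = offset;
    offset += weight_r_size;    // DNNL_ARG_WEIGHTS_PROJECTION only when proj_size_ > 0
  }
  layout.bias_offset = offset;  // DNNL_ARG_BIAS (or zero-filled when has_bias_ is false)
  return layout;
}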
SetArgumentHandle(DNNL_ARG_DST_LAYER, outputs[0]->addr); SetArgumentHandle(DNNL_ARG_DST_ITER, outputs[1]->addr); diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_cpu_kernel.h b/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_cpu_kernel.h index 42609eed..a0241c16 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_cpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_cpu_kernel.h @@ -58,14 +58,17 @@ class LstmCpuKernelMod : public MKLCpuKernelMod { private: void InitOutputSize(const std::vector &outputs); - int weight_size_{0}; - int weight_h_size_{0}; - int input_size_{0}; - int hidden_size_{0}; - int num_layers_{0}; - int batch_size_{0}; - int seq_len_{0}; - int num_directions_{0}; + int64_t weight_size_{0}; + int64_t weight_h_size_{0}; + int64_t weight_r_size_{0}; + int64_t input_size_{0}; + int64_t hidden_size_{0}; + int64_t num_layers_{0}; + int64_t batch_size_{0}; + int64_t seq_len_{0}; + int64_t num_directions_{0}; + int64_t proj_size_{0}; + int64_t real_hidden_size_{0}; bool bidirectional_{false}; bool has_bias_{false}; bool is_training_{false}; @@ -73,13 +76,16 @@ class LstmCpuKernelMod : public MKLCpuKernelMod { dnnl::memory::dims weights_dims_; dnnl::memory::dims weights_h_dims_; + dnnl::memory::dims weights_r_dims_; dnnl::memory::dims bias_dims_; dnnl::lstm_forward::primitive_desc prim_desc_; dnnl::memory::desc bias_desc_; dnnl::memory user_weights_memory_; dnnl::memory user_weights_h_memory_; + dnnl::memory user_weights_r_memory_; dnnl::memory weights_memory_; dnnl::memory weights_h_memory_; + dnnl::memory weights_r_memory_; dnnl::memory bias_memory_; }; } // namespace kernel diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_grad_cpu_kernel.cc b/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_grad_cpu_kernel.cc index aa1f8b44..0b5d09c1 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_grad_cpu_kernel.cc +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_grad_cpu_kernel.cc @@ -62,6 +62,8 @@ bool LSTMGradCpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std: hidden_size_ = op_prim->get_hidden_size(); num_layers_ = op_prim->get_num_layers(); has_bias_ = op_prim->get_has_bias(); + proj_size_ = op_prim->get_proj_size(); + real_hidden_size_ = proj_size_ > 0 ? proj_size_ : hidden_size_; auto kernel_attr = GetKernelAttrFromTensors(inputs, outputs); auto match = MatchKernelAttr(kernel_attr, GetOpSupport()); if (!match.first) { @@ -103,12 +105,15 @@ int LSTMGradCpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std } weight_size_ = 0; weight_h_size_ = 0; + weight_r_size_ = 0; for (int64_t i = 0; i < num_layers_; ++i) { weight_size_ += gate_size * (i == 0 ? 
input_size_ : hidden_size_ * num_directions_); - weight_h_size_ += gate_size * hidden_size_; + weight_h_size_ += gate_size * real_hidden_size_; + weight_r_size_ += proj_size_ * hidden_size_; } weight_size_ = weight_size_ * num_directions_; weight_h_size_ = weight_h_size_ * num_directions_; + weight_r_size_ = weight_r_size_ * num_directions_; if (num_directions_ * num_layers_ != src_h_shape[0]) { MS_LOG(ERROR) << "Error iteration shape!"; return KRET_RESIZE_FAILED; @@ -124,13 +129,14 @@ void LSTMGradCpuKernelMod::InitDnnl() { direction = dnnl::rnn_direction::bidirectional_concat; } dim src_dims = {seq_len_, batch_size_, input_size_}; - dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; + dim src_h_dims = {num_layers_, num_directions_, batch_size_, real_hidden_size_}; dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; weights_dims_ = {num_layers_, num_directions_, input_size_, kNumberFour, hidden_size_}; - weights_h_dims_ = {num_layers_, num_directions_, hidden_size_, kNumberFour, hidden_size_}; + weights_h_dims_ = {num_layers_, num_directions_, real_hidden_size_, kNumberFour, hidden_size_}; + weights_r_dims_ = {num_layers_, num_directions_, hidden_size_, proj_size_}; bias_dims_ = {num_layers_, num_directions_, kNumberFour, hidden_size_}; - dim dst_dims = {seq_len_, batch_size_, static_cast(hidden_size_) * num_directions_}; - dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; + dim dst_dims = {seq_len_, batch_size_, real_hidden_size_ * num_directions_}; + dim dst_h_dims = {num_layers_, num_directions_, batch_size_, real_hidden_size_}; dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_}; dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc); dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc); @@ -141,15 +147,17 @@ void LSTMGradCpuKernelMod::InitDnnl() { dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc); auto weights_desc = formatted_md(weights_dims_, tag::any); auto weights_h_desc = formatted_md(weights_h_dims_, tag::any); + auto weights_r_desc = proj_size_ > 0 ? 
formatted_md(weights_r_dims_, tag::any) : dnnl::memory::desc(); + auto peepole_desc = dnnl::memory::desc(); - auto forward_desc = CreatePrimitive(dnnl::prop_kind::forward_training, direction, src_desc, - src_h_desc, src_c_desc, weights_desc, weights_h_desc, - bias_desc, dst_desc, dst_h_desc, dst_c_desc); + auto forward_desc = CreatePrimitive( + dnnl::prop_kind::forward_training, direction, src_desc, src_h_desc, src_c_desc, weights_desc, weights_h_desc, + peepole_desc, weights_r_desc, bias_desc, dst_desc, dst_h_desc, dst_c_desc); auto prim_forward_desc = CreateDesc(*forward_desc, eng); auto backward_desc = CreatePrimitive( - dnnl::prop_kind::backward, direction, src_desc, src_h_desc, src_c_desc, weights_desc, weights_h_desc, bias_desc, - dst_desc, dst_h_desc, dst_c_desc, src_desc, src_h_desc, src_c_desc, weights_desc, weights_h_desc, bias_desc, - dst_desc, dst_h_desc, dst_c_desc); + dnnl::prop_kind::backward, direction, src_desc, src_h_desc, src_c_desc, weights_desc, weights_h_desc, peepole_desc, + weights_r_desc, bias_desc, dst_desc, dst_h_desc, dst_c_desc, src_desc, src_h_desc, src_c_desc, weights_desc, + weights_h_desc, peepole_desc, weights_r_desc, bias_desc, dst_desc, dst_h_desc, dst_c_desc); prim_backward_desc_ = CreateDesc(*backward_desc, eng, prim_forward_desc); primitive_ = CreatePrimitive(prim_backward_desc_); auto wksp_desc = GetWorkspaceDesc(prim_forward_desc); @@ -159,24 +167,31 @@ void LSTMGradCpuKernelMod::InitDnnl() { // construct fw memory weights_layer_desc_ = GetWeightsLayerDesc(prim_backward_desc_); weights_iter_desc_ = GetWeightsIterDesc(prim_backward_desc_); + weights_proj_desc_ = GetWeightsProjectionDesc(prim_backward_desc_); bias_desc_ = GetBiasDesc(prim_backward_desc_); auto weights_mem_desc = CreateDesc(weights_dims_, dt::f32, tag::ldgoi); auto weights_h_mem_desc = CreateDesc(weights_h_dims_, dt::f32, tag::ldgoi); + auto weights_r_mem_desc = CreateDesc(weights_r_dims_, dt::f32, tag::ldoi); user_weights_memory_ = CreateDesc(weights_mem_desc, eng); user_weights_h_memory_ = CreateDesc(weights_h_mem_desc, eng); + user_weights_r_memory_ = CreateDesc(weights_r_mem_desc, eng); weights_memory_ = CreateDesc(weights_layer_desc_, eng); weights_h_memory_ = CreateDesc(weights_iter_desc_, eng); + weights_r_memory_ = CreateDesc(weights_proj_desc_, eng); bias_memory_ = CreateDesc(bias_desc_, eng); // construct bw memory diff_weights_layer_desc_ = GetDiffWeightsLayerDesc(prim_backward_desc_); diff_weights_iter_desc_ = GetDiffWeightsIterDesc(prim_backward_desc_); + diff_weights_proj_desc_ = GetDiffWeightsProjectionDesc(prim_backward_desc_); diff_bias_desc_ = GetDiffBiasDesc(prim_backward_desc_); diff_weights_memory_ = CreateDesc(diff_weights_layer_desc_, eng); diff_weights_h_memory_ = CreateDesc(diff_weights_iter_desc_, eng); + diff_weights_r_memory_ = CreateDesc(diff_weights_proj_desc_, eng); diff_bias_memory_ = CreateDesc(diff_bias_desc_, eng); user_diff_weights_memory_ = CreateDesc(weights_mem_desc, eng); user_diff_weights_h_memory_ = CreateDesc(weights_h_mem_desc, eng); + user_diff_weights_r_memory_ = CreateDesc(weights_r_mem_desc, eng); } void LSTMGradCpuKernelMod::AddArgumentOp(const dnnl::memory::desc &src_desc, const dnnl::memory::desc &src_h_desc, @@ -188,6 +203,7 @@ void LSTMGradCpuKernelMod::AddArgumentOp(const dnnl::memory::desc &src_desc, con AddArgument(DNNL_ARG_SRC_ITER_C, src_c_desc); AddArgument(DNNL_ARG_WEIGHTS_LAYER, weights_layer_desc_); AddArgument(DNNL_ARG_WEIGHTS_ITER, weights_iter_desc_); + AddArgument(DNNL_ARG_WEIGHTS_PROJECTION, weights_proj_desc_); 
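The backward kernel follows the same shape rule as the forward pass: once proj_size_ > 0, every tensor that carries the hidden state h (src_h, dst_h, and the input side of the recurrent weights) switches to real_hidden_size_, the cell state and the gate dimension keep hidden_size_, and a new {layers, directions, hidden, proj} projection weight appears. A small sketch of that bookkeeping, with plain std::vector<int64_t> standing in for dnnl::memory::dims and a hypothetical MakeProjectedLstmDims helper that is not part of the patch:

#include <cstdint>
#include <vector>

using Dims = std::vector<int64_t>;  // stand-in for dnnl::memory::dims

struct ProjectedLstmDims {
  Dims weights_h;  // {layers, directions, real_hidden, kGateNum, hidden}
  Dims weights_r;  // {layers, directions, hidden, proj}
  Dims dst_h;      // {layers, directions, batch, real_hidden}
  Dims dst_c;      // {layers, directions, batch, hidden}  (cell state keeps hidden_size)
};

// Hypothetical helper: shows which dimensions switch to real_hidden_size_ with projection.
ProjectedLstmDims MakeProjectedLstmDims(int64_t layers, int64_t directions, int64_t batch,
                                        int64_t hidden, int64_t proj) {
  const int64_t kGateNum = 4;                            // i, f, g, o gates
  const int64_t real_hidden = proj > 0 ? proj : hidden;  // h_t is proj-sized when proj_size_ > 0
  return {{layers, directions, real_hidden, kGateNum, hidden},
          {layers, directions, hidden, proj},
          {layers, directions, batch, real_hidden},
          {layers, directions, batch, hidden}};
}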
AddArgument(DNNL_ARG_BIAS, bias_desc); AddArgument(DNNL_ARG_DST_LAYER, dst_desc); AddArgument(DNNL_ARG_DST_ITER, dst_h_desc); @@ -197,6 +213,7 @@ void LSTMGradCpuKernelMod::AddArgumentOp(const dnnl::memory::desc &src_desc, con AddArgument(DNNL_ARG_DIFF_SRC_ITER_C, src_c_desc); AddArgument(DNNL_ARG_DIFF_WEIGHTS_LAYER, diff_weights_layer_desc_); AddArgument(DNNL_ARG_DIFF_WEIGHTS_ITER, diff_weights_iter_desc_); + AddArgument(DNNL_ARG_DIFF_WEIGHTS_PROJECTION, diff_weights_proj_desc_); AddArgument(DNNL_ARG_DIFF_BIAS, bias_desc); AddArgument(DNNL_ARG_DIFF_DST_LAYER, dst_desc); AddArgument(DNNL_ARG_DIFF_DST_ITER, dst_h_desc); @@ -211,6 +228,7 @@ void LSTMGradCpuKernelMod::SetArgumentHandleOp(const std::vectoraddr); SetArgumentHandle(DNNL_ARG_WEIGHTS_LAYER, GetDataHandle(weights_memory_)); SetArgumentHandle(DNNL_ARG_WEIGHTS_ITER, GetDataHandle(weights_h_memory_)); + SetArgumentHandle(DNNL_ARG_WEIGHTS_PROJECTION, GetDataHandle(weights_r_memory_)); SetArgumentHandle(DNNL_ARG_BIAS, GetDataHandle(bias_memory_)); SetArgumentHandle(DNNL_ARG_DST_LAYER, inputs[kDstLayerIdx]->addr); SetArgumentHandle(DNNL_ARG_DST_ITER, inputs[kDstIterIdx]->addr); @@ -221,6 +239,7 @@ void LSTMGradCpuKernelMod::SetArgumentHandleOp(const std::vectoraddr); SetArgumentHandle(DNNL_ARG_DIFF_WEIGHTS_LAYER, GetDataHandle(diff_weights_memory_)); SetArgumentHandle(DNNL_ARG_DIFF_WEIGHTS_ITER, GetDataHandle(diff_weights_h_memory_)); + SetArgumentHandle(DNNL_ARG_DIFF_WEIGHTS_PROJECTION, GetDataHandle(diff_weights_r_memory_)); SetArgumentHandle(DNNL_ARG_DIFF_BIAS, GetDataHandle(diff_bias_memory_)); SetArgumentHandle(DNNL_ARG_DIFF_DST_LAYER, inputs[kDiffDstLayerIdx]->addr); SetArgumentHandle(DNNL_ARG_DIFF_DST_ITER, inputs[kDiffDstIterIdx]->addr); @@ -241,13 +260,20 @@ bool LSTMGradCpuKernelMod::Launch(const std::vector &inputs, const std::vector &outputs) { CHECK_KERNEL_INPUTS_NUM(inputs.size(), kLstmGradInputsNum, kernel_name_); CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kLstmGradOutputsNum, kernel_name_); + size_t offset = 0; SetDataHandle(user_weights_memory_, inputs[kInputWeightIndex]->addr); - SetDataHandle(user_weights_h_memory_, reinterpret_cast(inputs[kInputWeightIndex]->addr) + weight_size_); + offset += weight_size_; + SetDataHandle(user_weights_h_memory_, reinterpret_cast(inputs[kInputWeightIndex]->addr) + offset); + offset += weight_h_size_; Reorder(&user_weights_memory_, &weights_memory_); Reorder(&user_weights_h_memory_, &weights_h_memory_); + if (proj_size_ > 0) { + SetDataHandle(user_weights_r_memory_, reinterpret_cast(inputs[kInputWeightIndex]->addr) + offset); + Reorder(&user_weights_r_memory_, &weights_r_memory_); + offset += weight_r_size_; + } if (has_bias_) { - SetDataHandle(bias_memory_, - reinterpret_cast(inputs[kInputWeightIndex]->addr) + weight_size_ + weight_h_size_); + SetDataHandle(bias_memory_, reinterpret_cast(inputs[kInputWeightIndex]->addr) + offset); } else { auto dst_ptr = GetDataHandle(bias_memory_); auto size = GetSize(bias_desc_); @@ -256,16 +282,23 @@ bool LSTMGradCpuKernelMod::Launch(const std::vector &inputs, } } + offset = 0; SetDataHandle(user_diff_weights_memory_, outputs[kOutputWeightIndex]->addr); - SetDataHandle(user_diff_weights_h_memory_, - reinterpret_cast(outputs[kOutputWeightIndex]->addr) + weight_size_); + offset += weight_size_; + SetDataHandle(user_diff_weights_h_memory_, reinterpret_cast(outputs[kOutputWeightIndex]->addr) + offset); + offset += weight_h_size_; ResetMemory(user_diff_weights_memory_, "user weights grad"); ResetMemory(user_diff_weights_h_memory_, "user weights iter 
grad"); ResetMemory(diff_weights_memory_, "weights grad"); ResetMemory(diff_weights_h_memory_, "weights iter grad"); + if (proj_size_ > 0) { + SetDataHandle(user_diff_weights_r_memory_, reinterpret_cast(outputs[kOutputWeightIndex]->addr) + offset); + ResetMemory(user_diff_weights_r_memory_, "user weights projection grad"); + ResetMemory(diff_weights_r_memory_, "weights projection grad"); + offset += weight_r_size_; + } if (has_bias_) { - SetDataHandle(diff_bias_memory_, - reinterpret_cast(outputs[kOutputWeightIndex]->addr) + weight_size_ + weight_h_size_); + SetDataHandle(diff_bias_memory_, reinterpret_cast(outputs[kOutputWeightIndex]->addr) + offset); } auto dst_ptr = GetDataHandle(diff_bias_memory_); auto size = GetSize(diff_bias_desc_); @@ -276,6 +309,9 @@ bool LSTMGradCpuKernelMod::Launch(const std::vector &inputs, ExecutePrimitive(); Reorder(&diff_weights_memory_, &user_diff_weights_memory_); Reorder(&diff_weights_h_memory_, &user_diff_weights_h_memory_); + if (proj_size_ > 0) { + Reorder(&diff_weights_r_memory_, &user_diff_weights_r_memory_); + } return true; } diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_grad_cpu_kernel.h b/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_grad_cpu_kernel.h index f47bafc0..9768464d 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_grad_cpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/lstm_grad_cpu_kernel.h @@ -75,34 +75,44 @@ class LSTMGradCpuKernelMod : public MKLCpuKernelMod { bool has_bias_{false}; int64_t weight_size_{0}; int64_t weight_h_size_{0}; + int64_t weight_r_size_{0}; int64_t input_size_{0}; int64_t hidden_size_{0}; int64_t num_layers_{0}; int64_t batch_size_{0}; int64_t seq_len_{0}; + int64_t proj_size_{0}; + int64_t real_hidden_size_{0}; size_t reserve_size_{0}; dnnl::memory::dims weights_dims_; dnnl::memory::dims weights_h_dims_; + dnnl::memory::dims weights_r_dims_; dnnl::memory::dims bias_dims_; dnnl::lstm_backward::primitive_desc prim_backward_desc_; dnnl::memory::desc weights_layer_desc_; dnnl::memory::desc weights_iter_desc_; + dnnl::memory::desc weights_proj_desc_; dnnl::memory::desc bias_desc_; dnnl::memory::desc diff_weights_layer_desc_; dnnl::memory::desc diff_weights_iter_desc_; + dnnl::memory::desc diff_weights_proj_desc_; dnnl::memory::desc diff_bias_desc_; dnnl::memory user_weights_memory_; dnnl::memory user_weights_h_memory_; + dnnl::memory user_weights_r_memory_; dnnl::memory weights_memory_; dnnl::memory weights_h_memory_; + dnnl::memory weights_r_memory_; dnnl::memory bias_memory_; dnnl::memory diff_weights_memory_; dnnl::memory diff_weights_h_memory_; + dnnl::memory diff_weights_r_memory_; dnnl::memory diff_bias_memory_; dnnl::memory user_diff_weights_memory_; dnnl::memory user_diff_weights_h_memory_; + dnnl::memory user_diff_weights_r_memory_; }; } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/mkl_cpu_kernel.h b/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/mkl_cpu_kernel.h index 7c8292df..0c98f8f6 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/mkl_cpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/mkldnn/mkl_cpu_kernel.h @@ -89,6 +89,14 @@ auto GetWeightsIterDesc(const T &prim_desc) { return desc; } +template +auto GetWeightsProjectionDesc(const T &prim_desc) { + MS_LOG(DEBUG) << "begin to invoke " << demangle(typeid(T).name()) << "::weights_projection_desc()"; + auto desc = prim_desc.weights_projection_desc(); + MS_LOG(DEBUG) << "end to invoke " << 
demangle(typeid(T).name()) << "::weights_projection_desc()"; + return desc; +} + template auto GetBiasDesc(const T &prim_desc) { MS_LOG(DEBUG) << "begin to invoke " << demangle(typeid(T).name()) << "::bias_desc()"; @@ -113,6 +121,14 @@ auto GetDiffWeightsIterDesc(const T &prim_desc) { return desc; } +template +auto GetDiffWeightsProjectionDesc(const T &prim_desc) { + MS_LOG(DEBUG) << "begin to invoke " << demangle(typeid(T).name()) << "::diff_weights_projection_desc()"; + auto desc = prim_desc.diff_weights_projection_desc(); + MS_LOG(DEBUG) << "end to invoke " << demangle(typeid(T).name()) << "::diff_weights_projection_desc()"; + return desc; +} + template auto GetDiffBiasDesc(const T &prim_desc) { MS_LOG(DEBUG) << "begin to invoke " << demangle(typeid(T).name()) << "::diff_bias_desc()"; diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/BUILD.gn b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/BUILD.gn index 103e53b7..d27817be 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/BUILD.gn +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/BUILD.gn @@ -501,6 +501,7 @@ infer_shape_sources = [ "infer/custom_masked_fill_infer.c", "infer/custom_is_inf_infer.c", "infer/custom_tensor_scatter_max_infer.c", + "infer/custom_gather_d_grad_v2_infer.c", "infer/decoder_layer_infer.c", "infer/deconv2d_infer.c", "infer/depth_to_space_infer.c", @@ -740,6 +741,7 @@ arm64_fp16_assembly_sources = [ "assembly/fp16/Matmul12X16Fp16.S", "assembly/fp16/MatmulBaseFp16Neon.S", "assembly/fp16/MatmulFp16Opt.S", + "assembly/fp16/MatmulFp16OptV2.S", "assembly/fp16/MatmulFp16.S", "assembly/fp16/MatmulWinogradFp16.S", "assembly/fp16/MatVecMulFp16.S", diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16OptV2.S b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16OptV2.S new file mode 100644 index 00000000..2d901a3d --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/assembly/fp16/MatmulFp16OptV2.S @@ -0,0 +1,2966 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifdef ENABLE_ARM64 +#include "nnacl/assembly_global.h" + +.text +.align 5 + +// void MatmulFp16OptV2(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, +// size_t depth, size_t row, size_t col, size_t stride, size_t writeMode) +// x0: a +// x1: b +// x2: c +// x3: bias +// x4: act_type +// x5: depth +// x6: row +// x7: col +// x8: stride +// x9: writeMode + +asm_function MatmulFp16OptV2 + sub sp, sp, #192 + st1 {v8.8h, v9.8h, v10.8h, v11.8h}, [sp], #64 + st1 {v12.8h, v13.8h, v14.8h, v15.8h}, [sp], #64 + stp x19, x20, [sp], #16 + stp x21, x22, [sp], #16 + stp x23, x24, [sp], #16 + stp x29, x30, [sp], #16 + + ldr x8, [sp] + ldr x9, [sp, #8] // writeMode + lsl x8, x8, #1 // stride * sizeof(float16_t) + + lsl x15, x7, #1 // col * sizeof(float16_t) + lsl x16, x5, #1 // depth * sizeof(float16_t) + mov x11, x2 + movi v7.8h, #0x46, lsl #8 + subs x6, x6, #12 + blt LoopRow8 +LoopRow12: + mov x11, x1 // reload matrixB + mov x12, x3 // reload bias + mov x13, x7 // reload col + mov x21, x2 // relocate output + subs x13, x13, #16 + blt LoopCol12x8 + LoopCol12x16: + mov x10, x0 // update matrixA + ld1 {v0.8h}, [x10], #16 + mov x14, x5 // reload depth + prfm pldl1strm, [x11, #632] + ld1 {v3.8h}, [x11], #16 + cbnz x12, InitFromBias12x16 + dup v8.2d, xzr + dup v9.2d, xzr + dup v10.2d, xzr + dup v11.2d, xzr + dup v12.2d, xzr + dup v13.2d, xzr + dup v14.2d, xzr + dup v15.2d, xzr + dup v16.2d, xzr + dup v17.2d, xzr + dup v18.2d, xzr + dup v19.2d, xzr + dup v20.2d, xzr + dup v21.2d, xzr + dup v22.2d, xzr + dup v23.2d, xzr + dup v24.2d, xzr + dup v25.2d, xzr + dup v26.2d, xzr + dup v27.2d, xzr + dup v28.2d, xzr + dup v29.2d, xzr + dup v30.2d, xzr + dup v31.2d, xzr + b Compute12x16Enter + InitFromBias12x16: + ld1 {v8.8h, v9.8h}, [x12] + ld1 {v10.8h, v11.8h}, [x12] + ld1 {v12.8h, v13.8h}, [x12] + ld1 {v14.8h, v15.8h}, [x12] + ld1 {v16.8h, v17.8h}, [x12] + ld1 {v18.8h, v19.8h}, [x12] + ld1 {v20.8h, v21.8h}, [x12] + ld1 {v22.8h, v23.8h}, [x12] + ld1 {v24.8h, v25.8h}, [x12] + ld1 {v26.8h, v27.8h}, [x12] + ld1 {v28.8h, v29.8h}, [x12] + ld1 {v30.8h, v31.8h}, [x12] + add x12, x12, #32 + Compute12x16Enter: + bl Compute12x16Unit + Activation12x16: + cmp x4, #3 + beq Relu612x16 + cmp x4, #1 + beq Relu12x16 + b Write12x16 + + Relu612x16: + fmin v8.8h, v8.8h, v7.8h + fmin v9.8h, v9.8h, v7.8h + fmin v10.8h, v10.8h, v7.8h + fmin v11.8h, v11.8h, v7.8h + fmin v12.8h, v12.8h, v7.8h + fmin v13.8h, v13.8h, v7.8h + fmin v14.8h, v14.8h, v7.8h + fmin v15.8h, v15.8h, v7.8h + fmin v16.8h, v16.8h, v7.8h + fmin v17.8h, v17.8h, v7.8h + fmin v18.8h, v18.8h, v7.8h + fmin v19.8h, v19.8h, v7.8h + fmin v20.8h, v20.8h, v7.8h + fmin v21.8h, v21.8h, v7.8h + fmin v22.8h, v22.8h, v7.8h + fmin v23.8h, v23.8h, v7.8h + fmin v24.8h, v24.8h, v7.8h + fmin v25.8h, v25.8h, v7.8h + fmin v26.8h, v26.8h, v7.8h + fmin v27.8h, v27.8h, v7.8h + fmin v28.8h, v28.8h, v7.8h + fmin v29.8h, v29.8h, v7.8h + fmin v30.8h, v30.8h, v7.8h + fmin v31.8h, v31.8h, v7.8h + + Relu12x16: + dup v6.8h, wzr + fmax v8.8h, v8.8h, v6.8h + fmax v9.8h, v9.8h, v6.8h + fmax v10.8h, v10.8h, v6.8h + fmax v11.8h, v11.8h, v6.8h + fmax v12.8h, v12.8h, v6.8h + fmax v13.8h, v13.8h, v6.8h + fmax v14.8h, v14.8h, v6.8h + fmax v15.8h, v15.8h, v6.8h + fmax v16.8h, v16.8h, v6.8h + fmax v17.8h, v17.8h, v6.8h + fmax v18.8h, v18.8h, v6.8h + fmax v19.8h, v19.8h, v6.8h + fmax v20.8h, v20.8h, v6.8h + fmax v21.8h, v21.8h, v6.8h + fmax v22.8h, v22.8h, v6.8h + fmax v23.8h, v23.8h, v6.8h + fmax v24.8h, v24.8h, v6.8h + fmax v25.8h, v25.8h, v6.8h + fmax v26.8h, v26.8h, 
v6.8h + fmax v27.8h, v27.8h, v6.8h + fmax v28.8h, v28.8h, v6.8h + fmax v29.8h, v29.8h, v6.8h + fmax v30.8h, v30.8h, v6.8h + fmax v31.8h, v31.8h, v6.8h + Write12x16: + mov x22, x21 + add x23, x21, x8, lsl #2 + add x24, x21, x8, lsl #3 + st1 {v8.8h, v9.8h}, [x22], x8 + st1 {v10.8h, v11.8h}, [x22], x8 + st1 {v12.8h, v13.8h}, [x22], x8 + st1 {v14.8h, v15.8h}, [x22] + st1 {v16.8h, v17.8h}, [x23], x8 + st1 {v18.8h, v19.8h}, [x23], x8 + st1 {v20.8h, v21.8h}, [x23], x8 + st1 {v22.8h, v23.8h}, [x23] + st1 {v24.8h, v25.8h}, [x24], x8 + st1 {v26.8h, v27.8h}, [x24], x8 + st1 {v28.8h, v29.8h}, [x24], x8 + st1 {v30.8h, v31.8h}, [x24] + add x21, x21, #32 + subs x13, x13, #16 + bge LoopCol12x16 + + LoopCol12x8: + adds x13, x13, #16 + cbz x13, LoopRow12End + subs x13, x13, #8 + blt LoopCol12x4 + mov x10, x0 // update matrixA + ld1 {v0.8h}, [x10], #16 + mov x14, x5 // reload depth + prfm pldl1strm, [x11, #632] + ld1 {v3.8h}, [x11], #16 + cbnz x12, InitFromBias12x8 + dup v8.2d, xzr + dup v10.2d, xzr + dup v12.2d, xzr + dup v14.2d, xzr + dup v16.2d, xzr + dup v18.2d, xzr + dup v20.2d, xzr + dup v22.2d, xzr + dup v24.2d, xzr + dup v26.2d, xzr + dup v28.2d, xzr + dup v30.2d, xzr + b Compute12x8Enter + InitFromBias12x8: + ld1 {v8.8h}, [x12] + ld1 {v10.8h}, [x12] + ld1 {v12.8h}, [x12] + ld1 {v14.8h}, [x12] + ld1 {v16.8h}, [x12] + ld1 {v18.8h}, [x12] + ld1 {v20.8h}, [x12] + ld1 {v22.8h}, [x12] + ld1 {v24.8h}, [x12] + ld1 {v26.8h}, [x12] + ld1 {v28.8h}, [x12] + ld1 {v30.8h}, [x12] + add x12, x12, #16 + Compute12x8Enter: + bl Compute12x8Unit + Activation12x8: + cmp x4, #3 + beq Relu612x8 + cmp x4, #1 + beq Relu12x8 + b Write12x8 + + Relu612x8: + fmin v8.8h, v8.8h, v7.8h + fmin v10.8h, v10.8h, v7.8h + fmin v12.8h, v12.8h, v7.8h + fmin v14.8h, v14.8h, v7.8h + fmin v16.8h, v16.8h, v7.8h + fmin v18.8h, v18.8h, v7.8h + fmin v20.8h, v20.8h, v7.8h + fmin v22.8h, v22.8h, v7.8h + fmin v24.8h, v24.8h, v7.8h + fmin v26.8h, v26.8h, v7.8h + fmin v28.8h, v28.8h, v7.8h + fmin v30.8h, v30.8h, v7.8h + + Relu12x8: + dup v6.8h, wzr + fmax v8.8h, v8.8h, v6.8h + fmax v10.8h, v10.8h, v6.8h + fmax v12.8h, v12.8h, v6.8h + fmax v14.8h, v14.8h, v6.8h + fmax v16.8h, v16.8h, v6.8h + fmax v18.8h, v18.8h, v6.8h + fmax v20.8h, v20.8h, v6.8h + fmax v22.8h, v22.8h, v6.8h + fmax v24.8h, v24.8h, v6.8h + fmax v26.8h, v26.8h, v6.8h + fmax v28.8h, v28.8h, v6.8h + fmax v30.8h, v30.8h, v6.8h + Write12x8: + mov x22, x21 + add x23, x21, x8, lsl #2 + add x24, x21, x8, lsl #3 + st1 {v8.8h}, [x22], x8 + st1 {v10.8h}, [x22], x8 + st1 {v12.8h}, [x22], x8 + st1 {v14.8h}, [x22] + st1 {v16.8h}, [x23], x8 + st1 {v18.8h}, [x23], x8 + st1 {v20.8h}, [x23], x8 + st1 {v22.8h}, [x23] + st1 {v24.8h}, [x24], x8 + st1 {v26.8h}, [x24], x8 + st1 {v28.8h}, [x24], x8 + st1 {v30.8h}, [x24] + add x21, x21, #16 + subs x13, x13, #8 + + LoopCol12x4: + adds x13, x13, #8 + cbz x13, LoopRow12End + LoopCol12x4Core: + mov x10, x0 // update matrixA + ld1 {v0.8h}, [x10], #16 + mov x14, x5 // reload depth + prfm pldl1strm, [x11, #632] + ld1 {v3.4h}, [x11], #8 + cbnz x12, InitFromBias12x4 + dup v8.2s, wzr + dup v10.2s, wzr + dup v12.2s, wzr + dup v14.2s, wzr + dup v16.2s, wzr + dup v18.2s, wzr + dup v20.2s, wzr + dup v22.2s, wzr + dup v24.2s, wzr + dup v26.2s, wzr + dup v28.2s, wzr + dup v30.2s, wzr + b Compute12x4Enter + InitFromBias12x4: + ld1 {v8.4h}, [x12] + ld1 {v10.4h}, [x12] + ld1 {v12.4h}, [x12] + ld1 {v14.4h}, [x12] + ld1 {v16.4h}, [x12] + ld1 {v18.4h}, [x12] + ld1 {v20.4h}, [x12] + ld1 {v22.4h}, [x12] + ld1 {v24.4h}, [x12] + ld1 {v26.4h}, [x12] + ld1 {v28.4h}, [x12] + ld1 {v30.4h}, 
[x12] + add x12, x12, #8 + Compute12x4Enter: + bl Compute12x4Unit + Activation12x4: + cmp x4, #3 + beq Relu612x4 + cmp x4, #1 + beq Relu12x4 + b Write12x4 + + Relu612x4: + fmin v8.4h, v8.4h, v7.4h + fmin v10.4h, v10.4h, v7.4h + fmin v12.4h, v12.4h, v7.4h + fmin v14.4h, v14.4h, v7.4h + fmin v16.4h, v16.4h, v7.4h + fmin v18.4h, v18.4h, v7.4h + fmin v20.4h, v20.4h, v7.4h + fmin v22.4h, v22.4h, v7.4h + fmin v24.4h, v24.4h, v7.4h + fmin v26.4h, v26.4h, v7.4h + fmin v28.4h, v28.4h, v7.4h + fmin v30.4h, v30.4h, v7.4h + + Relu12x4: + dup v6.4h, wzr + fmax v8.4h, v8.4h, v6.4h + fmax v10.4h, v10.4h, v6.4h + fmax v12.4h, v12.4h, v6.4h + fmax v14.4h, v14.4h, v6.4h + fmax v16.4h, v16.4h, v6.4h + fmax v18.4h, v18.4h, v6.4h + fmax v20.4h, v20.4h, v6.4h + fmax v22.4h, v22.4h, v6.4h + fmax v24.4h, v24.4h, v6.4h + fmax v26.4h, v26.4h, v6.4h + fmax v28.4h, v28.4h, v6.4h + fmax v30.4h, v30.4h, v6.4h + Write12x4: + mov x22, x21 + add x23, x21, x8, lsl #2 + add x24, x21, x8, lsl #3 + cmp x13, #1 + beq Write12x1 + cmp x13, #2 + beq Write12x2 + cmp x13, #3 + beq Write12x3 + st1 {v8.4h}, [x22], x8 + st1 {v10.4h}, [x22], x8 + st1 {v12.4h}, [x22], x8 + st1 {v14.4h}, [x22] + st1 {v16.4h}, [x23], x8 + st1 {v18.4h}, [x23], x8 + st1 {v20.4h}, [x23], x8 + st1 {v22.4h}, [x23] + st1 {v24.4h}, [x24], x8 + st1 {v26.4h}, [x24], x8 + st1 {v28.4h}, [x24], x8 + st1 {v30.4h}, [x24] + add x21, x21, #8 + subs x13, x13, #4 + bgt LoopCol12x4Core + b LoopRow12End + Write12x1: + st1 {v8.h}[0], [x22], x8 + st1 {v10.h}[0], [x22], x8 + st1 {v12.h}[0], [x22], x8 + st1 {v14.h}[0], [x22] + st1 {v16.h}[0], [x23], x8 + st1 {v18.h}[0], [x23], x8 + st1 {v20.h}[0], [x23], x8 + st1 {v22.h}[0], [x23] + st1 {v24.h}[0], [x24], x8 + st1 {v26.h}[0], [x24], x8 + st1 {v28.h}[0], [x24], x8 + st1 {v30.h}[0], [x24] + b LoopRow12End + Write12x2: + st1 {v8.s}[0], [x22], x8 + st1 {v10.s}[0], [x22], x8 + st1 {v12.s}[0], [x22], x8 + st1 {v14.s}[0], [x22] + st1 {v16.s}[0], [x23], x8 + st1 {v18.s}[0], [x23], x8 + st1 {v20.s}[0], [x23], x8 + st1 {v22.s}[0], [x23] + st1 {v24.s}[0], [x24], x8 + st1 {v26.s}[0], [x24], x8 + st1 {v28.s}[0], [x24], x8 + st1 {v30.s}[0], [x24] + b LoopRow12End + Write12x3: + add x23, x22, #4 + st1 {v8.s}[0], [x22], x8 + st1 {v8.h}[2], [x23], x8 + st1 {v10.s}[0], [x22], x8 + st1 {v10.h}[2], [x23], x8 + st1 {v12.s}[0], [x22], x8 + st1 {v12.h}[2], [x23], x8 + st1 {v14.s}[0], [x22], x8 + st1 {v14.h}[2], [x23], x8 + st1 {v16.s}[0], [x22], x8 + st1 {v16.h}[2], [x23], x8 + st1 {v18.s}[0], [x22], x8 + st1 {v18.h}[2], [x23], x8 + st1 {v20.s}[0], [x22], x8 + st1 {v20.h}[2], [x23], x8 + st1 {v22.s}[0], [x22], x8 + st1 {v22.h}[2], [x23], x8 + st1 {v24.s}[0], [x22], x8 + st1 {v24.h}[2], [x23], x8 + st1 {v26.s}[0], [x22], x8 + st1 {v26.h}[2], [x23], x8 + st1 {v28.s}[0], [x22], x8 + st1 {v28.h}[2], [x23], x8 + st1 {v30.s}[0], [x22] + st1 {v30.h}[2], [x23] + LoopRow12End: + add x0, x0, x16, lsl #3 + add x0, x0, x16, lsl #2 + add x2, x2, x8, lsl #3 + add x2, x2, x8, lsl #2 + subs x6, x6, #12 + bge LoopRow12 + +LoopRow8: + adds x6, x6,#12 + cbz x6, End + subs x6, x6, #8 + blt LoopRow4 + mov x11, x1 // reload matrixB + mov x12, x3 // reload bias + mov x13, x7 // reload col + mov x21, x2 // relocate output + subs x13, x13, #16 + blt LoopCol8x8 + LoopCol8x16: + mov x10, x0 // update matrixA + ld1 {v0.8h}, [x10], #16 + mov x14, x5 // reload depth + prfm pldl1strm, [x11, #632] + ld1 {v3.8h}, [x11], #16 + cbnz x12, InitFromBias8x16 + dup v8.2d, xzr + dup v9.2d, xzr + dup v10.2d, xzr + dup v11.2d, xzr + dup v12.2d, xzr + dup v13.2d, xzr + dup v14.2d, xzr + dup 
v15.2d, xzr + dup v16.2d, xzr + dup v17.2d, xzr + dup v18.2d, xzr + dup v19.2d, xzr + dup v20.2d, xzr + dup v21.2d, xzr + dup v22.2d, xzr + dup v23.2d, xzr + b Compute8x16Enter + InitFromBias8x16: + ld1 {v8.8h, v9.8h}, [x12] + ld1 {v10.8h, v11.8h}, [x12] + ld1 {v12.8h, v13.8h}, [x12] + ld1 {v14.8h, v15.8h}, [x12] + ld1 {v16.8h, v17.8h}, [x12] + ld1 {v18.8h, v19.8h}, [x12] + ld1 {v20.8h, v21.8h}, [x12] + ld1 {v22.8h, v23.8h}, [x12] + add x12, x12, #32 + Compute8x16Enter: + bl Compute8x16Unit + Activation8x16: + cmp x4, #3 + beq Relu68x16 + cmp x4, #1 + beq Relu8x16 + b Write8x16 + + Relu68x16: + fmin v8.8h, v8.8h, v7.8h + fmin v9.8h, v9.8h, v7.8h + fmin v10.8h, v10.8h, v7.8h + fmin v11.8h, v11.8h, v7.8h + fmin v12.8h, v12.8h, v7.8h + fmin v13.8h, v13.8h, v7.8h + fmin v14.8h, v14.8h, v7.8h + fmin v15.8h, v15.8h, v7.8h + fmin v16.8h, v16.8h, v7.8h + fmin v17.8h, v17.8h, v7.8h + fmin v18.8h, v18.8h, v7.8h + fmin v19.8h, v19.8h, v7.8h + fmin v20.8h, v20.8h, v7.8h + fmin v21.8h, v21.8h, v7.8h + fmin v22.8h, v22.8h, v7.8h + fmin v23.8h, v23.8h, v7.8h + + Relu8x16: + dup v6.8h, wzr + fmax v8.8h, v8.8h, v6.8h + fmax v9.8h, v9.8h, v6.8h + fmax v10.8h, v10.8h, v6.8h + fmax v11.8h, v11.8h, v6.8h + fmax v12.8h, v12.8h, v6.8h + fmax v13.8h, v13.8h, v6.8h + fmax v14.8h, v14.8h, v6.8h + fmax v15.8h, v15.8h, v6.8h + fmax v16.8h, v16.8h, v6.8h + fmax v17.8h, v17.8h, v6.8h + fmax v18.8h, v18.8h, v6.8h + fmax v19.8h, v19.8h, v6.8h + fmax v20.8h, v20.8h, v6.8h + fmax v21.8h, v21.8h, v6.8h + fmax v22.8h, v22.8h, v6.8h + fmax v23.8h, v23.8h, v6.8h + Write8x16: + mov x22, x21 + add x23, x21, x8, lsl #2 + st1 {v8.8h, v9.8h}, [x22], x8 + st1 {v10.8h, v11.8h}, [x22], x8 + st1 {v12.8h, v13.8h}, [x22], x8 + st1 {v14.8h, v15.8h}, [x22] + st1 {v16.8h, v17.8h}, [x23], x8 + st1 {v18.8h, v19.8h}, [x23], x8 + st1 {v20.8h, v21.8h}, [x23], x8 + st1 {v22.8h, v23.8h}, [x23] + add x21, x21, #32 + subs x13, x13, #16 + bge LoopCol8x16 + + LoopCol8x8: + adds x13, x13, #16 + cbz x13, LoopRow8End + subs x13, x13, #8 + blt LoopCol8x4 + mov x10, x0 // update matrixA + ld1 {v0.8h}, [x10], #16 + mov x14, x5 // reload depth + prfm pldl1strm, [x11, #632] + ld1 {v3.8h}, [x11], #16 + cbnz x12, InitFromBias8x8 + dup v8.2d, xzr + dup v10.2d, xzr + dup v12.2d, xzr + dup v14.2d, xzr + dup v16.2d, xzr + dup v18.2d, xzr + dup v20.2d, xzr + dup v22.2d, xzr + b Compute8x8Enter + InitFromBias8x8: + ld1 {v8.8h}, [x12] + ld1 {v10.8h}, [x12] + ld1 {v12.8h}, [x12] + ld1 {v14.8h}, [x12] + ld1 {v16.8h}, [x12] + ld1 {v18.8h}, [x12] + ld1 {v20.8h}, [x12] + ld1 {v22.8h}, [x12] + add x12, x12, #16 + Compute8x8Enter: + bl Compute8x8Unit + Activation8x8: + cmp x4, #3 + beq Relu68x8 + cmp x4, #1 + beq Relu8x8 + b Write8x8 + + Relu68x8: + fmin v8.8h, v8.8h, v7.8h + fmin v10.8h, v10.8h, v7.8h + fmin v12.8h, v12.8h, v7.8h + fmin v14.8h, v14.8h, v7.8h + fmin v16.8h, v16.8h, v7.8h + fmin v18.8h, v18.8h, v7.8h + fmin v20.8h, v20.8h, v7.8h + fmin v22.8h, v22.8h, v7.8h + + Relu8x8: + dup v6.8h, wzr + fmax v8.8h, v8.8h, v6.8h + fmax v10.8h, v10.8h, v6.8h + fmax v12.8h, v12.8h, v6.8h + fmax v14.8h, v14.8h, v6.8h + fmax v16.8h, v16.8h, v6.8h + fmax v18.8h, v18.8h, v6.8h + fmax v20.8h, v20.8h, v6.8h + fmax v22.8h, v22.8h, v6.8h + Write8x8: + mov x22, x21 + add x23, x21, x8, lsl #2 + st1 {v8.8h}, [x22], x8 + st1 {v10.8h}, [x22], x8 + st1 {v12.8h}, [x22], x8 + st1 {v14.8h}, [x22] + st1 {v16.8h}, [x23], x8 + st1 {v18.8h}, [x23], x8 + st1 {v20.8h}, [x23], x8 + st1 {v22.8h}, [x23] + add x21, x21, #16 + subs x13, x13, #8 + + LoopCol8x4: + adds x13, x13, #8 + cbz x13, LoopRow8End + 
LoopCol8x4Core: + mov x10, x0 // update matrixA + ld1 {v0.8h}, [x10], #16 + mov x14, x5 // reload depth + prfm pldl1strm, [x11, #632] + ld1 {v3.4h}, [x11], #8 + cbnz x12, InitFromBias8x4 + dup v8.2s, wzr + dup v10.2s, wzr + dup v12.2s, wzr + dup v14.2s, wzr + dup v16.2s, wzr + dup v18.2s, wzr + dup v20.2s, wzr + dup v22.2s, wzr + b Compute8x4Enter + InitFromBias8x4: + ld1 {v8.4h}, [x12] + ld1 {v10.4h}, [x12] + ld1 {v12.4h}, [x12] + ld1 {v14.4h}, [x12] + ld1 {v16.4h}, [x12] + ld1 {v18.4h}, [x12] + ld1 {v20.4h}, [x12] + ld1 {v22.4h}, [x12] + add x12, x12, #8 + Compute8x4Enter: + bl Compute8x4Unit + Activation8x4: + cmp x4, #3 + beq Relu68x4 + cmp x4, #1 + beq Relu8x4 + b Write8x4 + + Relu68x4: + fmin v8.4h, v8.4h, v7.4h + fmin v10.4h, v10.4h, v7.4h + fmin v12.4h, v12.4h, v7.4h + fmin v14.4h, v14.4h, v7.4h + fmin v16.4h, v16.4h, v7.4h + fmin v18.4h, v18.4h, v7.4h + fmin v20.4h, v20.4h, v7.4h + fmin v22.4h, v22.4h, v7.4h + + Relu8x4: + dup v6.4h, wzr + fmax v8.4h, v8.4h, v6.4h + fmax v10.4h, v10.4h, v6.4h + fmax v12.4h, v12.4h, v6.4h + fmax v14.4h, v14.4h, v6.4h + fmax v16.4h, v16.4h, v6.4h + fmax v18.4h, v18.4h, v6.4h + fmax v20.4h, v20.4h, v6.4h + fmax v22.4h, v22.4h, v6.4h + Write8x4: + mov x22, x21 + add x23, x21, x8, lsl #2 + cmp x13, #1 + beq Write8x1 + cmp x13, #2 + beq Write8x2 + cmp x13, #3 + beq Write8x3 + st1 {v8.4h}, [x22], x8 + st1 {v10.4h}, [x22], x8 + st1 {v12.4h}, [x22], x8 + st1 {v14.4h}, [x22] + st1 {v16.4h}, [x23], x8 + st1 {v18.4h}, [x23], x8 + st1 {v20.4h}, [x23], x8 + st1 {v22.4h}, [x23] + add x21, x21, #8 + subs x13, x13, #4 + bgt LoopCol8x4Core + b LoopRow8End + Write8x1: + st1 {v8.h}[0], [x22], x8 + st1 {v10.h}[0], [x22], x8 + st1 {v12.h}[0], [x22], x8 + st1 {v14.h}[0], [x22] + st1 {v16.h}[0], [x23], x8 + st1 {v18.h}[0], [x23], x8 + st1 {v20.h}[0], [x23], x8 + st1 {v22.h}[0], [x23] + b LoopRow8End + Write8x2: + st1 {v8.s}[0], [x22], x8 + st1 {v10.s}[0], [x22], x8 + st1 {v12.s}[0], [x22], x8 + st1 {v14.s}[0], [x22] + st1 {v16.s}[0], [x23], x8 + st1 {v18.s}[0], [x23], x8 + st1 {v20.s}[0], [x23], x8 + st1 {v22.s}[0], [x23] + b LoopRow8End + Write8x3: + add x23, x22, #4 + st1 {v8.s}[0], [x22], x8 + st1 {v8.h}[2], [x23], x8 + st1 {v10.s}[0], [x22], x8 + st1 {v10.h}[2], [x23], x8 + st1 {v12.s}[0], [x22], x8 + st1 {v12.h}[2], [x23], x8 + st1 {v14.s}[0], [x22], x8 + st1 {v14.h}[2], [x23], x8 + st1 {v16.s}[0], [x22], x8 + st1 {v16.h}[2], [x23], x8 + st1 {v18.s}[0], [x22], x8 + st1 {v18.h}[2], [x23], x8 + st1 {v20.s}[0], [x22], x8 + st1 {v20.h}[2], [x23], x8 + st1 {v22.s}[0], [x22], x8 + st1 {v22.h}[2], [x23], x8 + LoopRow8End: + add x0, x0, x16, lsl #3 + add x2, x2, x8, lsl #3 + subs x6, x6, #8 + +LoopRow4: + adds x6, x6, #8 + cbz x6, End + subs x6, x6, #4 + blt LoopRowTail + mov x11, x1 // reload matrixB + mov x12, x3 // reload bias + mov x13, x7 // reload col + mov x21, x2 // relocate output + subs x13, x13, #16 + blt LoopCol4x8 + LoopCol4x16: + mov x10, x0 // update matrixA + ld1 {v0.4h}, [x10], #8 + mov x14, x5 // reload depth + prfm pldl1strm, [x11, #632] + ld1 {v3.8h}, [x11], #16 + cbnz x12, InitFromBias4x16 + dup v8.2d, xzr + dup v9.2d, xzr + dup v10.2d, xzr + dup v11.2d, xzr + dup v12.2d, xzr + dup v13.2d, xzr + dup v14.2d, xzr + dup v15.2d, xzr + b Compute4x16Enter + InitFromBias4x16: + ld1 {v8.8h, v9.8h}, [x12] + ld1 {v10.8h, v11.8h}, [x12] + ld1 {v12.8h, v13.8h}, [x12] + ld1 {v14.8h, v15.8h}, [x12] + add x12, x12, #32 + Compute4x16Enter: + bl Compute4x16Unit + Activation4x16: + cmp x4, #3 + beq Relu64x16 + cmp x4, #1 + beq Relu4x16 + b Write4x16 + + Relu64x16: + fmin 
v8.8h, v8.8h, v7.8h + fmin v9.8h, v9.8h, v7.8h + fmin v10.8h, v10.8h, v7.8h + fmin v11.8h, v11.8h, v7.8h + fmin v12.8h, v12.8h, v7.8h + fmin v13.8h, v13.8h, v7.8h + fmin v14.8h, v14.8h, v7.8h + fmin v15.8h, v15.8h, v7.8h + + Relu4x16: + dup v6.8h, wzr + fmax v8.8h, v8.8h, v6.8h + fmax v9.8h, v9.8h, v6.8h + fmax v10.8h, v10.8h, v6.8h + fmax v11.8h, v11.8h, v6.8h + fmax v12.8h, v12.8h, v6.8h + fmax v13.8h, v13.8h, v6.8h + fmax v14.8h, v14.8h, v6.8h + fmax v15.8h, v15.8h, v6.8h + Write4x16: + mov x22, x21 + st1 {v8.8h, v9.8h}, [x22], x8 + st1 {v10.8h, v11.8h}, [x22], x8 + st1 {v12.8h, v13.8h}, [x22], x8 + st1 {v14.8h, v15.8h}, [x22] + add x21, x21, #32 + subs x13, x13, #16 + bge LoopCol4x16 + + LoopCol4x8: + adds x13, x13, #16 + cbz x13, LoopRow4End + subs x13, x13, #8 + blt LoopCol4x4 + mov x10, x0 // update matrixA + ld1 {v0.4h}, [x10], #8 + mov x14, x5 // reload depth + prfm pldl1strm, [x11, #632] + ld1 {v3.8h}, [x11], #16 + cbnz x12, InitFromBias4x8 + dup v8.2d, xzr + dup v10.2d, xzr + dup v12.2d, xzr + dup v14.2d, xzr + b Compute4x8Enter + InitFromBias4x8: + ld1 {v8.8h}, [x12] + ld1 {v10.8h}, [x12] + ld1 {v12.8h}, [x12] + ld1 {v14.8h}, [x12] + add x12, x12, #16 + Compute4x8Enter: + bl Compute4x8Unit + Activation4x8: + cmp x4, #3 + beq Relu64x8 + cmp x4, #1 + beq Relu4x8 + b Write4x8 + + Relu64x8: + fmin v8.8h, v8.8h, v7.8h + fmin v10.8h, v10.8h, v7.8h + fmin v12.8h, v12.8h, v7.8h + fmin v14.8h, v14.8h, v7.8h + + Relu4x8: + dup v6.8h, wzr + fmax v8.8h, v8.8h, v6.8h + fmax v10.8h, v10.8h, v6.8h + fmax v12.8h, v12.8h, v6.8h + fmax v14.8h, v14.8h, v6.8h + Write4x8: + mov x22, x21 + st1 {v8.8h}, [x22], x8 + st1 {v10.8h}, [x22], x8 + st1 {v12.8h}, [x22], x8 + st1 {v14.8h}, [x22] + add x21, x21, #16 + subs x13, x13, #8 + + LoopCol4x4: + adds x13, x13, #8 + cbz x13, LoopRow4End + LoopCol4x4Core: + mov x10, x0 // update matrixA + ld1 {v0.4h}, [x10], #8 + mov x14, x5 // reload depth + prfm pldl1strm, [x11, #632] + ld1 {v3.4h}, [x11], #8 + cbnz x12, InitFromBias4x4 + dup v8.2s, wzr + dup v10.2s, wzr + dup v12.2s, wzr + dup v14.2s, wzr + b Compute4x4Enter + InitFromBias4x4: + ld1 {v8.4h}, [x12] + ld1 {v10.4h}, [x12] + ld1 {v12.4h}, [x12] + ld1 {v14.4h}, [x12] + add x12, x12, #8 + Compute4x4Enter: + bl Compute4x4Unit + Activation4x4: + cmp x4, #3 + beq Relu64x4 + cmp x4, #1 + beq Relu4x4 + b Write4x4 + + Relu64x4: + fmin v8.4h, v8.4h, v7.4h + fmin v10.4h, v10.4h, v7.4h + fmin v12.4h, v12.4h, v7.4h + fmin v14.4h, v14.4h, v7.4h + + Relu4x4: + dup v6.4h, wzr + fmax v8.4h, v8.4h, v6.4h + fmax v10.4h, v10.4h, v6.4h + fmax v12.4h, v12.4h, v6.4h + fmax v14.4h, v14.4h, v6.4h + Write4x4: + mov x22, x21 + cmp x13, #1 + beq Write4x1 + cmp x13, #2 + beq Write4x2 + cmp x13, #3 + beq Write4x3 + st1 {v8.4h}, [x22], x8 + st1 {v10.4h}, [x22], x8 + st1 {v12.4h}, [x22], x8 + st1 {v14.4h}, [x22] + add x21, x21, #8 + subs x13, x13, #4 + bgt LoopCol4x4Core + b LoopRow4End + Write4x1: + st1 {v8.h}[0], [x22], x8 + st1 {v10.h}[0], [x22], x8 + st1 {v12.h}[0], [x22], x8 + st1 {v14.h}[0], [x22] + b LoopRow4End + Write4x2: + st1 {v8.s}[0], [x22], x8 + st1 {v10.s}[0], [x22], x8 + st1 {v12.s}[0], [x22], x8 + st1 {v14.s}[0], [x22] + b LoopRow4End + Write4x3: + add x23, x22, #4 + st1 {v8.s}[0], [x22], x8 + st1 {v8.h}[2], [x23], x8 + st1 {v10.s}[0], [x22], x8 + st1 {v10.h}[2], [x23], x8 + st1 {v12.s}[0], [x22], x8 + st1 {v12.h}[2], [x23], x8 + st1 {v14.s}[0], [x22], x8 + st1 {v14.h}[2], [x23], x8 + LoopRow4End: + add x0, x0, x16, lsl #2 + add x2, x2, x8, lsl #2 + subs x6, x6, #4 + +LoopRowTail: + adds x6, x6, #4 + cbz x6, End + cmp 
x6, #1 + beq LoopRow1 + cmp x6, #2 + beq LoopRow2 + // LoopRow3 + mov x11, x1 // reload matrixB + mov x12, x3 // reload bias + mov x13, x7 // reload col + mov x21, x2 // relocate output + subs x13, x13, #16 + blt LoopCol3x8 + LoopCol3x16: + mov x10, x0 // update matrixA + mov x14, x5 // reload depth + cbnz x12, InitFromBias3x16 + dup v8.2d, xzr + dup v9.2d, xzr + dup v10.2d, xzr + dup v11.2d, xzr + dup v12.2d, xzr + dup v13.2d, xzr + b Compute3x16Enter + InitFromBias3x16: + ld1 {v8.8h, v9.8h}, [x12] + ld1 {v10.8h, v11.8h}, [x12] + ld1 {v12.8h, v13.8h}, [x12] + add x12, x12, #32 + Compute3x16Enter: + bl Compute3x16Unit + Activation3x16: + cmp x4, #3 + beq Relu63x16 + cmp x4, #1 + beq Relu3x16 + b Write3x16 + + Relu63x16: + fmin v8.8h, v8.8h, v7.8h + fmin v9.8h, v9.8h, v7.8h + fmin v10.8h, v10.8h, v7.8h + fmin v11.8h, v11.8h, v7.8h + fmin v12.8h, v12.8h, v7.8h + fmin v13.8h, v13.8h, v7.8h + + Relu3x16: + dup v6.8h, wzr + fmax v8.8h, v8.8h, v6.8h + fmax v9.8h, v9.8h, v6.8h + fmax v10.8h, v10.8h, v6.8h + fmax v11.8h, v11.8h, v6.8h + fmax v12.8h, v12.8h, v6.8h + fmax v13.8h, v13.8h, v6.8h + Write3x16: + mov x22, x21 + st1 {v8.8h, v9.8h}, [x22], x8 + st1 {v10.8h, v11.8h}, [x22], x8 + st1 {v12.8h, v13.8h}, [x22] + add x21, x21, #32 + subs x13, x13, #16 + bge LoopCol3x16 + + LoopCol3x8: + adds x13, x13, #16 + cbz x13, End + subs x13, x13, #8 + blt LoopCol3x4 + mov x10, x0 // update matrixA + mov x14, x5 // reload depth + cbnz x12, InitFromBias3x8 + dup v8.2d, xzr + dup v10.2d, xzr + dup v12.2d, xzr + b Compute3x8Enter + InitFromBias3x8: + ld1 {v8.8h}, [x12] + ld1 {v10.8h}, [x12] + ld1 {v12.8h}, [x12] + add x12, x12, #16 + Compute3x8Enter: + bl Compute3x8Unit + Activation3x8: + cmp x4, #3 + beq Relu63x8 + cmp x4, #1 + beq Relu3x8 + b Write3x8 + + Relu63x8: + fmin v8.8h, v8.8h, v7.8h + fmin v10.8h, v10.8h, v7.8h + fmin v12.8h, v12.8h, v7.8h + + Relu3x8: + dup v6.8h, wzr + fmax v8.8h, v8.8h, v6.8h + fmax v10.8h, v10.8h, v6.8h + fmax v12.8h, v12.8h, v6.8h + Write3x8: + mov x22, x21 + st1 {v8.8h}, [x22], x8 + st1 {v10.8h}, [x22], x8 + st1 {v12.8h}, [x22] + add x21, x21, #16 + subs x13, x13, #8 + + LoopCol3x4: + adds x13, x13, #8 + cbz x13, End + LoopCol3x4Core: + mov x10, x0 // update matrixA + mov x14, x5 // reload depth + cbnz x12, InitFromBias3x4 + dup v8.2s, wzr + dup v10.2s, wzr + dup v12.2s, wzr + b Compute3x4Enter + InitFromBias3x4: + ld1 {v8.4h}, [x12] + ld1 {v10.4h}, [x12] + ld1 {v12.4h}, [x12] + add x12, x12, #8 + Compute3x4Enter: + bl Compute3x4Unit + Activation3x4: + cmp x4, #3 + beq Relu63x4 + cmp x4, #1 + beq Relu3x4 + b Write3x4 + + Relu63x4: + fmin v8.4h, v8.4h, v7.4h + fmin v10.4h, v10.4h, v7.4h + fmin v12.4h, v12.4h, v7.4h + + Relu3x4: + dup v6.4h, wzr + fmax v8.4h, v8.4h, v6.4h + fmax v10.4h, v10.4h, v6.4h + fmax v12.4h, v12.4h, v6.4h + Write3x4: + mov x22, x21 + cmp x13, #1 + beq Write3x1 + cmp x13, #2 + beq Write3x2 + cmp x13, #3 + beq Write3x3 + st1 {v8.4h}, [x22], x8 + st1 {v10.4h}, [x22], x8 + st1 {v12.4h}, [x22] + add x21, x21, #8 + subs x13, x13, #4 + bgt LoopCol3x4Core + b End + Write3x1: + st1 {v8.h}[0], [x22], x8 + st1 {v10.h}[0], [x22], x8 + st1 {v12.h}[0], [x22] + b End + Write3x2: + st1 {v8.s}[0], [x22], x8 + st1 {v10.s}[0], [x22], x8 + st1 {v12.s}[0], [x22] + b End + Write3x3: + add x23, x22, #4 + st1 {v8.s}[0], [x22], x8 + st1 {v8.h}[2], [x23], x8 + st1 {v10.s}[0], [x22], x8 + st1 {v10.h}[2], [x23], x8 + st1 {v12.s}[0], [x22], x8 + st1 {v12.h}[2], [x23], x8 + b End + +LoopRow2: + mov x11, x1 // reload matrixB + mov x12, x3 // reload bias + mov x13, x7 // reload col + 
mov x21, x2 // relocate output + subs x13, x13, #16 + blt LoopCol2x8 + LoopCol2x16: + mov x10, x0 // update matrixA + mov x14, x5 // reload depth + cbnz x12, InitFromBias2x16 + dup v8.2d, xzr + dup v9.2d, xzr + dup v10.2d, xzr + dup v11.2d, xzr + b Compute2x16Enter + InitFromBias2x16: + ld1 {v8.8h, v9.8h}, [x12] + ld1 {v10.8h, v11.8h}, [x12] + add x12, x12, #32 + Compute2x16Enter: + bl Compute2x16Unit + Activation2x16: + cmp x4, #3 + beq Relu62x16 + cmp x4, #1 + beq Relu2x16 + b Write2x16 + + Relu62x16: + fmin v8.8h, v8.8h, v7.8h + fmin v9.8h, v9.8h, v7.8h + fmin v10.8h, v10.8h, v7.8h + fmin v11.8h, v11.8h, v7.8h + + Relu2x16: + dup v6.8h, wzr + fmax v8.8h, v8.8h, v6.8h + fmax v9.8h, v9.8h, v6.8h + fmax v10.8h, v10.8h, v6.8h + fmax v11.8h, v11.8h, v6.8h + Write2x16: + mov x22, x21 + st1 {v8.8h, v9.8h}, [x22], x8 + st1 {v10.8h, v11.8h}, [x22] + add x21, x21, #32 + subs x13, x13, #16 + bge LoopCol2x16 + + LoopCol2x8: + adds x13, x13, #16 + cbz x13, End + subs x13, x13, #8 + blt LoopCol2x4 + mov x10, x0 // update matrixA + mov x14, x5 // reload depth + cbnz x12, InitFromBias2x8 + dup v8.2d, xzr + dup v10.2d, xzr + b Compute2x8Enter + InitFromBias2x8: + ld1 {v8.8h}, [x12] + ld1 {v10.8h}, [x12] + add x12, x12, #16 + Compute2x8Enter: + bl Compute2x8Unit + Activation2x8: + cmp x4, #3 + beq Relu62x8 + cmp x4, #1 + beq Relu2x8 + b Write2x8 + + Relu62x8: + fmin v8.8h, v8.8h, v7.8h + fmin v10.8h, v10.8h, v7.8h + + Relu2x8: + dup v6.8h, wzr + fmax v8.8h, v8.8h, v6.8h + fmax v10.8h, v10.8h, v6.8h + Write2x8: + mov x22, x21 + st1 {v8.8h}, [x22], x8 + st1 {v10.8h}, [x22] + add x21, x21, #16 + subs x13, x13, #8 + + LoopCol2x4: + adds x13, x13, #8 + cbz x13, End + LoopCol2x4Core: + mov x10, x0 // update matrixA + mov x14, x5 // reload depth + cbnz x12, InitFromBias2x4 + dup v8.2s, wzr + dup v10.2s, wzr + b Compute2x4Enter + InitFromBias2x4: + ld1 {v8.4h}, [x12] + ld1 {v10.4h}, [x12] + add x12, x12, #8 + Compute2x4Enter: + bl Compute2x4Unit + Activation2x4: + cmp x4, #3 + beq Relu62x4 + cmp x4, #1 + beq Relu2x4 + b Write2x4 + + Relu62x4: + fmin v8.4h, v8.4h, v7.4h + fmin v10.4h, v10.4h, v7.4h + Relu2x4: + dup v6.4h, wzr + fmax v8.4h, v8.4h, v6.4h + fmax v10.4h, v10.4h, v6.4h + Write2x4: + mov x22, x21 + cmp x13, #1 + beq Write2x1 + cmp x13, #2 + beq Write2x2 + cmp x13, #3 + beq Write2x3 + st1 {v8.4h}, [x22], x8 + st1 {v10.4h}, [x22] + add x21, x21, #8 + subs x13, x13, #4 + bgt LoopCol2x4Core + b End + Write2x1: + st1 {v8.h}[0], [x22], x8 + st1 {v10.h}[0], [x22] + b End + Write2x2: + st1 {v8.s}[0], [x22], x8 + st1 {v10.s}[0], [x22] + b End + Write2x3: + add x23, x22, #4 + st1 {v8.s}[0], [x22], x8 + st1 {v8.h}[2], [x23], x8 + st1 {v10.s}[0], [x22], x8 + st1 {v10.h}[2], [x23], x8 + b End + +LoopRow1: + mov x11, x1 // reload matrixB + mov x12, x3 // reload bias + mov x13, x7 // reload col + mov x21, x2 // relocate output + subs x13, x13, #16 + blt LoopCol1x8 + LoopCol1x16: + mov x10, x0 // update matrixA + mov x14, x5 // reload depth + cbnz x12, InitFromBias1x16 + dup v8.2d, xzr + dup v9.2d, xzr + b Compute1x16Enter + InitFromBias1x16: + ld1 {v8.8h, v9.8h}, [x12], #32 + Compute1x16Enter: + bl Compute1x16Unit + Activation1x16: + cmp x4, #3 + beq Relu61x16 + cmp x4, #1 + beq Relu1x16 + b Write1x16 + + Relu61x16: + fmin v8.8h, v8.8h, v7.8h + fmin v9.8h, v9.8h, v7.8h + + Relu1x16: + dup v6.8h, wzr + fmax v8.8h, v8.8h, v6.8h + fmax v9.8h, v9.8h, v6.8h + Write1x16: + st1 {v8.8h, v9.8h}, [x21], #32 + subs x13, x13, #16 + bge LoopCol1x16 + + LoopCol1x8: + adds x13, x13, #16 + cbz x13, End + subs x13, x13, #8 + blt 
LoopCol1x4 + mov x10, x0 // update matrixA + mov x14, x5 // reload depth + cbnz x12, InitFromBias1x8 + dup v8.2d, xzr + b Compute1x8Enter + InitFromBias1x8: + ld1 {v8.8h}, [x12], #16 + Compute1x8Enter: + bl Compute1x8Unit + Activation1x8: + cmp x4, #3 + beq Relu61x8 + cmp x4, #1 + beq Relu1x8 + b Write1x8 + + Relu61x8: + fmin v8.8h, v8.8h, v7.8h + + Relu1x8: + dup v6.8h, wzr + fmax v8.8h, v8.8h, v6.8h + Write1x8: + st1 {v8.8h}, [x21], #16 + subs x13, x13, #8 + + LoopCol1x4: + adds x13, x13, #8 + cbz x13, End + LoopCol1x4Core: + mov x10, x0 // update matrixA + mov x14, x5 // reload depth + cbnz x12, InitFromBias1x4 + dup v8.2s, wzr + b Compute1x4Enter + InitFromBias1x4: + ld1 {v8.4h}, [x12], #8 + Compute1x4Enter: + bl Compute1x4Unit + Activation1x4: + cmp x4, #3 + beq Relu61x4 + cmp x4, #1 + beq Relu1x4 + b Write1x4 + + Relu61x4: + fmin v8.4h, v8.4h, v7.4h + Relu1x4: + dup v6.4h, wzr + fmax v8.4h, v8.4h, v6.4h + Write1x4: + cmp x13, #1 + beq Write1x1 + cmp x13, #2 + beq Write1x2 + cmp x13, #3 + beq Write1x3 + st1 {v8.4h}, [x21], #8 + subs x13, x13, #4 + bgt LoopCol1x4Core + b End + Write1x1: + st1 {v8.h}[0], [x21] + b End + Write1x2: + st1 {v8.s}[0], [x21] + b End + Write1x3: + add x22, x21, #4 + st1 {v8.s}[0], [x21] + st1 {v8.h}[2], [x22] + b End + +Compute12x16Unit: + subs x14, x14, #2 + ble Compute12x16End + Compute12x16: + prfm pldl1keep, [x10, #632] + ld1 {v1.8h, v2.8h}, [x10], #32 + ld1 {v4.8h, v5.8h}, [x11], #32 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v0.h[1] + fmla v12.8h, v3.8h, v0.h[2] + fmla v14.8h, v3.8h, v0.h[3] + fmla v16.8h, v3.8h, v0.h[4] + fmla v18.8h, v3.8h, v0.h[5] + fmla v20.8h, v3.8h, v0.h[6] + fmla v22.8h, v3.8h, v0.h[7] + fmla v24.8h, v3.8h, v1.h[0] + fmla v26.8h, v3.8h, v1.h[1] + fmla v28.8h, v3.8h, v1.h[2] + fmla v30.8h, v3.8h, v1.h[3] + prfm pldl1strm, [x11, #632] + ld1 {v6.8h}, [x11], #16 + fmla v9.8h, v4.8h, v0.h[0] + fmla v11.8h, v4.8h, v0.h[1] + fmla v13.8h, v4.8h, v0.h[2] + fmla v15.8h, v4.8h, v0.h[3] + fmla v17.8h, v4.8h, v0.h[4] + fmla v19.8h, v4.8h, v0.h[5] + fmla v21.8h, v4.8h, v0.h[6] + fmla v23.8h, v4.8h, v0.h[7] + fmla v25.8h, v4.8h, v1.h[0] + fmla v27.8h, v4.8h, v1.h[1] + fmla v29.8h, v4.8h, v1.h[2] + fmla v31.8h, v4.8h, v1.h[3] + + fmla v8.8h, v5.8h, v1.h[4] + fmla v10.8h, v5.8h, v1.h[5] + fmla v12.8h, v5.8h, v1.h[6] + fmla v14.8h, v5.8h, v1.h[7] + fmla v16.8h, v5.8h, v2.h[0] + fmla v18.8h, v5.8h, v2.h[1] + fmla v20.8h, v5.8h, v2.h[2] + fmla v22.8h, v5.8h, v2.h[3] + fmla v24.8h, v5.8h, v2.h[4] + fmla v26.8h, v5.8h, v2.h[5] + fmla v28.8h, v5.8h, v2.h[6] + fmla v30.8h, v5.8h, v2.h[7] + prfm pldl1strm, [x11, #632] + ld1 {v3.8h}, [x11], #16 + fmla v9.8h, v6.8h, v1.h[4] + fmla v11.8h, v6.8h, v1.h[5] + fmla v13.8h, v6.8h, v1.h[6] + fmla v15.8h, v6.8h, v1.h[7] + prfm pldl1keep, [x10, #632] + ld1 {v0.8h}, [x10], #16 + fmla v17.8h, v6.8h, v2.h[0] + fmla v19.8h, v6.8h, v2.h[1] + fmla v21.8h, v6.8h, v2.h[2] + fmla v23.8h, v6.8h, v2.h[3] + fmla v25.8h, v6.8h, v2.h[4] + fmla v27.8h, v6.8h, v2.h[5] + fmla v29.8h, v6.8h, v2.h[6] + fmla v31.8h, v6.8h, v2.h[7] + + subs x14, x14, #2 + bgt Compute12x16 + Compute12x16End: + cbnz x14, Compute12x16End1 + prfm pldl1keep, [x10, #632] + ld1 {v1.4h}, [x10], #8 + ld1 {v4.8h}, [x11], #16 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v0.h[1] + fmla v12.8h, v3.8h, v0.h[2] + fmla v14.8h, v3.8h, v0.h[3] + fmla v16.8h, v3.8h, v0.h[4] + fmla v18.8h, v3.8h, v0.h[5] + fmla v20.8h, v3.8h, v0.h[6] + fmla v22.8h, v3.8h, v0.h[7] + fmla v24.8h, v3.8h, v1.h[0] + fmla v26.8h, v3.8h, v1.h[1] + fmla v28.8h, v3.8h, v1.h[2] + fmla 
v30.8h, v3.8h, v1.h[3] + prfm pldl1strm, [x11, #632] + ld1 {v3.8h}, [x11], #16 + fmla v9.8h, v4.8h, v0.h[0] + fmla v11.8h, v4.8h, v0.h[1] + fmla v13.8h, v4.8h, v0.h[2] + fmla v15.8h, v4.8h, v0.h[3] + ld1 {v2.8h}, [x10], #16 + fmla v17.8h, v4.8h, v0.h[4] + fmla v19.8h, v4.8h, v0.h[5] + fmla v21.8h, v4.8h, v0.h[6] + fmla v23.8h, v4.8h, v0.h[7] + fmla v25.8h, v4.8h, v1.h[0] + fmla v27.8h, v4.8h, v1.h[1] + fmla v29.8h, v4.8h, v1.h[2] + fmla v31.8h, v4.8h, v1.h[3] + mov v0.16b, v2.16b + Compute12x16End1: + ld1 {v1.4h}, [x10] + ld1 {v4.8h}, [x11], #16 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v0.h[1] + fmla v12.8h, v3.8h, v0.h[2] + fmla v14.8h, v3.8h, v0.h[3] + fmla v16.8h, v3.8h, v0.h[4] + fmla v18.8h, v3.8h, v0.h[5] + fmla v20.8h, v3.8h, v0.h[6] + fmla v22.8h, v3.8h, v0.h[7] + fmla v24.8h, v3.8h, v1.h[0] + fmla v26.8h, v3.8h, v1.h[1] + fmla v28.8h, v3.8h, v1.h[2] + fmla v30.8h, v3.8h, v1.h[3] + fmla v9.8h, v4.8h, v0.h[0] + fmla v11.8h, v4.8h, v0.h[1] + fmla v13.8h, v4.8h, v0.h[2] + fmla v15.8h, v4.8h, v0.h[3] + fmla v17.8h, v4.8h, v0.h[4] + fmla v19.8h, v4.8h, v0.h[5] + fmla v21.8h, v4.8h, v0.h[6] + fmla v23.8h, v4.8h, v0.h[7] + fmla v25.8h, v4.8h, v1.h[0] + fmla v27.8h, v4.8h, v1.h[1] + fmla v29.8h, v4.8h, v1.h[2] + fmla v31.8h, v4.8h, v1.h[3] + ret + +Compute12x8Unit: + subs x14, x14, #2 + ble Compute12x8End + Compute12x8: + prfm pldl1keep, [x10, #632] + ld1 {v1.8h, v2.8h}, [x10], #32 + ld1 {v4.8h}, [x11], #16 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v0.h[1] + fmla v12.8h, v3.8h, v0.h[2] + fmla v14.8h, v3.8h, v0.h[3] + fmla v16.8h, v3.8h, v0.h[4] + fmla v18.8h, v3.8h, v0.h[5] + fmla v20.8h, v3.8h, v0.h[6] + fmla v22.8h, v3.8h, v0.h[7] + fmla v24.8h, v3.8h, v1.h[0] + fmla v26.8h, v3.8h, v1.h[1] + fmla v28.8h, v3.8h, v1.h[2] + fmla v30.8h, v3.8h, v1.h[3] + prfm pldl1strm, [x11, #632] + ld1 {v3.8h}, [x11], #16 + fmla v8.8h, v4.8h, v1.h[4] + fmla v10.8h, v4.8h, v1.h[5] + fmla v12.8h, v4.8h, v1.h[6] + fmla v14.8h, v4.8h, v1.h[7] + ld1 {v0.8h}, [x10], #16 + fmla v16.8h, v4.8h, v2.h[0] + fmla v18.8h, v4.8h, v2.h[1] + fmla v20.8h, v4.8h, v2.h[2] + fmla v22.8h, v4.8h, v2.h[3] + fmla v24.8h, v4.8h, v2.h[4] + fmla v26.8h, v4.8h, v2.h[5] + fmla v28.8h, v4.8h, v2.h[6] + fmla v30.8h, v4.8h, v2.h[7] + + subs x14, x14, #2 + bgt Compute12x8 + Compute12x8End: + cbnz x14, Compute12x8End1 + prfm pldl1keep, [x10, #632] + ld1 {v1.4h}, [x10], #8 + ld1 {v4.8h}, [x11], #16 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v0.h[1] + fmla v12.8h, v3.8h, v0.h[2] + fmla v14.8h, v3.8h, v0.h[3] + fmla v16.8h, v3.8h, v0.h[4] + fmla v18.8h, v3.8h, v0.h[5] + fmla v20.8h, v3.8h, v0.h[6] + fmla v22.8h, v3.8h, v0.h[7] + fmla v24.8h, v3.8h, v1.h[0] + fmla v26.8h, v3.8h, v1.h[1] + fmla v28.8h, v3.8h, v1.h[2] + fmla v30.8h, v3.8h, v1.h[3] + ld1 {v0.8h}, [x10], #16 + mov v3.16b, v4.16b + Compute12x8End1: + ld1 {v1.4h}, [x10] + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v0.h[1] + fmla v12.8h, v3.8h, v0.h[2] + fmla v14.8h, v3.8h, v0.h[3] + fmla v16.8h, v3.8h, v0.h[4] + fmla v18.8h, v3.8h, v0.h[5] + fmla v20.8h, v3.8h, v0.h[6] + fmla v22.8h, v3.8h, v0.h[7] + fmla v24.8h, v3.8h, v1.h[0] + fmla v26.8h, v3.8h, v1.h[1] + fmla v28.8h, v3.8h, v1.h[2] + fmla v30.8h, v3.8h, v1.h[3] + ret + +Compute12x4Unit: + subs x14, x14, #2 + ble Compute12x4End + Compute12x4: + prfm pldl1keep, [x10, #632] + ld1 {v1.8h, v2.8h}, [x10], #32 + ld1 {v4.4h}, [x11], #8 + fmla v8.4h, v3.4h, v0.h[0] + fmla v10.4h, v3.4h, v0.h[1] + fmla v12.4h, v3.4h, v0.h[2] + fmla v14.4h, v3.4h, v0.h[3] + fmla v16.4h, v3.4h, v0.h[4] + fmla v18.4h, v3.4h, 
v0.h[5] + fmla v20.4h, v3.4h, v0.h[6] + fmla v22.4h, v3.4h, v0.h[7] + fmla v24.4h, v3.4h, v1.h[0] + fmla v26.4h, v3.4h, v1.h[1] + fmla v28.4h, v3.4h, v1.h[2] + fmla v30.4h, v3.4h, v1.h[3] + prfm pldl1strm, [x11, #632] + ld1 {v3.4h}, [x11], #8 + fmla v8.4h, v4.4h, v1.h[4] + fmla v10.4h, v4.4h, v1.h[5] + fmla v12.4h, v4.4h, v1.h[6] + fmla v14.4h, v4.4h, v1.h[7] + ld1 {v0.8h}, [x10], #16 + fmla v16.4h, v4.4h, v2.h[0] + fmla v18.4h, v4.4h, v2.h[1] + fmla v20.4h, v4.4h, v2.h[2] + fmla v22.4h, v4.4h, v2.h[3] + fmla v24.4h, v4.4h, v2.h[4] + fmla v26.4h, v4.4h, v2.h[5] + fmla v28.4h, v4.4h, v2.h[6] + fmla v30.4h, v4.4h, v2.h[7] + + subs x14, x14, #2 + bgt Compute12x4 + Compute12x4End: + cbnz x14, Compute12x4End1 + prfm pldl1keep, [x10, #632] + ld1 {v1.4h}, [x10], #8 + ld1 {v4.4h}, [x11], #8 + fmla v8.4h, v3.4h, v0.h[0] + fmla v10.4h, v3.4h, v0.h[1] + fmla v12.4h, v3.4h, v0.h[2] + fmla v14.4h, v3.4h, v0.h[3] + fmla v16.4h, v3.4h, v0.h[4] + fmla v18.4h, v3.4h, v0.h[5] + fmla v20.4h, v3.4h, v0.h[6] + fmla v22.4h, v3.4h, v0.h[7] + fmla v24.4h, v3.4h, v1.h[0] + fmla v26.4h, v3.4h, v1.h[1] + fmla v28.4h, v3.4h, v1.h[2] + fmla v30.4h, v3.4h, v1.h[3] + ld1 {v0.8h}, [x10], #16 + mov v3.8b, v4.8b + Compute12x4End1: + ld1 {v1.4h}, [x10] + fmla v8.4h, v3.4h, v0.h[0] + fmla v10.4h, v3.4h, v0.h[1] + fmla v12.4h, v3.4h, v0.h[2] + fmla v14.4h, v3.4h, v0.h[3] + fmla v16.4h, v3.4h, v0.h[4] + fmla v18.4h, v3.4h, v0.h[5] + fmla v20.4h, v3.4h, v0.h[6] + fmla v22.4h, v3.4h, v0.h[7] + fmla v24.4h, v3.4h, v1.h[0] + fmla v26.4h, v3.4h, v1.h[1] + fmla v28.4h, v3.4h, v1.h[2] + fmla v30.4h, v3.4h, v1.h[3] + ret + +Compute8x16Unit: + subs x14, x14, #2 + ble Compute8x16End + Compute8x16: + prfm pldl1keep, [x10, #632] + ld1 {v1.8h}, [x10], #16 + ld1 {v4.8h, v5.8h}, [x11], #32 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v0.h[1] + fmla v12.8h, v3.8h, v0.h[2] + fmla v14.8h, v3.8h, v0.h[3] + fmla v16.8h, v3.8h, v0.h[4] + fmla v18.8h, v3.8h, v0.h[5] + fmla v20.8h, v3.8h, v0.h[6] + fmla v22.8h, v3.8h, v0.h[7] + prfm pldl1strm, [x11, #632] + ld1 {v6.8h}, [x11], #16 + fmla v9.8h, v4.8h, v0.h[0] + fmla v11.8h, v4.8h, v0.h[1] + fmla v13.8h, v4.8h, v0.h[2] + fmla v15.8h, v4.8h, v0.h[3] + fmla v17.8h, v4.8h, v0.h[4] + fmla v19.8h, v4.8h, v0.h[5] + fmla v21.8h, v4.8h, v0.h[6] + fmla v23.8h, v4.8h, v0.h[7] + + fmla v8.8h, v5.8h, v1.h[0] + fmla v10.8h, v5.8h, v1.h[1] + fmla v12.8h, v5.8h, v1.h[2] + fmla v14.8h, v5.8h, v1.h[3] + fmla v16.8h, v5.8h, v1.h[4] + fmla v18.8h, v5.8h, v1.h[5] + fmla v20.8h, v5.8h, v1.h[6] + fmla v22.8h, v5.8h, v1.h[7] + prfm pldl1strm, [x11, #632] + ld1 {v3.8h}, [x11], #16 + fmla v9.8h, v6.8h, v1.h[0] + fmla v11.8h, v6.8h, v1.h[1] + fmla v13.8h, v6.8h, v1.h[2] + fmla v15.8h, v6.8h, v1.h[3] + prfm pldl1keep, [x10, #632] + ld1 {v0.8h}, [x10], #16 + fmla v17.8h, v6.8h, v1.h[4] + fmla v19.8h, v6.8h, v1.h[5] + fmla v21.8h, v6.8h, v1.h[6] + fmla v23.8h, v6.8h, v1.h[7] + + subs x14, x14, #2 + bgt Compute8x16 + Compute8x16End: + cbnz x14, Compute8x16End1 + prfm pldl1keep, [x10, #632] + ld1 {v1.8h}, [x10] + ld1 {v4.8h}, [x11], #16 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v0.h[1] + fmla v12.8h, v3.8h, v0.h[2] + fmla v14.8h, v3.8h, v0.h[3] + fmla v16.8h, v3.8h, v0.h[4] + fmla v18.8h, v3.8h, v0.h[5] + fmla v20.8h, v3.8h, v0.h[6] + fmla v22.8h, v3.8h, v0.h[7] + prfm pldl1strm, [x11, #632] + ld1 {v3.8h}, [x11], #16 + fmla v9.8h, v4.8h, v0.h[0] + fmla v11.8h, v4.8h, v0.h[1] + fmla v13.8h, v4.8h, v0.h[2] + fmla v15.8h, v4.8h, v0.h[3] + fmla v17.8h, v4.8h, v0.h[4] + fmla v19.8h, v4.8h, v0.h[5] + fmla v21.8h, v4.8h, 
v0.h[6] + fmla v23.8h, v4.8h, v0.h[7] + mov v0.16b, v1.16b + Compute8x16End1: + ld1 {v4.8h}, [x11], #16 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v0.h[1] + fmla v12.8h, v3.8h, v0.h[2] + fmla v14.8h, v3.8h, v0.h[3] + fmla v16.8h, v3.8h, v0.h[4] + fmla v18.8h, v3.8h, v0.h[5] + fmla v20.8h, v3.8h, v0.h[6] + fmla v22.8h, v3.8h, v0.h[7] + fmla v9.8h, v4.8h, v0.h[0] + fmla v11.8h, v4.8h, v0.h[1] + fmla v13.8h, v4.8h, v0.h[2] + fmla v15.8h, v4.8h, v0.h[3] + fmla v17.8h, v4.8h, v0.h[4] + fmla v19.8h, v4.8h, v0.h[5] + fmla v21.8h, v4.8h, v0.h[6] + fmla v23.8h, v4.8h, v0.h[7] + ret + +Compute8x8Unit: + subs x14, x14, #2 + ble Compute8x8End + Compute8x8: + prfm pldl1keep, [x10, #632] + ld1 {v1.8h}, [x10], #16 + ld1 {v4.8h}, [x11], #16 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v0.h[1] + fmla v12.8h, v3.8h, v0.h[2] + fmla v14.8h, v3.8h, v0.h[3] + fmla v16.8h, v3.8h, v0.h[4] + fmla v18.8h, v3.8h, v0.h[5] + fmla v20.8h, v3.8h, v0.h[6] + fmla v22.8h, v3.8h, v0.h[7] + prfm pldl1strm, [x11, #632] + ld1 {v3.8h}, [x11], #16 + fmla v8.8h, v4.8h, v1.h[0] + fmla v10.8h, v4.8h, v1.h[1] + fmla v12.8h, v4.8h, v1.h[2] + fmla v14.8h, v4.8h, v1.h[3] + ld1 {v0.8h}, [x10], #16 + fmla v16.8h, v4.8h, v1.h[4] + fmla v18.8h, v4.8h, v1.h[5] + fmla v20.8h, v4.8h, v1.h[6] + fmla v22.8h, v4.8h, v1.h[7] + + subs x14, x14, #2 + bgt Compute8x8 + Compute8x8End: + cbnz x14, Compute8x8End1 + prfm pldl1keep, [x10, #632] + ld1 {v1.8h}, [x10] + ld1 {v4.8h}, [x11], #16 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v0.h[1] + fmla v12.8h, v3.8h, v0.h[2] + fmla v14.8h, v3.8h, v0.h[3] + fmla v16.8h, v3.8h, v0.h[4] + fmla v18.8h, v3.8h, v0.h[5] + fmla v20.8h, v3.8h, v0.h[6] + fmla v22.8h, v3.8h, v0.h[7] + mov v0.16b, v1.16b + mov v3.16b, v4.16b + Compute8x8End1: + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v0.h[1] + fmla v12.8h, v3.8h, v0.h[2] + fmla v14.8h, v3.8h, v0.h[3] + fmla v16.8h, v3.8h, v0.h[4] + fmla v18.8h, v3.8h, v0.h[5] + fmla v20.8h, v3.8h, v0.h[6] + fmla v22.8h, v3.8h, v0.h[7] + ret + +Compute8x4Unit: + subs x14, x14, #2 + ble Compute8x4End + Compute8x4: + prfm pldl1keep, [x10, #632] + ld1 {v1.8h}, [x10], #16 + ld1 {v4.4h}, [x11], #8 + fmla v8.4h, v3.4h, v0.h[0] + fmla v10.4h, v3.4h, v0.h[1] + fmla v12.4h, v3.4h, v0.h[2] + fmla v14.4h, v3.4h, v0.h[3] + fmla v16.4h, v3.4h, v0.h[4] + fmla v18.4h, v3.4h, v0.h[5] + fmla v20.4h, v3.4h, v0.h[6] + fmla v22.4h, v3.4h, v0.h[7] + prfm pldl1strm, [x11, #632] + ld1 {v3.4h}, [x11], #8 + fmla v8.4h, v4.4h, v1.h[0] + fmla v10.4h, v4.4h, v1.h[1] + fmla v12.4h, v4.4h, v1.h[2] + fmla v14.4h, v4.4h, v1.h[3] + ld1 {v0.8h}, [x10], #16 + fmla v16.4h, v4.4h, v1.h[4] + fmla v18.4h, v4.4h, v1.h[5] + fmla v20.4h, v4.4h, v1.h[6] + fmla v22.4h, v4.4h, v1.h[7] + + subs x14, x14, #2 + bgt Compute8x4 + Compute8x4End: + cbnz x14, Compute8x4End1 + prfm pldl1keep, [x10, #632] + ld1 {v1.8h}, [x10] + ld1 {v4.4h}, [x11], #8 + fmla v8.4h, v3.4h, v0.h[0] + fmla v10.4h, v3.4h, v0.h[1] + fmla v12.4h, v3.4h, v0.h[2] + fmla v14.4h, v3.4h, v0.h[3] + fmla v16.4h, v3.4h, v0.h[4] + fmla v18.4h, v3.4h, v0.h[5] + fmla v20.4h, v3.4h, v0.h[6] + fmla v22.4h, v3.4h, v0.h[7] + mov v0.16b, v1.16b + mov v3.8b, v4.8b + Compute8x4End1: + fmla v8.4h, v3.4h, v0.h[0] + fmla v10.4h, v3.4h, v0.h[1] + fmla v12.4h, v3.4h, v0.h[2] + fmla v14.4h, v3.4h, v0.h[3] + fmla v16.4h, v3.4h, v0.h[4] + fmla v18.4h, v3.4h, v0.h[5] + fmla v20.4h, v3.4h, v0.h[6] + fmla v22.4h, v3.4h, v0.h[7] + ret + +Compute4x16Unit: + subs x14, x14, #2 + ble Compute4x16End + Compute4x16: + prfm pldl1keep, [x10, #632] + ld1 {v1.4h}, [x10], #8 + 
ld1 {v4.8h, v5.8h}, [x11], #32 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v0.h[1] + fmla v12.8h, v3.8h, v0.h[2] + fmla v14.8h, v3.8h, v0.h[3] + prfm pldl1strm, [x11, #632] + ld1 {v6.8h}, [x11], #16 + fmla v9.8h, v4.8h, v0.h[0] + fmla v11.8h, v4.8h, v0.h[1] + fmla v13.8h, v4.8h, v0.h[2] + fmla v15.8h, v4.8h, v0.h[3] + + fmla v8.8h, v5.8h, v1.h[0] + fmla v10.8h, v5.8h, v1.h[1] + fmla v12.8h, v5.8h, v1.h[2] + fmla v14.8h, v5.8h, v1.h[3] + prfm pldl1strm, [x11, #632] + ld1 {v3.8h}, [x11], #16 + fmla v9.8h, v6.8h, v1.h[0] + fmla v11.8h, v6.8h, v1.h[1] + fmla v13.8h, v6.8h, v1.h[2] + fmla v15.8h, v6.8h, v1.h[3] + ld1 {v0.4h}, [x10], #8 + + subs x14, x14, #2 + bgt Compute4x16 + Compute4x16End: + cbnz x14, Compute4x16End1 + prfm pldl1keep, [x10, #632] + ld1 {v1.4h}, [x10] + ld1 {v4.8h}, [x11], #16 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v0.h[1] + fmla v12.8h, v3.8h, v0.h[2] + fmla v14.8h, v3.8h, v0.h[3] + prfm pldl1strm, [x11, #632] + ld1 {v3.8h}, [x11], #16 + fmla v9.8h, v4.8h, v0.h[0] + fmla v11.8h, v4.8h, v0.h[1] + fmla v13.8h, v4.8h, v0.h[2] + fmla v15.8h, v4.8h, v0.h[3] + mov v0.8b, v1.8b + Compute4x16End1: + ld1 {v4.8h}, [x11], #16 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v0.h[1] + fmla v12.8h, v3.8h, v0.h[2] + fmla v14.8h, v3.8h, v0.h[3] + fmla v9.8h, v4.8h, v0.h[0] + fmla v11.8h, v4.8h, v0.h[1] + fmla v13.8h, v4.8h, v0.h[2] + fmla v15.8h, v4.8h, v0.h[3] + ret + +Compute4x8Unit: + subs x14, x14, #2 + ble Compute4x8End + Compute4x8: + prfm pldl1keep, [x10, #632] + ld1 {v1.4h}, [x10], #8 + ld1 {v4.8h}, [x11], #16 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v0.h[1] + fmla v12.8h, v3.8h, v0.h[2] + fmla v14.8h, v3.8h, v0.h[3] + prfm pldl1strm, [x11, #632] + ld1 {v3.8h}, [x11], #16 + fmla v8.8h, v4.8h, v1.h[0] + fmla v10.8h, v4.8h, v1.h[1] + fmla v12.8h, v4.8h, v1.h[2] + fmla v14.8h, v4.8h, v1.h[3] + ld1 {v0.4h}, [x10], #8 + + subs x14, x14, #2 + bgt Compute4x8 + Compute4x8End: + cbnz x14, Compute4x8End1 + prfm pldl1keep, [x10, #632] + ld1 {v1.4h}, [x10] + ld1 {v4.8h}, [x11], #16 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v0.h[1] + fmla v12.8h, v3.8h, v0.h[2] + fmla v14.8h, v3.8h, v0.h[3] + mov v0.8b, v1.8b + mov v3.16b, v4.16b + Compute4x8End1: + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v0.h[1] + fmla v12.8h, v3.8h, v0.h[2] + fmla v14.8h, v3.8h, v0.h[3] + ret + +Compute4x4Unit: + subs x14, x14, #2 + ble Compute4x4End + Compute4x4: + prfm pldl1keep, [x10, #632] + ld1 {v1.4h}, [x10], #8 + ld1 {v4.4h}, [x11], #8 + fmla v8.4h, v3.4h, v0.h[0] + fmla v10.4h, v3.4h, v0.h[1] + fmla v12.4h, v3.4h, v0.h[2] + fmla v14.4h, v3.4h, v0.h[3] + prfm pldl1strm, [x11, #632] + ld1 {v3.4h}, [x11], #8 + fmla v8.4h, v4.4h, v1.h[0] + fmla v10.4h, v4.4h, v1.h[1] + fmla v12.4h, v4.4h, v1.h[2] + fmla v14.4h, v4.4h, v1.h[3] + ld1 {v0.4h}, [x10], #8 + + subs x14, x14, #2 + bgt Compute4x4 + Compute4x4End: + cbnz x14, Compute4x4End1 + prfm pldl1keep, [x10, #632] + ld1 {v1.4h}, [x10] + ld1 {v4.4h}, [x11], #8 + fmla v8.4h, v3.4h, v0.h[0] + fmla v10.4h, v3.4h, v0.h[1] + fmla v12.4h, v3.4h, v0.h[2] + fmla v14.4h, v3.4h, v0.h[3] + mov v0.8b, v1.8b + mov v3.8b, v4.8b + Compute4x4End1: + fmla v8.4h, v3.4h, v0.h[0] + fmla v10.4h, v3.4h, v0.h[1] + fmla v12.4h, v3.4h, v0.h[2] + fmla v14.4h, v3.4h, v0.h[3] + ret + +Compute3x16Unit: + add x19, x10, x16 + add x20, x10, x16, lsl #1 + subs x14, x14, #8 + blt Compute3x16End4 + Compute3x16: + ld1 {v0.8h}, [x10], #16 + ld1 {v1.8h}, [x19], #16 + ld1 {v2.8h}, [x20], #16 + prfm pldl1strm, [x11, #632] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla 
v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v1.h[0] + fmla v12.8h, v3.8h, v2.h[0] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v9.8h, v4.8h, v0.h[0] + fmla v11.8h, v4.8h, v1.h[0] + fmla v13.8h, v4.8h, v2.h[0] + fmla v8.8h, v5.8h, v0.h[1] + fmla v10.8h, v5.8h, v1.h[1] + fmla v12.8h, v5.8h, v2.h[1] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v9.8h, v6.8h, v0.h[1] + fmla v11.8h, v6.8h, v1.h[1] + fmla v13.8h, v6.8h, v2.h[1] + fmla v8.8h, v3.8h, v0.h[2] + fmla v10.8h, v3.8h, v1.h[2] + fmla v12.8h, v3.8h, v2.h[2] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v9.8h, v4.8h, v0.h[2] + fmla v11.8h, v4.8h, v1.h[2] + fmla v13.8h, v4.8h, v2.h[2] + fmla v8.8h, v5.8h, v0.h[3] + fmla v10.8h, v5.8h, v1.h[3] + fmla v12.8h, v5.8h, v2.h[3] + prfm pldl1strm, [x11, #632] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v9.8h, v6.8h, v0.h[3] + fmla v11.8h, v6.8h, v1.h[3] + fmla v13.8h, v6.8h, v2.h[3] + + fmla v8.8h, v3.8h, v0.h[4] + fmla v10.8h, v3.8h, v1.h[4] + fmla v12.8h, v3.8h, v2.h[4] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v9.8h, v4.8h, v0.h[4] + fmla v11.8h, v4.8h, v1.h[4] + fmla v13.8h, v4.8h, v2.h[4] + fmla v8.8h, v5.8h, v0.h[5] + fmla v10.8h, v5.8h, v1.h[5] + fmla v12.8h, v5.8h, v2.h[5] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v9.8h, v6.8h, v0.h[5] + fmla v11.8h, v6.8h, v1.h[5] + fmla v13.8h, v6.8h, v2.h[5] + fmla v8.8h, v3.8h, v0.h[6] + fmla v10.8h, v3.8h, v1.h[6] + fmla v12.8h, v3.8h, v2.h[6] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v9.8h, v4.8h, v0.h[6] + fmla v11.8h, v4.8h, v1.h[6] + fmla v13.8h, v4.8h, v2.h[6] + fmla v8.8h, v5.8h, v0.h[7] + fmla v10.8h, v5.8h, v1.h[7] + fmla v12.8h, v5.8h, v2.h[7] + fmla v9.8h, v6.8h, v0.h[7] + fmla v11.8h, v6.8h, v1.h[7] + fmla v13.8h, v6.8h, v2.h[7] + + subs x14, x14, #8 + bge Compute3x16 + Compute3x16End4: + adds x14, x14, #8 + cbz x14, Compute3x16Return + subs x14, x14, #4 + blt Compute3x16EndTail + ld1 {v0.4h}, [x10], #8 + ld1 {v1.4h}, [x19], #8 + ld1 {v2.4h}, [x20], #8 + prfm pldl1strm, [x11, #632] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v1.h[0] + fmla v12.8h, v3.8h, v2.h[0] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v9.8h, v4.8h, v0.h[0] + fmla v11.8h, v4.8h, v1.h[0] + fmla v13.8h, v4.8h, v2.h[0] + fmla v8.8h, v5.8h, v0.h[1] + fmla v10.8h, v5.8h, v1.h[1] + fmla v12.8h, v5.8h, v2.h[1] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v9.8h, v6.8h, v0.h[1] + fmla v11.8h, v6.8h, v1.h[1] + fmla v13.8h, v6.8h, v2.h[1] + fmla v8.8h, v3.8h, v0.h[2] + fmla v10.8h, v3.8h, v1.h[2] + fmla v12.8h, v3.8h, v2.h[2] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v9.8h, v4.8h, v0.h[2] + fmla v11.8h, v4.8h, v1.h[2] + fmla v13.8h, v4.8h, v2.h[2] + fmla v8.8h, v5.8h, v0.h[3] + fmla v10.8h, v5.8h, v1.h[3] + fmla v12.8h, v5.8h, v2.h[3] + fmla v9.8h, v6.8h, v0.h[3] + fmla v11.8h, v6.8h, v1.h[3] + fmla v13.8h, v6.8h, v2.h[3] + subs x14, x14, #4 + Compute3x16EndTail: + adds x14, x14, #4 + cbz x14, Compute3x16Return + cmp x14, #1 + beq Compute3x16EndTail1 + cmp x14, #2 + beq Compute3x16EndTail2 + ld1 {v0.4h}, [x10] + ld1 {v1.4h}, [x19] + ld1 {v2.s}[0], [x20], #4 + ld1 {v2.h}[2], [x20] + prfm pldl1strm, [x11, #632] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v1.h[0] + fmla v12.8h, v3.8h, v2.h[0] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v9.8h, v4.8h, v0.h[0] + fmla v11.8h, v4.8h, v1.h[0] + fmla v13.8h, v4.8h, v2.h[0] + fmla v8.8h, v5.8h, v0.h[1] + fmla v10.8h, v5.8h, v1.h[1] + fmla v12.8h, v5.8h, v2.h[1] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v9.8h, v6.8h, v0.h[1] + fmla v11.8h, v6.8h, v1.h[1] + fmla v13.8h, v6.8h, v2.h[1] + fmla v8.8h, v3.8h, 
v0.h[2] + fmla v10.8h, v3.8h, v1.h[2] + fmla v12.8h, v3.8h, v2.h[2] + fmla v9.8h, v4.8h, v0.h[2] + fmla v11.8h, v4.8h, v1.h[2] + fmla v13.8h, v4.8h, v2.h[2] + b Compute3x16Return + Compute3x16EndTail2: + ld1 {v0.4h}, [x10] + ld1 {v1.4h}, [x19] + ld1 {v2.s}[0], [x20] + prfm pldl1strm, [x11, #632] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v1.h[0] + fmla v12.8h, v3.8h, v2.h[0] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v9.8h, v4.8h, v0.h[0] + fmla v11.8h, v4.8h, v1.h[0] + fmla v13.8h, v4.8h, v2.h[0] + fmla v8.8h, v5.8h, v0.h[1] + fmla v10.8h, v5.8h, v1.h[1] + fmla v12.8h, v5.8h, v2.h[1] + fmla v9.8h, v6.8h, v0.h[1] + fmla v11.8h, v6.8h, v1.h[1] + fmla v13.8h, v6.8h, v2.h[1] + b Compute3x16Return + Compute3x16EndTail1: + ld1 {v0.h}[0], [x10] + ld1 {v1.h}[0], [x19] + ld1 {v2.h}[0], [x20] + prfm pldl1strm, [x11, #632] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v1.h[0] + fmla v12.8h, v3.8h, v2.h[0] + fmla v9.8h, v4.8h, v0.h[0] + fmla v11.8h, v4.8h, v1.h[0] + fmla v13.8h, v4.8h, v2.h[0] + Compute3x16Return: + ret + +Compute3x8Unit: + add x19, x10, x16 + add x20, x10, x16, lsl #1 + subs x14, x14, #8 + blt Compute3x8End4 + Compute3x8: + ld1 {v0.8h}, [x10], #16 + ld1 {v1.8h}, [x19], #16 + ld1 {v2.8h}, [x20], #16 + prfm pldl1strm, [x11, #632] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v1.h[0] + fmla v12.8h, v3.8h, v2.h[0] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v8.8h, v4.8h, v0.h[1] + fmla v10.8h, v4.8h, v1.h[1] + fmla v12.8h, v4.8h, v2.h[1] + fmla v8.8h, v5.8h, v0.h[2] + fmla v10.8h, v5.8h, v1.h[2] + fmla v12.8h, v5.8h, v2.h[2] + prfm pldl1strm, [x11, #632] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v8.8h, v6.8h, v0.h[3] + fmla v10.8h, v6.8h, v1.h[3] + fmla v12.8h, v6.8h, v2.h[3] + fmla v8.8h, v3.8h, v0.h[4] + fmla v10.8h, v3.8h, v1.h[4] + fmla v12.8h, v3.8h, v2.h[4] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v8.8h, v4.8h, v0.h[5] + fmla v10.8h, v4.8h, v1.h[5] + fmla v12.8h, v4.8h, v2.h[5] + fmla v8.8h, v5.8h, v0.h[6] + fmla v10.8h, v5.8h, v1.h[6] + fmla v12.8h, v5.8h, v2.h[6] + fmla v8.8h, v6.8h, v0.h[7] + fmla v10.8h, v6.8h, v1.h[7] + fmla v12.8h, v6.8h, v2.h[7] + + subs x14, x14, #8 + bge Compute3x8 + Compute3x8End4: + adds x14, x14, #8 + cbz x14, Compute3x8Return + subs x14, x14, #4 + blt Compute3x8EndTail + ld1 {v0.4h}, [x10], #8 + ld1 {v1.4h}, [x19], #8 + ld1 {v2.4h}, [x20], #8 + prfm pldl1strm, [x11, #632] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v1.h[0] + fmla v12.8h, v3.8h, v2.h[0] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v8.8h, v4.8h, v0.h[1] + fmla v10.8h, v4.8h, v1.h[1] + fmla v12.8h, v4.8h, v2.h[1] + fmla v8.8h, v5.8h, v0.h[2] + fmla v10.8h, v5.8h, v1.h[2] + fmla v12.8h, v5.8h, v2.h[2] + fmla v8.8h, v6.8h, v0.h[3] + fmla v10.8h, v6.8h, v1.h[3] + fmla v12.8h, v6.8h, v2.h[3] + subs x14, x14, #4 + Compute3x8EndTail: + adds x14, x14, #4 + cbz x14, Compute3x8Return + cmp x14, #1 + beq Compute3x8EndTail1 + cmp x14, #2 + beq Compute3x8EndTail2 + ld1 {v0.4h}, [x10] + ld1 {v1.4h}, [x19] + ld1 {v2.s}[0], [x20], #4 + ld1 {v2.h}[2], [x20] + prfm pldl1strm, [x11, #632] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v1.h[0] + fmla v12.8h, v3.8h, v2.h[0] + ld1 {v5.8h}, [x11], #16 + fmla v8.8h, v4.8h, v0.h[1] + fmla v10.8h, v4.8h, v1.h[1] + fmla v12.8h, v4.8h, v2.h[1] + fmla v8.8h, v5.8h, v0.h[2] + fmla v10.8h, v5.8h, v1.h[2] + fmla v12.8h, v5.8h, v2.h[2] + b Compute3x8Return + Compute3x8EndTail2: + ld1 {v0.4h}, 
[x10] + ld1 {v1.4h}, [x19] + ld2 {v2.h, v3.h}[0], [x20] + prfm pldl1strm, [x11, #632] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v8.8h, v5.8h, v0.h[0] + fmla v10.8h, v5.8h, v1.h[0] + fmla v12.8h, v5.8h, v2.h[0] + fmla v8.8h, v6.8h, v0.h[1] + fmla v10.8h, v6.8h, v1.h[1] + fmla v12.8h, v6.8h, v3.h[0] + b Compute3x8Return + Compute3x8EndTail1: + ld1 {v0.h}[0], [x10] + ld1 {v1.h}[0], [x19] + ld1 {v2.h}[0], [x20] + prfm pldl1strm, [x11, #632] + ld1 {v3.8h}, [x11], #16 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v1.h[0] + fmla v12.8h, v3.8h, v2.h[0] + Compute3x8Return: + ret + +Compute3x4Unit: + add x19, x10, x16 + add x20, x10, x16, lsl #1 + subs x14, x14, #8 + blt Compute3x4End4 + Compute3x4: + ld1 {v0.8h}, [x10], #16 + ld1 {v1.8h}, [x19], #16 + ld1 {v2.8h}, [x20], #16 + prfm pldl1strm, [x11, #632] + ld1 {v3.4h, v4.4h}, [x11], #16 + fmla v8.4h, v3.4h, v0.h[0] + fmla v10.4h, v3.4h, v1.h[0] + fmla v12.4h, v3.4h, v2.h[0] + ld1 {v5.4h, v6.4h}, [x11], #16 + fmla v8.4h, v4.4h, v0.h[1] + fmla v10.4h, v4.4h, v1.h[1] + fmla v12.4h, v4.4h, v2.h[1] + fmla v8.4h, v5.4h, v0.h[2] + fmla v10.4h, v5.4h, v1.h[2] + fmla v12.4h, v5.4h, v2.h[2] + prfm pldl1strm, [x11, #632] + ld1 {v3.4h, v4.4h}, [x11], #16 + fmla v8.4h, v6.4h, v0.h[3] + fmla v10.4h, v6.4h, v1.h[3] + fmla v12.4h, v6.4h, v2.h[3] + fmla v8.4h, v3.4h, v0.h[4] + fmla v10.4h, v3.4h, v1.h[4] + fmla v12.4h, v3.4h, v2.h[4] + ld1 {v5.4h, v6.4h}, [x11], #16 + fmla v8.4h, v4.4h, v0.h[5] + fmla v10.4h, v4.4h, v1.h[5] + fmla v12.4h, v4.4h, v2.h[5] + fmla v8.4h, v5.4h, v0.h[6] + fmla v10.4h, v5.4h, v1.h[6] + fmla v12.4h, v5.4h, v2.h[6] + fmla v8.4h, v6.4h, v0.h[7] + fmla v10.4h, v6.4h, v1.h[7] + fmla v12.4h, v6.4h, v2.h[7] + + subs x14, x14, #8 + bge Compute3x4 + Compute3x4End4: + adds x14, x14, #8 + cbz x14, Compute3x4Return + subs x14, x14, #4 + blt Compute3x4EndTail + ld1 {v0.4h}, [x10], #8 + ld1 {v1.4h}, [x19], #8 + ld1 {v2.4h}, [x20], #8 + prfm pldl1strm, [x11, #632] + ld1 {v3.4h, v4.4h}, [x11], #16 + fmla v8.4h, v3.4h, v0.h[0] + fmla v10.4h, v3.4h, v1.h[0] + fmla v12.4h, v3.4h, v2.h[0] + ld1 {v5.4h, v6.4h}, [x11], #16 + fmla v8.4h, v4.4h, v0.h[1] + fmla v10.4h, v4.4h, v1.h[1] + fmla v12.4h, v4.4h, v2.h[1] + fmla v8.4h, v5.4h, v0.h[2] + fmla v10.4h, v5.4h, v1.h[2] + fmla v12.4h, v5.4h, v2.h[2] + fmla v8.4h, v6.4h, v0.h[3] + fmla v10.4h, v6.4h, v1.h[3] + fmla v12.4h, v6.4h, v2.h[3] + subs x14, x14, #4 + Compute3x4EndTail: + adds x14, x14, #4 + cbz x14, Compute3x4Return + cmp x14, #1 + beq Compute3x4EndTail1 + cmp x14, #2 + beq Compute3x4EndTail2 + ld1 {v0.4h}, [x10] + ld1 {v1.4h}, [x19] + ld1 {v2.s}[0], [x20], #4 + ld1 {v2.h}[2], [x20] + prfm pldl1strm, [x11, #632] + ld1 {v3.4h, v4.4h}, [x11], #16 + fmla v8.4h, v3.4h, v0.h[0] + fmla v10.4h, v3.4h, v1.h[0] + fmla v12.4h, v3.4h, v2.h[0] + ld1 {v5.4h}, [x11], #8 + fmla v8.4h, v4.4h, v0.h[1] + fmla v10.4h, v4.4h, v1.h[1] + fmla v12.4h, v4.4h, v2.h[1] + fmla v8.4h, v5.4h, v0.h[2] + fmla v10.4h, v5.4h, v1.h[2] + fmla v12.4h, v5.4h, v2.h[2] + b Compute3x4Return + Compute3x4EndTail2: + ld1 {v0.4h}, [x10] + ld1 {v1.4h}, [x19] + ld2 {v2.h, v3.h}[0], [x20] + prfm pldl1strm, [x11, #632] + ld1 {v5.4h, v6.4h}, [x11], #16 + fmla v8.4h, v5.4h, v0.h[0] + fmla v10.4h, v5.4h, v1.h[0] + fmla v12.4h, v5.4h, v2.h[0] + fmla v8.4h, v6.4h, v0.h[1] + fmla v10.4h, v6.4h, v1.h[1] + fmla v12.4h, v6.4h, v3.h[0] + b Compute3x4Return + Compute3x4EndTail1: + ld1 {v0.h}[0], [x10] + ld1 {v1.h}[0], [x19] + ld1 {v2.h}[0], [x20] + prfm pldl1strm, [x11, #632] + ld1 {v3.4h}, [x11], #8 + fmla v8.4h, v3.4h, v0.h[0] + fmla v10.4h, v3.4h, v1.h[0] 
+ fmla v12.4h, v3.4h, v2.h[0] + Compute3x4Return: + ret + +Compute2x16Unit: + add x19, x10, x16 + subs x14, x14, #8 + blt Compute2x16End4 + Compute2x16: + ld1 {v0.8h}, [x10], #16 + ld1 {v1.8h}, [x19], #16 + prfm pldl1strm, [x11, #632] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v1.h[0] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v9.8h, v4.8h, v0.h[0] + fmla v11.8h, v4.8h, v1.h[0] + fmla v8.8h, v5.8h, v0.h[1] + fmla v10.8h, v5.8h, v1.h[1] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v9.8h, v6.8h, v0.h[1] + fmla v11.8h, v6.8h, v1.h[1] + fmla v8.8h, v3.8h, v0.h[2] + fmla v10.8h, v3.8h, v1.h[2] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v9.8h, v4.8h, v0.h[2] + fmla v11.8h, v4.8h, v1.h[2] + fmla v8.8h, v5.8h, v0.h[3] + fmla v10.8h, v5.8h, v1.h[3] + prfm pldl1strm, [x11, #632] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v9.8h, v6.8h, v0.h[3] + fmla v11.8h, v6.8h, v1.h[3] + + fmla v8.8h, v3.8h, v0.h[4] + fmla v10.8h, v3.8h, v1.h[4] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v9.8h, v4.8h, v0.h[4] + fmla v11.8h, v4.8h, v1.h[4] + fmla v8.8h, v5.8h, v0.h[5] + fmla v10.8h, v5.8h, v1.h[5] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v9.8h, v6.8h, v0.h[5] + fmla v11.8h, v6.8h, v1.h[5] + fmla v8.8h, v3.8h, v0.h[6] + fmla v10.8h, v3.8h, v1.h[6] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v9.8h, v4.8h, v0.h[6] + fmla v11.8h, v4.8h, v1.h[6] + fmla v8.8h, v5.8h, v0.h[7] + fmla v10.8h, v5.8h, v1.h[7] + fmla v9.8h, v6.8h, v0.h[7] + fmla v11.8h, v6.8h, v1.h[7] + + subs x14, x14, #8 + bge Compute2x16 + Compute2x16End4: + adds x14, x14, #8 + cbz x14, Compute2x16Return + subs x14, x14, #4 + blt Compute2x16EndTail + ld1 {v0.4h}, [x10], #8 + ld1 {v1.4h}, [x19], #8 + prfm pldl1strm, [x11, #632] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v1.h[0] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v9.8h, v4.8h, v0.h[0] + fmla v11.8h, v4.8h, v1.h[0] + fmla v8.8h, v5.8h, v0.h[1] + fmla v10.8h, v5.8h, v1.h[1] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v9.8h, v6.8h, v0.h[1] + fmla v11.8h, v6.8h, v1.h[1] + fmla v8.8h, v3.8h, v0.h[2] + fmla v10.8h, v3.8h, v1.h[2] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v9.8h, v4.8h, v0.h[2] + fmla v11.8h, v4.8h, v1.h[2] + fmla v8.8h, v5.8h, v0.h[3] + fmla v10.8h, v5.8h, v1.h[3] + fmla v9.8h, v6.8h, v0.h[3] + fmla v11.8h, v6.8h, v1.h[3] + subs x14, x14, #4 + Compute2x16EndTail: + adds x14, x14, #4 + cbz x14, Compute2x16Return + cmp x14, #1 + beq Compute2x16EndTail1 + cmp x14, #2 + beq Compute2x16EndTail2 + ld1 {v0.4h}, [x10] + ld1 {v1.s}[0], [x19], #4 + ld1 {v1.h}[2], [x19] + prfm pldl1strm, [x11, #632] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v1.h[0] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v9.8h, v4.8h, v0.h[0] + fmla v11.8h, v4.8h, v1.h[0] + fmla v8.8h, v5.8h, v0.h[1] + fmla v10.8h, v5.8h, v1.h[1] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v9.8h, v6.8h, v0.h[1] + fmla v11.8h, v6.8h, v1.h[1] + fmla v8.8h, v3.8h, v0.h[2] + fmla v10.8h, v3.8h, v1.h[2] + fmla v9.8h, v4.8h, v0.h[2] + fmla v11.8h, v4.8h, v1.h[2] + b Compute2x16Return + Compute2x16EndTail2: + ld1 {v0.4h}, [x10] + ld2 {v1.h, v2.h}[0], [x19] + prfm pldl1strm, [x11, #632] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v1.h[0] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v9.8h, v4.8h, v0.h[0] + fmla v11.8h, v4.8h, v1.h[0] + fmla v8.8h, v5.8h, v0.h[1] + fmla v10.8h, v5.8h, v2.h[0] + fmla v9.8h, v6.8h, v0.h[1] + fmla v11.8h, v6.8h, v2.h[0] + b Compute2x16Return + Compute2x16EndTail1: + ld1 {v0.h}[0], [x10] + ld1 {v1.h}[0], [x19] 
+ prfm pldl1strm, [x11, #632] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v1.h[0] + fmla v9.8h, v4.8h, v0.h[0] + fmla v11.8h, v4.8h, v1.h[0] + Compute2x16Return: + ret + +Compute2x8Unit: + add x19, x10, x16 + subs x14, x14, #8 + blt Compute2x8End4 + Compute2x8: + ld1 {v0.8h}, [x10], #16 + ld1 {v1.8h}, [x19], #16 + prfm pldl1strm, [x11, #632] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v1.h[0] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v8.8h, v4.8h, v0.h[1] + fmla v10.8h, v4.8h, v1.h[1] + fmla v8.8h, v5.8h, v0.h[2] + fmla v10.8h, v5.8h, v1.h[2] + prfm pldl1strm, [x11, #632] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v8.8h, v6.8h, v0.h[3] + fmla v10.8h, v6.8h, v1.h[3] + fmla v8.8h, v3.8h, v0.h[4] + fmla v10.8h, v3.8h, v1.h[4] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v8.8h, v4.8h, v0.h[5] + fmla v10.8h, v4.8h, v1.h[5] + fmla v8.8h, v5.8h, v0.h[6] + fmla v10.8h, v5.8h, v1.h[6] + fmla v8.8h, v6.8h, v0.h[7] + fmla v10.8h, v6.8h, v1.h[7] + + subs x14, x14, #8 + bge Compute2x8 + Compute2x8End4: + adds x14, x14, #8 + cbz x14, Compute2x8Return + subs x14, x14, #4 + blt Compute2x8EndTail + ld1 {v0.4h}, [x10], #8 + ld1 {v1.4h}, [x19], #8 + prfm pldl1strm, [x11, #632] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v1.h[0] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v8.8h, v4.8h, v0.h[1] + fmla v10.8h, v4.8h, v1.h[1] + fmla v8.8h, v5.8h, v0.h[2] + fmla v10.8h, v5.8h, v1.h[2] + fmla v8.8h, v6.8h, v0.h[3] + fmla v10.8h, v6.8h, v1.h[3] + subs x14, x14, #4 + Compute2x8EndTail: + adds x14, x14, #4 + cbz x14, Compute2x8Return + cmp x14, #1 + beq Compute2x8EndTail1 + cmp x14, #2 + beq Compute2x8EndTail2 + ld1 {v0.4h}, [x10] + ld3 {v1.h, v2.h, v3.h}[0], [x19] + prfm pldl1strm, [x11, #632] + ld1 {v4.8h, v5.8h}, [x11], #32 + fmla v8.8h, v4.8h, v0.h[0] + fmla v10.8h, v4.8h, v1.h[0] + ld1 {v6.8h}, [x11], #16 + fmla v8.8h, v5.8h, v0.h[1] + fmla v10.8h, v5.8h, v2.h[0] + fmla v8.8h, v6.8h, v0.h[2] + fmla v10.8h, v6.8h, v3.h[0] + b Compute2x8Return + Compute2x8EndTail2: + ld1 {v0.4h}, [x10] + ld2 {v1.h, v2.h}[0], [x19] + prfm pldl1strm, [x11, #632] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v1.h[0] + fmla v8.8h, v4.8h, v0.h[1] + fmla v10.8h, v4.8h, v2.h[0] + b Compute2x8Return + Compute2x8EndTail1: + ld1 {v0.h}[0], [x10] + ld1 {v1.h}[0], [x19] + prfm pldl1strm, [x11, #632] + ld1 {v3.8h}, [x11], #16 + fmla v8.8h, v3.8h, v0.h[0] + fmla v10.8h, v3.8h, v1.h[0] + Compute2x8Return: + ret + +Compute2x4Unit: + add x19, x10, x16 + subs x14, x14, #8 + blt Compute2x4End4 + Compute2x4: + ld1 {v0.8h}, [x10], #16 + ld1 {v1.8h}, [x19], #16 + prfm pldl1strm, [x11, #632] + ld1 {v3.4h, v4.4h}, [x11], #16 + fmla v8.4h, v3.4h, v0.h[0] + fmla v10.4h, v3.4h, v1.h[0] + ld1 {v5.4h, v6.4h}, [x11], #16 + fmla v8.4h, v4.4h, v0.h[1] + fmla v10.4h, v4.4h, v1.h[1] + fmla v8.4h, v5.4h, v0.h[2] + fmla v10.4h, v5.4h, v1.h[2] + prfm pldl1strm, [x11, #632] + ld1 {v3.4h, v4.4h}, [x11], #16 + fmla v8.4h, v6.4h, v0.h[3] + fmla v10.4h, v6.4h, v1.h[3] + fmla v8.4h, v3.4h, v0.h[4] + fmla v10.4h, v3.4h, v1.h[4] + ld1 {v5.4h, v6.4h}, [x11], #16 + fmla v8.4h, v4.4h, v0.h[5] + fmla v10.4h, v4.4h, v1.h[5] + fmla v8.4h, v5.4h, v0.h[6] + fmla v10.4h, v5.4h, v1.h[6] + fmla v8.4h, v6.4h, v0.h[7] + fmla v10.4h, v6.4h, v1.h[7] + + subs x14, x14, #8 + bge Compute2x4 + Compute2x4End4: + adds x14, x14, #8 + cbz x14, Compute2x4Return + subs x14, x14, #4 + blt Compute2x4EndTail + ld1 {v0.4h}, [x10], #8 + ld1 {v1.4h}, [x19], #8 + 
prfm pldl1strm, [x11, #632] + ld1 {v3.4h, v4.4h}, [x11], #16 + fmla v8.4h, v3.4h, v0.h[0] + fmla v10.4h, v3.4h, v1.h[0] + ld1 {v5.4h, v6.4h}, [x11], #16 + fmla v8.4h, v4.4h, v0.h[1] + fmla v10.4h, v4.4h, v1.h[1] + fmla v8.4h, v5.4h, v0.h[2] + fmla v10.4h, v5.4h, v1.h[2] + fmla v8.4h, v6.4h, v0.h[3] + fmla v10.4h, v6.4h, v1.h[3] + subs x14, x14, #4 + Compute2x4EndTail: + adds x14, x14, #4 + cbz x14, Compute2x4Return + cmp x14, #1 + beq Compute2x4EndTail1 + cmp x14, #2 + beq Compute2x4EndTail2 + ld1 {v0.4h}, [x10] + ld3 {v1.h, v2.h, v3.h}[0], [x19] + prfm pldl1strm, [x11, #632] + ld1 {v4.4h, v5.4h}, [x11], #16 + fmla v8.4h, v4.4h, v0.h[0] + fmla v10.4h, v4.4h, v1.h[0] + ld1 {v6.4h}, [x11], #8 + fmla v8.4h, v5.4h, v0.h[1] + fmla v10.4h, v5.4h, v2.h[0] + fmla v8.4h, v6.4h, v0.h[2] + fmla v10.4h, v6.4h, v3.h[0] + b Compute2x4Return + Compute2x4EndTail2: + ld1 {v0.4h}, [x10] + ld2 {v1.h, v2.h}[0], [x19] + prfm pldl1strm, [x11, #632] + ld1 {v3.4h, v4.4h}, [x11], #16 + fmla v8.4h, v3.4h, v0.h[0] + fmla v10.4h, v3.4h, v1.h[0] + fmla v8.4h, v4.4h, v0.h[1] + fmla v10.4h, v4.4h, v2.h[0] + b Compute2x4Return + Compute2x4EndTail1: + ld1 {v0.h}[0], [x10] + ld1 {v1.h}[0], [x19] + prfm pldl1strm, [x11, #632] + ld1 {v3.4h}, [x11], #8 + fmla v8.4h, v3.4h, v0.h[0] + fmla v10.4h, v3.4h, v1.h[0] + Compute2x4Return: + ret + +Compute1x16Unit: + subs x14, x14, #8 + blt Compute1x16End4 + Compute1x16: + ld1 {v0.8h}, [x10], #16 + prfm pldl1strm, [x11, #632] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v8.8h, v3.8h, v0.h[0] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v9.8h, v4.8h, v0.h[0] + fmla v8.8h, v5.8h, v0.h[1] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v9.8h, v6.8h, v0.h[1] + fmla v8.8h, v3.8h, v0.h[2] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v9.8h, v4.8h, v0.h[2] + fmla v8.8h, v5.8h, v0.h[3] + prfm pldl1strm, [x11, #632] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v9.8h, v6.8h, v0.h[3] + + fmla v8.8h, v3.8h, v0.h[4] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v9.8h, v4.8h, v0.h[4] + fmla v8.8h, v5.8h, v0.h[5] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v9.8h, v6.8h, v0.h[5] + fmla v8.8h, v3.8h, v0.h[6] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v9.8h, v4.8h, v0.h[6] + fmla v8.8h, v5.8h, v0.h[7] + fmla v9.8h, v6.8h, v0.h[7] + + subs x14, x14, #8 + bge Compute1x16 + Compute1x16End4: + adds x14, x14, #8 + cbz x14, Compute1x16Return + subs x14, x14, #4 + blt Compute1x16EndTail + ld1 {v0.4h}, [x10], #8 + prfm pldl1strm, [x11, #632] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v8.8h, v3.8h, v0.h[0] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v9.8h, v4.8h, v0.h[0] + fmla v8.8h, v5.8h, v0.h[1] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v9.8h, v6.8h, v0.h[1] + fmla v8.8h, v3.8h, v0.h[2] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v9.8h, v4.8h, v0.h[2] + fmla v8.8h, v5.8h, v0.h[3] + fmla v9.8h, v6.8h, v0.h[3] + subs x14, x14, #4 + Compute1x16EndTail: + adds x14, x14, #4 + cbz x14, Compute1x16Return + cmp x14, #1 + beq Compute1x16EndTail1 + cmp x14, #2 + beq Compute1x16EndTail2 + ld3 {v0.h, v1.h, v2.h}[0], [x10] + prfm pldl1strm, [x11, #632] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v8.8h, v3.8h, v0.h[0] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v9.8h, v4.8h, v0.h[0] + fmla v8.8h, v5.8h, v1.h[0] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v9.8h, v6.8h, v1.h[0] + fmla v8.8h, v3.8h, v2.h[0] + fmla v9.8h, v4.8h, v2.h[0] + b Compute1x16Return + Compute1x16EndTail2: + ld2 {v0.h, v1.h}[0], [x10] + prfm pldl1strm, [x11, #632] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v8.8h, v3.8h, v0.h[0] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v9.8h, v4.8h, v0.h[0] + fmla v8.8h, v5.8h, v1.h[0] + 
fmla v9.8h, v6.8h, v1.h[0] + b Compute1x16Return + Compute1x16EndTail1: + ld1 {v0.h}[0], [x10] + prfm pldl1strm, [x11, #632] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v8.8h, v3.8h, v0.h[0] + fmla v9.8h, v4.8h, v0.h[0] + Compute1x16Return: + ret + +Compute1x8Unit: + subs x14, x14, #8 + blt Compute1x8End4 + Compute1x8: + ld1 {v0.8h}, [x10], #16 + prfm pldl1strm, [x11, #632] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v8.8h, v3.8h, v0.h[0] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v8.8h, v4.8h, v0.h[1] + fmla v8.8h, v5.8h, v0.h[2] + prfm pldl1strm, [x11, #632] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v8.8h, v6.8h, v0.h[3] + fmla v8.8h, v3.8h, v0.h[4] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v8.8h, v4.8h, v0.h[5] + fmla v8.8h, v5.8h, v0.h[6] + fmla v8.8h, v6.8h, v0.h[7] + + subs x14, x14, #8 + bge Compute1x8 + Compute1x8End4: + adds x14, x14, #8 + cbz x14, Compute1x8Return + subs x14, x14, #4 + blt Compute1x8EndTail + ld1 {v0.4h}, [x10], #8 + prfm pldl1strm, [x11, #632] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v8.8h, v3.8h, v0.h[0] + ld1 {v5.8h, v6.8h}, [x11], #32 + fmla v8.8h, v4.8h, v0.h[1] + fmla v8.8h, v5.8h, v0.h[2] + fmla v8.8h, v6.8h, v0.h[3] + subs x14, x14, #4 + Compute1x8EndTail: + adds x14, x14, #4 + cbz x14, Compute1x8Return + cmp x14, #1 + beq Compute1x8EndTail1 + cmp x14, #2 + beq Compute1x8EndTail2 + ld3 {v0.h, v1.h, v2.h}[0], [x10] + prfm pldl1strm, [x11, #632] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v8.8h, v3.8h, v0.h[0] + ld1 {v5.8h}, [x11], #16 + fmla v8.8h, v4.8h, v1.h[0] + fmla v8.8h, v5.8h, v2.h[0] + b Compute1x8Return + Compute1x8EndTail2: + ld2 {v0.h, v1.h}[0], [x10] + prfm pldl1strm, [x11, #632] + ld1 {v3.8h, v4.8h}, [x11], #32 + fmla v8.8h, v3.8h, v0.h[0] + fmla v8.8h, v4.8h, v1.h[0] + b Compute1x8Return + Compute1x8EndTail1: + ld1 {v0.h}[0], [x10] + prfm pldl1strm, [x11, #632] + ld1 {v3.8h}, [x11], #16 + fmla v8.8h, v3.8h, v0.h[0] + Compute1x8Return: + ret + +Compute1x4Unit: + subs x14, x14, #8 + blt Compute1x4End4 + Compute1x4: + ld1 {v0.8h}, [x10], #16 + prfm pldl1strm, [x11, #632] + ld1 {v3.4h, v4.4h}, [x11], #16 + fmla v8.4h, v3.4h, v0.h[0] + ld1 {v5.4h, v6.4h}, [x11], #16 + fmla v8.4h, v4.4h, v0.h[1] + fmla v8.4h, v5.4h, v0.h[2] + prfm pldl1strm, [x11, #632] + ld1 {v3.4h, v4.4h}, [x11], #16 + fmla v8.4h, v6.4h, v0.h[3] + fmla v8.4h, v3.4h, v0.h[4] + ld1 {v5.4h, v6.4h}, [x11], #16 + fmla v8.4h, v4.4h, v0.h[5] + fmla v8.4h, v5.4h, v0.h[6] + fmla v8.4h, v6.4h, v0.h[7] + + subs x14, x14, #8 + bge Compute1x4 + Compute1x4End4: + adds x14, x14, #8 + cbz x14, Compute1x4Return + subs x14, x14, #4 + blt Compute1x4EndTail + ld1 {v0.4h}, [x10], #8 + prfm pldl1strm, [x11, #632] + ld1 {v3.4h, v4.4h}, [x11], #16 + fmla v8.4h, v3.4h, v0.h[0] + ld1 {v5.4h, v6.4h}, [x11], #16 + fmla v8.4h, v4.4h, v0.h[1] + fmla v8.4h, v5.4h, v0.h[2] + fmla v8.4h, v6.4h, v0.h[3] + subs x14, x14, #4 + Compute1x4EndTail: + adds x14, x14, #4 + cbz x14, Compute1x4Return + cmp x14, #1 + beq Compute1x4EndTail1 + cmp x14, #2 + beq Compute1x4EndTail2 + ld3 {v0.h, v1.h, v2.h}[0], [x10] + prfm pldl1strm, [x11, #632] + ld1 {v3.4h, v4.4h}, [x11], #16 + fmla v8.4h, v3.4h, v0.h[0] + ld1 {v5.4h}, [x11], #8 + fmla v8.4h, v4.4h, v1.h[0] + fmla v8.4h, v5.4h, v2.h[0] + b Compute1x4Return + Compute1x4EndTail2: + ld2 {v0.h, v1.h}[0], [x10] + prfm pldl1strm, [x11, #632] + ld1 {v3.4h, v4.4h}, [x11], #16 + fmla v8.4h, v3.4h, v0.h[0] + fmla v8.4h, v4.4h, v1.h[0] + b Compute1x4Return + Compute1x4EndTail1: + ld1 {v0.h}[0], [x10] + prfm pldl1strm, [x11, #632] + ld1 {v3.4h}, [x11], #8 + fmla v8.4h, v3.4h, v0.h[0] + Compute1x4Return: + 
ret + +End: + sub sp, sp, #192 + ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64 + ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64 + ldp x19, x20, [sp], #16 + ldp x21, x22, [sp], #16 + ldp x23, x24, [sp], #16 + ldp x29, x30, [sp], #16 + ret +#endif diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/custom_gather_d_grad_v2_parameter.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/custom_gather_d_grad_v2_parameter.h new file mode 100644 index 00000000..541c7ff1 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/custom_gather_d_grad_v2_parameter.h @@ -0,0 +1,28 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_NNACL_CUSTOM_GATHER_D_GRAD_V2_PARAMETER_H_ +#define MINDSPORE_NNACL_CUSTOM_GATHER_D_GRAD_V2_PARAMETER_H_ + +#include "nnacl/op_base.h" + +typedef struct CustomGatherGradV2Parameter { + // Primitive parameter + OpParameter op_parameter_; + // shape correlative + int dim; +} CustomGatherGradV2Parameter; + +#endif // MINDSPORE_NNACL_CUSTOM_GATHER_D_GRAD_V2_PARAMETER_H_ diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/custom_gru_fp16.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/custom_gru_fp16.c index 6e754569..72391811 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/custom_gru_fp16.c +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/custom_gru_fp16.c @@ -35,13 +35,13 @@ void CustomGruFp16(float16_t *output, const float16_t *input, const float16_t *w float16_t *hidden_gate = buffer[C3NUM]; for (int i = 0; i < num_step; ++i) { if (batch_size != 1) { - RowMajor2ColNMajorFp16(input + i * batch_size * input_size, buffer[0], batch_size, input_size); + RowMajor2ColNMajorFp16(input + i * batch_size * input_size, buffer[0], batch_size, input_size, false); for (int j = 0; j < C3NUM; ++j) { MatmulBaseFp16Neon(buffer[0], weight_input + j * weight_in_offset, input_gate + j * output_size, bias_input + j * col_align, ActType_No, input_size, batch_size, hidden_size, hidden_size, OutType_Nhwc); } - RowMajor2ColNMajorFp16(init_h, buffer[C2NUM], batch_size, hidden_size); + RowMajor2ColNMajorFp16(init_h, buffer[C2NUM], batch_size, hidden_size, false); for (int j = 0; j < C3NUM; ++j) { MatmulBaseFp16Neon(buffer[C2NUM], weight_hidden + j * weight_hidden_offset, hidden_gate + j * output_size, bias_hidden + j * col_align, ActType_No, hidden_size, batch_size, hidden_size, hidden_size, diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/exp_fp16.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/exp_fp16.c index d1555953..93f005c8 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/exp_fp16.c +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/exp_fp16.c @@ -20,8 +20,10 @@ #if defined(ENABLE_NEON) static inline void simd_exp_fp16(float16x8_t input, float16_t *dst) { - static float16x8_t maxv = {88.0f, 88.0f, 88.0f, 88.0f, 88.0f, 88.0f, 88.0f, 88.0f}; - static float16x8_t minv = {-88.0f, -88.0f, -88.0f, -88.0f, -88.0f, 
-88.0f, -88.0f, -88.0f}; + static float16x8_t maxv = {88.72283935546875f, 88.72283935546875f, 88.72283935546875f, 88.72283935546875f, + 88.72283935546875f, 88.72283935546875f, 88.72283935546875f, 88.72283935546875f}; + static float16x8_t minv = {-87.3365478515625f, -87.3365478515625f, -87.3365478515625f, -87.3365478515625f, + -87.3365478515625f, -87.3365478515625f, -87.3365478515625f, -87.3365478515625f}; input = vmaxq_f16(minv, vminq_f16(input, maxv)); vst1q_f16(dst, VexpFp16(input)); } diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/lstm_fp16.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/lstm_fp16.c index 813237fa..614842a1 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/lstm_fp16.c +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/lstm_fp16.c @@ -23,28 +23,38 @@ #include "nnacl/fp16/cast_fp16.h" #include "nnacl/intrinsics/ms_simd_instructions_fp16.h" -void PackLstmWeightFp32ToFp16(float16_t *dst, const float *src, int batch, int deep, int col, int col_align) { +void PackLstmWeightFp32ToFp16(float16_t *dst, const float *src, int batch, int deep, int col, int col_align, + const int32_t *order) { for (int i = 0; i < batch; i++) { const float *src_batch = src + i * col * deep; - float16_t *dst_batch = dst + i * col_align * deep; + float16_t *dst_batch = dst + (order == NULL ? i : order[i]) * col_align * deep; +#ifdef ENABLE_ARM64 + RowMajor2ColNMajorFp16(src_batch, dst_batch, col, deep, true); +#else RowMajor2Col8MajorFp16(src_batch, dst_batch, col, deep, true); +#endif } } -void PackLstmWeightFp16(float16_t *dst, const float16_t *src, int batch, int deep, int col, int col_align) { +void PackLstmWeightFp16(float16_t *dst, const float16_t *src, int batch, int deep, int col, int col_align, + const int32_t *order) { for (int i = 0; i < batch; i++) { const float16_t *src_batch = src + i * col * deep; - float16_t *dst_batch = dst + i * col_align * deep; + float16_t *dst_batch = dst + (order == NULL ? i : order[i]) * col_align * deep; +#ifdef ENABLE_ARM64 + RowMajor2ColNMajorFp16(src_batch, dst_batch, col, deep, false); +#else RowMajor2Col8MajorFp16(src_batch, dst_batch, col, deep, false); +#endif } } -void PackLstmBiasFp32ToFp16(float16_t *dst, const float *src, int batch, int col, int col_align, - bool is_bidirectional) { +void PackLstmBiasFp32ToFp16(float16_t *dst, const float *src, int batch, int col, int col_align, bool is_bidirectional, + const int32_t *order) { int unidirectional_batch = is_bidirectional ? batch / 2 : batch; for (int i = 0; i < unidirectional_batch; i++) { const float *src_batch = src + i * col; - float16_t *dst_batch = dst + i * col_align; + float16_t *dst_batch = dst + (order == NULL ? i : order[i]) * col_align; Float32ToFloat16(src_batch, dst_batch, col); } if (is_bidirectional) { @@ -52,17 +62,18 @@ void PackLstmBiasFp32ToFp16(float16_t *dst, const float *src, int batch, int col float16_t *backward_dst = dst + unidirectional_batch * col_align; for (int i = 0; i < unidirectional_batch; i++) { const float *backward_src_batch = backward_src + i * col; - float16_t *backward_dst_batch = backward_dst + i * col_align; + float16_t *backward_dst_batch = backward_dst + (order == NULL ? 
i : order[i]) * col_align; Float32ToFloat16(backward_src_batch, backward_dst_batch, col); } } } -void PackLstmBiasFp16(float16_t *dst, const float16_t *src, int batch, int col, int col_align, bool is_bidirectional) { +void PackLstmBiasFp16(float16_t *dst, const float16_t *src, int batch, int col, int col_align, bool is_bidirectional, + const int32_t *order) { int unidirectional_batch = is_bidirectional ? batch / 2 : batch; for (int i = 0; i < unidirectional_batch; i++) { const float16_t *src_batch = src + i * col; - float16_t *dst_batch = dst + i * col_align; + float16_t *dst_batch = dst + (order == NULL ? i : order[i]) * col_align; (void)memcpy(dst_batch, src_batch, col * sizeof(float16_t)); } if (is_bidirectional) { @@ -70,7 +81,7 @@ void PackLstmBiasFp16(float16_t *dst, const float16_t *src, int batch, int col, float16_t *backward_dst = dst + unidirectional_batch * col_align; for (int i = 0; i < unidirectional_batch; i++) { const float16_t *backward_src_batch = backward_src + i * col; - float16_t *backward_dst_batch = backward_dst + i * col_align; + float16_t *backward_dst_batch = backward_dst + (order == NULL ? i : order[i]) * col_align; (void)memcpy(backward_dst_batch, backward_src_batch, col * sizeof(float16_t)); } } @@ -152,13 +163,13 @@ void UpdateOutputFp16(float16_t *hidden_state, float16_t *output, const float16_ const LstmParameter *lstm_param) { int batch = lstm_param->batch_; int hidden_size = lstm_param->hidden_size_; - int project_size = lstm_param->project_size_; + int output_size = lstm_param->output_size_; float16_t *state_buffer = buffer[C5NUM]; float16_t *hidden_buffer = weight_project ? buffer[C3NUM] : hidden_state; float16_t zoneout = lstm_param->zoneout_hidden_; if (!(zoneout >= -FLT_EPSILON && zoneout <= FLT_EPSILON)) { - (void)memcpy(state_buffer, hidden_state, batch * project_size * sizeof(float16_t)); - ElementOptMulFp16(state_buffer, &zoneout, state_buffer, batch * project_size, false); + (void)memcpy(state_buffer, hidden_state, batch * output_size * sizeof(float16_t)); + ElementOptMulFp16(state_buffer, &zoneout, state_buffer, batch * output_size, false); } TanhFp16(cell_state, hidden_buffer, batch * hidden_size); @@ -166,19 +177,32 @@ void UpdateOutputFp16(float16_t *hidden_state, float16_t *output, const float16_ if (weight_project) { float16_t *left_matrix = hidden_buffer; +#ifdef ENABLE_ARM64 + if (batch >= C4NUM) { + left_matrix = buffer[C6NUM]; + RowMajor2ColLadder12MajorFp16(hidden_buffer, left_matrix, batch, hidden_size); + } +#else if (batch != 1) { left_matrix = buffer[C6NUM]; RowMajor2Col16MajorFp16(hidden_buffer, left_matrix, batch, hidden_size, false); } - LstmMatMulFp16(hidden_state, left_matrix, weight_project, project_bias, batch, hidden_size, project_size, +#endif + LstmMatMulFp16(hidden_state, left_matrix, weight_project, project_bias, batch, hidden_size, output_size, batch == 1); } if (!(zoneout >= -FLT_EPSILON && zoneout <= FLT_EPSILON)) { - ElementOptMulAccFp16(hidden_state, 1 - zoneout, state_buffer, batch * project_size); + ElementOptMulAccFp16(hidden_state, 1 - zoneout, state_buffer, batch * output_size); } - (void)memcpy(output, hidden_state, batch * project_size * sizeof(float16_t)); + (void)memcpy(output, hidden_state, batch * output_size * sizeof(float16_t)); } +#ifdef ENABLE_ARM64 +void LstmMatMulFp16(float16_t *c, const float16_t *a, const float16_t *b, const float16_t *bias, int row, int deep, + int col, bool is_vec) { + MatmulFp16OptV2(a, b, c, bias, ActType_No, deep, row, col, col, OutType_Nhwc); +} +#else void 
LstmMatMulFp16(float16_t *c, const float16_t *a, const float16_t *b, const float16_t *bias, int row, int deep, int col, bool is_vec) { if (is_vec) { @@ -188,11 +212,12 @@ void LstmMatMulFp16(float16_t *c, const float16_t *a, const float16_t *b, const MatMulFp16(a, b, c, bias, ActType_No, deep, row, col, col, OutType_Nhwc); } } +#endif void UpdateLstmGateFp16(float16_t *gate_buffer, const float16_t *input, const float16_t *weight, const float16_t *bias, int row, int deep, int col, int col_align, bool is_vec) { for (int i = 0; i < 4; i++) { - const float16_t *weight_i = weight + deep * col * i; + const float16_t *weight_i = weight + deep * col_align * i; const float16_t *bias_i = bias + col_align * i; float16_t *gate = gate_buffer + row * col * i; LstmMatMulFp16(gate, input, weight_i, bias_i, row, deep, col, is_vec); @@ -207,16 +232,26 @@ void LstmStepUnitFp16(float16_t *output, float16_t *input_gate, float16_t *forge float16_t *state_gate = buffer[C3NUM]; float16_t *cell_buffer = buffer[C4NUM]; float16_t *hidden_buffer = buffer[C5NUM]; +#ifdef ENABLE_ARM64 + if (lstm_param->batch_ <= C3NUM) { + UpdateLstmGateFp16(state_gate, hidden_state, state_weight, state_bias, lstm_param->batch_, lstm_param->output_size_, + lstm_param->hidden_size_, lstm_param->state_col_align_, false); + } else { + RowMajor2ColLadder12MajorFp16(hidden_state, packed_state, lstm_param->batch_, lstm_param->output_size_); + UpdateLstmGateFp16(state_gate, packed_state, state_weight, state_bias, lstm_param->batch_, lstm_param->output_size_, + lstm_param->hidden_size_, lstm_param->state_col_align_, false); + } +#else bool is_vec = lstm_param->batch_ == 1; if (is_vec) { - UpdateLstmGateFp16(state_gate, hidden_state, state_weight, state_bias, lstm_param->batch_, - lstm_param->project_size_, lstm_param->hidden_size_, lstm_param->state_col_align_, is_vec); + UpdateLstmGateFp16(state_gate, hidden_state, state_weight, state_bias, lstm_param->batch_, lstm_param->output_size_, + lstm_param->hidden_size_, lstm_param->state_col_align_, is_vec); } else { - // pack state for matmul - RowMajor2Col16MajorFp16(hidden_state, packed_state, lstm_param->batch_, lstm_param->project_size_, false); - UpdateLstmGateFp16(state_gate, packed_state, state_weight, state_bias, lstm_param->batch_, - lstm_param->project_size_, lstm_param->hidden_size_, lstm_param->state_col_align_, is_vec); + RowMajor2Col16MajorFp16(hidden_state, packed_state, lstm_param->batch_, lstm_param->output_size_, false); + UpdateLstmGateFp16(state_gate, packed_state, state_weight, state_bias, lstm_param->batch_, lstm_param->output_size_, + lstm_param->hidden_size_, lstm_param->state_col_align_, is_vec); } +#endif ElementAddFp16(input_gate, state_gate, input_gate, lstm_param->batch_ * lstm_param->hidden_size_); ElementAddFp16(forget_gate, state_gate + lstm_param->batch_ * lstm_param->hidden_size_ * 2, forget_gate, lstm_param->batch_ * lstm_param->hidden_size_); @@ -247,24 +282,43 @@ void LstmStepUnitFp16(float16_t *output, float16_t *input_gate, float16_t *forge } if (!(lstm_param->zoneout_hidden_ >= -FLT_EPSILON && lstm_param->zoneout_hidden_ <= FLT_EPSILON)) { - (void)memcpy(hidden_state, hidden_buffer, lstm_param->batch_ * lstm_param->project_size_ * sizeof(float16_t)); + (void)memcpy(hidden_state, hidden_buffer, lstm_param->batch_ * lstm_param->output_size_ * sizeof(float16_t)); } } -void LstmUnidirectionalFp16(float16_t *output, const float16_t *packed_input, const float16_t *weight_i, - const float16_t *weight_h, const float16_t *input_bias, const float16_t *state_bias, - 
const float16_t *weight_project, const float16_t *project_bias, float16_t *hidden_state, - float16_t *cell_state, float16_t *buffer[C7NUM], const LstmParameter *lstm_param, - bool is_backward) { - float16_t *gate = buffer[1]; +#ifdef ENABLE_ARM64 +void LstmGateCompute(float16_t *gate, const float16_t *input, const float16_t *weight_i, const float16_t *input_bias, + const LstmParameter *lstm_param) { + int row_input = lstm_param->seq_len_ * lstm_param->batch_; + for (int i = 0; i < C4NUM; i++) { + const float16_t *weight_loop = weight_i + lstm_param->input_size_ * lstm_param->input_col_align_ * i; + const float16_t *bias_loop = input_bias + lstm_param->input_col_align_ * i; + float16_t *gate_loop = gate + lstm_param->seq_len_ * lstm_param->batch_ * lstm_param->hidden_size_ * i; + MatmulFp16OptV2(input, weight_loop, gate_loop, bias_loop, ActType_No, lstm_param->input_size_, row_input, + lstm_param->hidden_size_, lstm_param->hidden_size_, OutType_Nhwc); + } +} +#else +void LstmGateCompute(float16_t *gate, const float16_t *input, const float16_t *weight_i, const float16_t *input_bias, + const LstmParameter *lstm_param) { for (int i = 0; i < C4NUM; i++) { const float16_t *weight_loop = weight_i + lstm_param->input_size_ * lstm_param->input_col_align_ * i; const float16_t *bias_loop = input_bias + lstm_param->input_col_align_ * i; float16_t *gate_loop = gate + lstm_param->seq_len_ * lstm_param->batch_ * lstm_param->hidden_size_ * i; - MatMulFp16(packed_input, weight_loop, gate_loop, bias_loop, ActType_No, lstm_param->input_size_, + MatMulFp16(input, weight_loop, gate_loop, bias_loop, ActType_No, lstm_param->input_size_, lstm_param->seq_len_ * lstm_param->batch_, lstm_param->hidden_size_, lstm_param->hidden_size_, OutType_Nhwc); } +} +#endif + +void LstmUnidirectionalFp16(float16_t *output, const float16_t *packed_input, const float16_t *weight_i, + const float16_t *weight_h, const float16_t *input_bias, const float16_t *state_bias, + const float16_t *weight_project, const float16_t *project_bias, float16_t *hidden_state, + float16_t *cell_state, float16_t *buffer[C7NUM], const LstmParameter *lstm_param, + bool is_backward) { + float16_t *gate = buffer[1]; + LstmGateCompute(gate, packed_input, weight_i, input_bias, lstm_param); float16_t *input_gate = gate; float16_t *forget_gate = gate + lstm_param->seq_len_ * lstm_param->batch_ * lstm_param->hidden_size_ * 2; @@ -287,26 +341,33 @@ void LstmFp16(float16_t *output, const float16_t *input, const float16_t *weight const float16_t *project_bias, float16_t *hidden_state, float16_t *cell_state, float16_t *buffer[C7NUM], const LstmParameter *lstm_param) { // forward +#ifdef ENABLE_ARM64 + const float16_t *packed_input = input; + if (lstm_param->batch_ * lstm_param->seq_len_ >= C4NUM) { + float16_t *temp_input = buffer[0]; + RowMajor2ColLadder12MajorFp16(input, temp_input, lstm_param->seq_len_ * lstm_param->batch_, + lstm_param->input_size_); + packed_input = temp_input; + } +#else float16_t *packed_input = buffer[0]; RowMajor2Col16MajorFp16(input, packed_input, lstm_param->seq_len_ * lstm_param->batch_, lstm_param->input_size_, false); +#endif LstmUnidirectionalFp16(output, packed_input, weight_i, weight_h, input_bias, state_bias, weight_project, project_bias, hidden_state, cell_state, buffer, lstm_param, false); // backward if (lstm_param->bidirectional_) { const float16_t *backward_weight_i = weight_i + 4 * lstm_param->input_col_align_ * lstm_param->input_size_; - const float16_t *backward_weight_h = weight_h + 4 * lstm_param->state_col_align_ * 
lstm_param->hidden_size_; + const float16_t *backward_weight_h = weight_h + 4 * lstm_param->state_col_align_ * lstm_param->output_size_; const float16_t *backward_input_bias = input_bias + 4 * lstm_param->input_col_align_; const float16_t *backward_state_bias = state_bias + 4 * lstm_param->state_col_align_; const float16_t *backward_weight_project = - weight_project ? weight_project + lstm_param->hidden_size_ * (lstm_param->batch_ == 1 - ? lstm_param->project_size_ - : UP_ROUND(lstm_param->project_size_, C8NUM)) - : NULL; - float16_t *backward_output = output + lstm_param->batch_ * lstm_param->hidden_size_; + weight_project ? weight_project + lstm_param->hidden_size_ * lstm_param->proj_col_align_ : NULL; + float16_t *backward_output = output + lstm_param->batch_ * lstm_param->output_size_; float16_t *backward_cell_state = cell_state + lstm_param->batch_ * lstm_param->hidden_size_; - float16_t *backward_hidden_state = hidden_state + lstm_param->batch_ * lstm_param->hidden_size_; + float16_t *backward_hidden_state = hidden_state + lstm_param->batch_ * lstm_param->output_size_; LstmUnidirectionalFp16(backward_output, packed_input, backward_weight_i, backward_weight_h, backward_input_bias, backward_state_bias, backward_weight_project, project_bias, backward_hidden_state, diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/lstm_fp16.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/lstm_fp16.h index f6f853b4..d6af9c78 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/lstm_fp16.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/lstm_fp16.h @@ -21,13 +21,17 @@ #ifdef __cplusplus extern "C" { #endif -void PackLstmWeightFp32ToFp16(float16_t *dst, const float *src, int batch, int deep, int col, int col_align); +void PackLstmWeightFp32ToFp16(float16_t *dst, const float *src, int batch, int deep, int col, int col_align, + const int32_t *order); -void PackLstmWeightFp16(float16_t *dst, const float16_t *src, int batch, int deep, int col, int col_align); +void PackLstmWeightFp16(float16_t *dst, const float16_t *src, int batch, int deep, int col, int col_align, + const int32_t *order); -void PackLstmBiasFp32ToFp16(float16_t *dst, const float *src, int batch, int col, int col_align, bool is_bidirectional); +void PackLstmBiasFp32ToFp16(float16_t *dst, const float *src, int batch, int col, int col_align, bool is_bidirectional, + const int32_t *order); -void PackLstmBiasFp16(float16_t *dst, const float16_t *src, int batch, int col, int col_align, bool is_bidirectional); +void PackLstmBiasFp16(float16_t *dst, const float16_t *src, int batch, int col, int col_align, bool is_bidirectional, + const int32_t *order); void LstmMatMulFp16(float16_t *c, const float16_t *a, const float16_t *b, const float16_t *bias, int row, int deep, int col, bool is_vec); diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/matmul_fp16.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/matmul_fp16.c index 1aefbaf5..39dcb9ee 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/matmul_fp16.c +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/matmul_fp16.c @@ -16,7 +16,7 @@ #include "nnacl/fp16/matmul_fp16.h" -static void Col2Row8SrcFromFp16(const void *src_ptr, float16_t *dst_ptr, size_t row, size_t col) { +static void Col2Row8SrcFromFp16(const void *src_ptr, float16_t *dst_ptr, int row, int col) { int row_c8 = row / C8NUM * C8NUM; int col_c8 = col / C8NUM * C8NUM; const float16_t *src = (const float16_t *)src_ptr; @@ -108,7 +108,7 @@ static void 
Col2Row8SrcFromFp16(const void *src_ptr, float16_t *dst_ptr, size_t } } -static void Col2Row8SrcFromFp32(const void *src_ptr, float16_t *dst_ptr, size_t row, size_t col) { +static void Col2Row8SrcFromFp32(const void *src_ptr, float16_t *dst_ptr, int row, int col) { int row_c8 = row / C8NUM * C8NUM; int col_c8 = col / C8NUM * C8NUM; int ci = 0; @@ -410,17 +410,14 @@ void VecMatmulFp16(const float16_t *a, const float16_t *b, float16_t *c, const f int di = 0; for (; di < depth - C8NUM + 1; di += C8NUM) { float16x8_t av = vld1q_f16(a + di); - float16x8_t bv_0; - float16x8_t bv_1; - for (int i = 0; i < C8NUM; i += C2NUM) { - bv_0 = vld1q_f16(bv_base); // bv_i为一行,8列数据 - acc_0 = vfmaq_n_f16(acc_0, bv_0, av[i]); // av[i]为向量中的一个值 - bv_base += C8NUM; - - bv_1 = vld1q_f16(bv_base); // bv_i为一行,8列数据 - acc_0 = vfmaq_n_f16(acc_0, bv_1, av[i + 1]); // av[i]为向量中的一个值 + float16x8_t bv_0[C8NUM]; + for (int i = 0; i < C8NUM; ++i) { + bv_0[i] = vld1q_f16(bv_base); bv_base += C8NUM; } + for (int i = 0; i < C8NUM; ++i) { + acc_0 = vfmaq_n_f16(acc_0, bv_0[i], av[i]); + } } if (di < depth) { for (; di < depth; ++di) { @@ -636,8 +633,94 @@ void RowMajor2Col16MajorFp16Opt(const float16_t *src_ptr, float16_t *dst_ptr, si } #ifdef ENABLE_ARM64 -void RowMajor2ColNMajorFp16(const float16_t *src_ptr, float16_t *dst_ptr, int row, int col) { - // Col16Major ==> Col8Major ==> Col4Major +void RowMajor2ColLadder12MajorFp16(const float16_t *src, float16_t *dst_ptr, int row, int col) { + // Col12Major ==> Col8Major ==> Col4Major + const float16_t *src_r = src; + float16_t *dst_r = dst_ptr; + int ri = 0; + size_t col8 = col / C8NUM * C8NUM; + // find 16 block unit + for (; ri <= row - C12NUM; ri += C12NUM) { + size_t ci = 0; + for (; ci < col8; ci += C8NUM) { + const float16_t *src_c = src_r + ci; + float16_t *dst_c = dst_r + ci * C12NUM; + Transpose12x8ARM64Fp16(src_c, dst_c, col * C2NUM, C24NUM); + } + for (; ci < col; ci++) { + const float16_t *src_c = src_r + ci; + float16_t *dst_c = dst_r + ci * C12NUM; + for (size_t i = 0; i < C12NUM; i++) { + dst_c[i] = src_c[i * col]; + } + } + src_r += C12NUM * col; + dst_r += C12NUM * col; + } + for (; ri <= row - C8NUM; ri += C8NUM) { + size_t ci = 0; + for (; ci < col8; ci += C8NUM) { + const float16_t *src_c = src_r + ci; + float16_t *dst_c = dst_r + ci * C8NUM; + Transpose8x8ARM64Fp16(src_c, dst_c, col * sizeof(float16_t), C8NUM * sizeof(float16_t)); + } + for (; ci < col; ci++) { + const float16_t *src_c = src_r + ci; + float16_t *dst_c = dst_r + ci * C8NUM; + for (size_t i = 0; i < C8NUM; i++) { + dst_c[i] = src_c[i * col]; + } + } + src_r += C8NUM * col; + dst_r += C8NUM * col; + } + for (; ri <= row - C4NUM; ri += C4NUM) { + size_t ci = 0; + for (; ci < col8; ci += C8NUM) { + const float16_t *src_c = src_r + ci; + float16_t *dst_c = dst_r + ci * C4NUM; + Transpose4x8ARM64Fp16(src_c, dst_c, col * sizeof(float16_t), C4NUM * sizeof(float16_t)); + } + for (; ci < col; ci++) { + const float16_t *src_c = src_r + ci; + float16_t *dst_c = dst_r + ci * C4NUM; + for (size_t i = 0; i < C4NUM; i++) { + dst_c[i] = src_c[i * col]; + } + } + src_r += C4NUM * col; + dst_r += C4NUM * col; + } + if (ri < row) { + memcpy(dst_r, src_r, (row - ri) * col * C2NUM); + } +} + +void RowMajor2RowLadder12MajorFp16(const float16_t *src, float16_t *dst, int row, int col) { + // Row12 ==> Row8 ==> Row4 + for (int r = 0; r < row; r++) { + int c = 0; + for (; c <= col - C12NUM; c += C12NUM) { + MS_FLOAT16X8 src_data = MS_LDQ_F16(src + r * col + c); + MS_FLOAT16X4 src_data1 = MS_LD_F16(src + r * col + c + 
C8NUM); + MS_STQ_F16(dst + c / C12NUM * C12NUM * row + r * C12NUM, src_data); + MS_ST_F16(dst + c / C12NUM * C12NUM * row + r * C12NUM + C8NUM, src_data1); + } + for (; c <= col - C8NUM; c += C8NUM) { + MS_FLOAT16X8 src_data = MS_LDQ_F16(src + r * col + c); + MS_STQ_F16(dst + c / C12NUM * C12NUM * row + r * C8NUM, src_data); + } + for (; c <= col - C4NUM; c += C4NUM) { + MS_FLOAT16X4 src_data = MS_LD_F16(src + r * col + c); + MS_ST_F16(dst + c / C4NUM * C4NUM * row + r * C4NUM, src_data); + } + for (; c < col; ++c) { + dst[c / C4NUM * C4NUM * row + r + c % C4NUM * row] = src[r * col + c]; + } + } +} + +void RowMajor2ColNMajorFp16srcFp16(const float16_t *src_ptr, float16_t *dst_ptr, int row, int col) { const float16_t *src_r = src_ptr; float16_t *dst_r = dst_ptr; int ri = 0; @@ -702,6 +785,112 @@ void RowMajor2ColNMajorFp16(const float16_t *src_ptr, float16_t *dst_ptr, int ro dst_r += 1; } } + +void RowMajor2ColNMajorFp16(const void *src_ptr, float16_t *dst_ptr, int row, int col, bool is_fp32_src) { + // Col16Major ==> Col8Major ==> Col4Major + if (!is_fp32_src) { + RowMajor2ColNMajorFp16srcFp16((const float16_t *)src_ptr, dst_ptr, row, col); + return; + } + const float *src_r = src_ptr; + float16_t *dst_r = dst_ptr; + int ri = 0; + // find 16 block unit + for (; ri <= row - C16NUM; ri += C16NUM) { + for (int r = 0; r < C16NUM; ++r) { + for (int c = 0; c < col; ++c) { + dst_r[c * C16NUM + r % C16NUM] = src_r[r * col + c]; + } + } + src_r += C16NUM * col; + dst_r += C16NUM * col; + } + for (; ri <= row - C8NUM; ri += C8NUM) { + for (int r = 0; r < C8NUM; ++r) { + for (int c = 0; c < col; ++c) { + dst_r[c * C8NUM + r % C8NUM] = src_r[r * col + c]; + } + } + src_r += C8NUM * col; + dst_r += C8NUM * col; + } + for (; ri <= row - C4NUM; ri += C4NUM) { + for (int r = 0; r < C4NUM; ++r) { + for (int c = 0; c < col; ++c) { + dst_r[c * C4NUM + r % C4NUM] = src_r[r * col + c]; + } + } + src_r += C4NUM * col; + dst_r += C4NUM * col; + } + for (; ri < row; ++ri) { + for (size_t i = 0; i < col; ++i) { + dst_r[i * C4NUM] = src_r[i]; + } + src_r += col; + dst_r += 1; + } +} + +void RowMajor2RowNMajorFp16(const void *src_ptr, float16_t *dst, int row, int col, bool is_fp32_src) { + // Row16 ==> Row8 ==> Row4 + if (is_fp32_src) { + const float *src = (const float *)src_ptr; + for (int r = 0; r < row; r++) { + int c = 0; + for (; c <= col - C16NUM; c += C16NUM) { + const float *cur_src = src + r * col + c; + MS_FLOAT32X4X4 src_f32_data = {MS_LDQ_F32(cur_src), MS_LDQ_F32(cur_src + C4NUM), MS_LDQ_F32(cur_src + C8NUM), + MS_LDQ_F32(cur_src + C12NUM)}; + MS_FLOAT16X4X4 res = { + MS_CVT_F16_F32(src_f32_data.val[0]), + MS_CVT_F16_F32(src_f32_data.val[1]), + MS_CVT_F16_F32(src_f32_data.val[2]), + MS_CVT_F16_F32(src_f32_data.val[3]), + }; + MS_ST4_F16(dst + c / C16NUM * C16NUM * row + r * C16NUM, res); + } + for (; c <= col - C8NUM; c += C8NUM) { + const float *cur_src = src + r * col + c; + MS_FLOAT32X4X2 src_f32_data = {MS_LDQ_F32(cur_src), MS_LDQ_F32(cur_src + C4NUM)}; + MS_FLOAT16X4X2 res = { + MS_CVT_F16_F32(src_f32_data.val[0]), + MS_CVT_F16_F32(src_f32_data.val[1]), + }; + MS_ST2_F16(dst + c / C8NUM * C8NUM * row + r * C8NUM, res); + } + for (; c <= col - C4NUM; c += C4NUM) { + MS_FLOAT16X4 src_data = MS_CVT_F16_F32(MS_LDQ_F32(src + r * col + c)); + MS_ST_F16(dst + c / C4NUM * C4NUM * row + r * C4NUM, src_data); + } + for (; c < col; ++c) { + dst[c / C4NUM * C4NUM * row + r * C4NUM + c % C4NUM] = src[r * col + c]; + } + } + return; + } + const float16_t *src = (const float16_t *)src_ptr; + for (int r = 0; r < 
row; r++) { + int c = 0; + for (; c <= col - C16NUM; c += C16NUM) { + MS_FLOAT16X8 src_data = MS_LDQ_F16(src + r * col + c); + MS_FLOAT16X8 src_data1 = MS_LDQ_F16(src + r * col + c + C8NUM); + MS_STQ_F16(dst + c / C16NUM * C16NUM * row + r * C16NUM, src_data); + MS_STQ_F16(dst + c / C16NUM * C16NUM * row + r * C16NUM + C8NUM, src_data1); + } + for (; c <= col - C8NUM; c += C8NUM) { + MS_FLOAT16X8 src_data = MS_LDQ_F16(src + r * col + c); + MS_STQ_F16(dst + c / C8NUM * C8NUM * row + r * C8NUM, src_data); + } + for (; c <= col - C4NUM; c += C4NUM) { + MS_FLOAT16X4 src_data = MS_LD_F16(src + r * col + c); + MS_ST_F16(dst + c / C4NUM * C4NUM * row + r * C4NUM, src_data); + } + for (; c < col; ++c) { + dst[c / C4NUM * C4NUM * row + r * C4NUM + c % C4NUM] = src[r * col + c]; + } + } +} #endif void RowMajor2Col12MajorFp16Opt(const float16_t *src_ptr, float16_t *dst_ptr, size_t row, size_t col) { @@ -802,32 +991,6 @@ void RowMajor2Row16MajorFp16(const void *src, float16_t *dst, int row, int col, } } -#ifdef ENABLE_ARM64 -void RowMajor2RowNMajorFp16(const float16_t *src, float16_t *dst, int row, int col) { - // Row16 ==> Row8 ==> Row4 - for (int r = 0; r < row; r++) { - int c = 0; - for (; c <= col - C16NUM; c += C16NUM) { - MS_FLOAT16X8 src_data = MS_LDQ_F16(src + r * col + c); - MS_FLOAT16X8 src_data1 = MS_LDQ_F16(src + r * col + c + C8NUM); - MS_STQ_F16(dst + c / C16NUM * C16NUM * row + r * C16NUM, src_data); - MS_STQ_F16(dst + c / C16NUM * C16NUM * row + r * C16NUM + C8NUM, src_data1); - } - for (; c <= col - C8NUM; c += C8NUM) { - MS_FLOAT16X8 src_data = MS_LDQ_F16(src + r * col + c); - MS_STQ_F16(dst + c / C8NUM * C8NUM * row + r * C8NUM, src_data); - } - for (; c <= col - C4NUM; c += C4NUM) { - MS_FLOAT16X4 src_data = MS_LD_F16(src + r * col + c); - MS_ST_F16(dst + c / C4NUM * C4NUM * row + r * C4NUM, src_data); - } - for (; c < col; ++c) { - dst[c / C4NUM * C4NUM * row + r * C4NUM + c % C4NUM] = src[r * col + c]; - } - } -} -#endif - void RowMajor2Row16MajorFp16Opt(const float16_t *src, float16_t *dst, int row, int col) { int col_align = UP_ROUND(col, C16NUM); for (int r = 0; r < row; r++) { diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/matmul_fp16.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/matmul_fp16.h index be7f8443..7acef622 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/matmul_fp16.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp16/matmul_fp16.h @@ -14,8 +14,8 @@ * limitations under the License. 
*/ -#ifndef NNACL_FP16_MATMUL_FP16_H_ -#define NNACL_FP16_MATMUL_FP16_H_ +#ifndef MINDSPORE_NNACL_FP16_MATMUL_H_ +#define MINDSPORE_NNACL_FP16_MATMUL_H_ #include #include @@ -45,9 +45,13 @@ void MatMul12x8Fp16(const float16_t *a, const float16_t *b, float16_t *dst, cons int deep, int row, int col, int stride, int write_mode); #ifdef ENABLE_ARM64 -void RowMajor2ColNMajorFp16(const float16_t *src, float16_t *dst_ptr, int row, int col); +void RowMajor2ColLadder12MajorFp16(const float16_t *src, float16_t *dst_ptr, int row, int col); -void RowMajor2RowNMajorFp16(const float16_t *src, float16_t *dst, int row, int col); +void RowMajor2RowLadder12MajorFp16(const float16_t *src, float16_t *dst, int row, int col); + +void RowMajor2ColNMajorFp16(const void *src, float16_t *dst_ptr, int row, int col, bool is_fp32_src); + +void RowMajor2RowNMajorFp16(const void *src, float16_t *dst, int row, int col, bool is_fp32_src); void MatMul12x16Fp16Opt(const float16_t *a, const float16_t *b, float16_t *dst, const float16_t *bias, ActType act_type, int deep, int row, int col, size_t stride, size_t out_type); @@ -60,6 +64,9 @@ void MatmulFp16Neon64Opt(const float16_t *a, const float16_t *b, float16_t *c, c void MatmulBaseFp16Neon(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, size_t depth, size_t row, size_t col, size_t stride, size_t write_nhwc); +void MatmulFp16OptV2(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, + size_t depth, size_t row, size_t col, size_t stride, size_t write_nhwc); + #ifdef ENABLE_DEBUG void MatmulBaseFp16(const float16_t *a, const float16_t *b, float16_t *c, const float16_t *bias, int act_type, size_t depth, size_t row, size_t col, size_t stride, size_t write_nhwc); @@ -118,4 +125,4 @@ void RowMajor2ColMajorFp16(const void *src, float16_t *dst, int row, int col, bo } #endif -#endif // NNACL_FP16_MATMUL_FP16_H_ +#endif // MINDSPORE_NNACL_FP16_MATMUL_H_ diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/lstm_fp32.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/lstm_fp32.c index 74e75115..da9f6bef 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/lstm_fp32.c +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/lstm_fp32.c @@ -33,7 +33,7 @@ static void PackLstmMatrix(const float *src_batch, float *dst_batch, int col, in } static void PackLstmWeightBatch(float *dst, const float *src, int batch, int deep, int col, int col_align, - const int32_t *order) { + const int *order) { for (int i = 0; i < batch; i++) { const float *src_batch = src + i * col * deep; float *dst_batch = dst + ((order == NULL) ? i : order[i]) * col_align * deep; @@ -41,12 +41,12 @@ static void PackLstmWeightBatch(float *dst, const float *src, int batch, int dee } } -void PackLstmWeight(float *dst, const float *src, int batch, int deep, int col, int col_align, const int32_t *order) { +void PackLstmWeight(float *dst, const float *src, int batch, int deep, int col, int col_align, const int *order) { PackLstmWeightBatch(dst, src, batch, deep, col, col_align, order); } void PackLstmWeightWithStride(float *dst, const float *src, int batch, int deep, int col, int col_align, - bool is_bidirectional, int stride, const int32_t *order) { + bool is_bidirectional, int stride, const int *order) { int unidirectional_batch = is_bidirectional ? 
batch / 2 : batch; PackLstmWeightBatch(dst, src, unidirectional_batch, deep, col, col_align, order); src += stride; @@ -57,7 +57,7 @@ void PackLstmWeightWithStride(float *dst, const float *src, int batch, int deep, } void PackLstmBias(float *dst, const float *src, int batch, int col, int col_align, bool is_bidirectional, - const int32_t *order) { + const int *order) { int unidirectional_batch = is_bidirectional ? batch / 2 : batch; for (int i = 0; i < unidirectional_batch; i++) { const float *src_batch = src + i * col; @@ -76,7 +76,7 @@ void PackLstmBias(float *dst, const float *src, int batch, int col, int col_alig } void PackLstmBiasWithStride(float *dst, const float *src, int batch, int col, int col_align, bool is_bidirectional, - int b_stride, const int32_t *order) { + int b_stride, const int *order) { int unidirectional_batch = is_bidirectional ? batch / 2 : batch; for (int i = 0; i < unidirectional_batch; i++) { const float *src_batch = src + i * col; @@ -175,13 +175,13 @@ void UpdateOutput(float *hidden_state, float *output, const float *cell_state, c const float *weight_project, float *buffer[C8NUM], const LstmParameter *lstm_param) { int batch = lstm_param->batch_; int hidden_size = lstm_param->hidden_size_; - int project_size = lstm_param->project_size_; + int output_size = lstm_param->output_size_; float *state_buffer = buffer[C4NUM]; float *hidden_buffer = weight_project ? buffer[C2NUM] : hidden_state; float zoneout = lstm_param->zoneout_hidden_; if (!(zoneout >= -FLT_EPSILON && zoneout <= FLT_EPSILON)) { - (void)memcpy(state_buffer, hidden_state, batch * project_size * sizeof(float)); - ElementOptMul(state_buffer, &zoneout, state_buffer, batch * project_size, false); + (void)memcpy(state_buffer, hidden_state, batch * hidden_size * sizeof(float)); + ElementOptMul(state_buffer, &zoneout, state_buffer, batch * hidden_size, false); } Tanh(cell_state, batch * hidden_size, hidden_buffer); @@ -193,20 +193,13 @@ void UpdateOutput(float *hidden_state, float *output, const float *cell_state, c left_matrix = buffer[C6NUM]; PackLstmInput(hidden_buffer, left_matrix, batch, hidden_size); } -#ifdef ENABLE_AVX - int col_tile = batch == 1 ? 
C8NUM : C16NUM; -#elif defined(ENABLE_ARM32) - int col_tile = C4NUM; -#else - int col_tile = C8NUM; -#endif - LstmMatMul(hidden_state, left_matrix, weight_project, NULL, batch, hidden_size, project_size, - UP_ROUND(project_size, col_tile), batch == 1, buffer[C7NUM]); + LstmMatMul(hidden_state, left_matrix, weight_project, NULL, batch, hidden_size, output_size, + lstm_param->proj_col_align_, batch == 1, buffer[C7NUM]); } if (!(zoneout >= -FLT_EPSILON && zoneout <= FLT_EPSILON)) { - ElementOptMulAcc(hidden_state, 1 - zoneout, state_buffer, batch * project_size); + ElementOptMulAcc(hidden_state, 1 - zoneout, state_buffer, batch * output_size); } - (void)memcpy(output, hidden_state, batch * project_size * sizeof(float)); + (void)memcpy(output, hidden_state, batch * output_size * sizeof(float)); } void UpdateLstmGate(float *gate_buffer, const float *input, const float *weight, const float *bias, int row, int deep, @@ -238,12 +231,12 @@ void LstmStepUnit(float *output, float *input_gate, float *forget_gate, float *c bool is_vec = lstm_param->batch_ == 1; // state * weight if (is_vec) { - UpdateLstmGate(state_gate, hidden_state, state_weight, state_bias, lstm_param->batch_, lstm_param->project_size_, + UpdateLstmGate(state_gate, hidden_state, state_weight, state_bias, lstm_param->batch_, lstm_param->output_size_, lstm_param->hidden_size_, lstm_param->state_col_align_, is_vec, packed_output); } else { // pack state for matmul - PackLstmInput(hidden_state, packed_state, lstm_param->batch_, lstm_param->project_size_); - UpdateLstmGate(state_gate, packed_state, state_weight, state_bias, lstm_param->batch_, lstm_param->project_size_, + PackLstmInput(hidden_state, packed_state, lstm_param->batch_, lstm_param->output_size_); + UpdateLstmGate(state_gate, packed_state, state_weight, state_bias, lstm_param->batch_, lstm_param->output_size_, lstm_param->hidden_size_, lstm_param->state_col_align_, is_vec, packed_output); } ElementAdd(input_gate, state_gate, input_gate, lstm_param->batch_ * lstm_param->hidden_size_); @@ -276,7 +269,7 @@ void LstmStepUnit(float *output, float *input_gate, float *forget_gate, float *c } if (!(lstm_param->zoneout_hidden_ >= -FLT_EPSILON && lstm_param->zoneout_hidden_ <= FLT_EPSILON)) { - (void)memcpy(hidden_state, hidden_buffer, lstm_param->batch_ * lstm_param->project_size_ * sizeof(float)); + (void)memcpy(hidden_state, hidden_buffer, lstm_param->batch_ * lstm_param->output_size_ * sizeof(float)); } } @@ -322,12 +315,12 @@ void Lstm(float *output, const float *input, const float *weight_i, const float // backward if (lstm_param->bidirectional_) { const float *backward_weight_i = weight_i + 4 * lstm_param->input_col_align_ * lstm_param->input_size_; - const float *backward_weight_h = weight_h + 4 * lstm_param->state_col_align_ * lstm_param->hidden_size_; + const float *backward_weight_h = weight_h + 4 * lstm_param->state_col_align_ * lstm_param->output_size_; const float *backward_input_bias = input_bias + 4 * lstm_param->input_col_align_; const float *backward_state_bias = state_bias + 4 * lstm_param->state_col_align_; - float *backward_output = output + lstm_param->batch_ * lstm_param->hidden_size_; + float *backward_output = output + lstm_param->batch_ * lstm_param->output_size_; float *backward_cell_state = cell_state + lstm_param->batch_ * lstm_param->hidden_size_; - float *backward_hidden_state = hidden_state + lstm_param->batch_ * lstm_param->hidden_size_; + float *backward_hidden_state = hidden_state + lstm_param->batch_ * lstm_param->output_size_; 
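+      // Backward-direction offsets (descriptive note): hidden_state/output advance by batch_ * output_size_
+      // (the projected width, i.e. proj_size when projection is used, otherwise hidden_size),
+      // while cell_state still advances by batch_ * hidden_size_.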
LstmUnidirectional(backward_output, packed_input, backward_weight_i, backward_weight_h, backward_input_bias, backward_state_bias, backward_hidden_state, backward_cell_state, buffer, lstm_param, true); diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/lstm_fp32.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/lstm_fp32.h index 88dd9d16..f94f0bb7 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/lstm_fp32.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/lstm_fp32.h @@ -21,16 +21,16 @@ #ifdef __cplusplus extern "C" { #endif -void PackLstmWeight(float *dst, const float *src, int batch, int deep, int col, int col_align, const int32_t *order); +void PackLstmWeight(float *dst, const float *src, int batch, int deep, int col, int col_align, const int *order); void PackLstmWeightWithStride(float *dst, const float *src, int batch, int deep, int col, int col_align, - bool is_bidirectional, int stride, const int32_t *order); + bool is_bidirectional, int stride, const int *order); void PackLstmBias(float *dst, const float *src, int batch, int col, int col_align, bool is_bidirectional, - const int32_t *order); + const int *order); void PackLstmBiasWithStride(float *dst, const float *src, int batch, int col, int col_align, bool is_bidirectional, - int b_stride, const int32_t *order); + int b_stride, const int *order); void PackLstmInput(const float *src, float *dst, int row, int deep); diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/matmul_fp32.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/matmul_fp32.c index 308419fb..1898ffd4 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/matmul_fp32.c +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/fp32/matmul_fp32.c @@ -440,8 +440,8 @@ void MatVecMulNoPackFp32(const float *a, const float *b, float *c, const float * } c[oc_index] = dst; } - a += k; - b += k * col; + a += C1500NUM; + b += C1500NUM * col; } if (k == depth) { return; diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/custom_gather_d_grad_v2_infer.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/custom_gather_d_grad_v2_infer.c new file mode 100644 index 00000000..ad1cac2e --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/custom_gather_d_grad_v2_infer.c @@ -0,0 +1,36 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "nnacl/infer/custom_gather_d_grad_v2_infer.h" +#include "nnacl/infer/infer_register.h" + +int CustomGatherDGradV2InferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, + size_t outputs_size, OpParameter *parameter) { + int check_ret = CheckAugmentNullSize(inputs, inputs_size, outputs, outputs_size, parameter, C3NUM, C1NUM); + if (check_ret != NNACL_OK) { + return check_ret; + } + const TensorC *input = inputs[0]; + TensorC *output = outputs[0]; + SetDataTypeFormat(output, input); + if (!InferFlag(inputs, inputs_size)) { + return NNACL_INFER_INVALID; + } + SetShapeTensor(output, input); + return NNACL_OK; +} + +REG_INFER(CustomGatherDGradV2, PrimType_Inner_CustomGatherDGradV2, CustomGatherDGradV2InferShape) diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/custom_gather_d_grad_v2_infer.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/custom_gather_d_grad_v2_infer.h new file mode 100644 index 00000000..68d85d20 --- /dev/null +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/custom_gather_d_grad_v2_infer.h @@ -0,0 +1,30 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_NNACL_CUSTOM_GATHER_D_GRAD_V2_INFER_H +#define MINDSPORE_NNACL_CUSTOM_GATHER_D_GRAD_V2_INFER_H +#include "nnacl/infer/common_infer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +int CustomGatherDGradV2InferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, + size_t outputs_size, OpParameter *parameter); + +#ifdef __cplusplus +} +#endif +#endif // MINDSPORE_NNACL_CUSTOM_GATHER_D_GRAD_V2_INFER_H diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/lstm_infer.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/lstm_infer.c index 9892ef0b..391e2522 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/lstm_infer.c +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/lstm_infer.c @@ -17,41 +17,81 @@ #include "nnacl/infer/lstm_infer.h" #include "nnacl/infer/infer_register.h" -static const int num_of_gates = 4; -static const int no_of_recorde_values = 6; +static const int no_of_recorde_values = 5; int CheckInputShapeValid(const TensorC *const *inputs, size_t inputs_size, const LstmParameter *parameter) { + if (inputs_size < C6NUM) { + return NNACL_INPUT_TENSOR_ERROR; + } const TensorC *input = inputs[FIRST_INPUT]; const TensorC *weight_i = inputs[SECOND_INPUT]; const TensorC *weight_g = inputs[THIRD_INPUT]; const TensorC *bias = inputs[FOURTH_INPUT]; - const TensorC *cell = inputs[FIFTH_INPUT]; + const TensorC *hidden_init = inputs[FIFTH_INPUT]; + const TensorC *cell_init = inputs[SIXTH_INPUT]; + + NNACL_CHECK_TRUE_RET(input->shape_size_ == DIMENSION_3D && weight_i->shape_size_ == DIMENSION_3D && + weight_g->shape_size_ == DIMENSION_3D && bias->shape_size_ == DIMENSION_2D, + NNACL_ERR); int batch = input->shape_[kNHWC_H]; int input_size = input->shape_[kNHWC_W]; int hidden_size = weight_i->shape_[kNHWC_H] / C4NUM; 
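+  // weight_i stacks the four LSTM gate matrices along kNHWC_H, hence hidden_size = shape_[kNHWC_H] / C4NUM;
+  // the NNACL_CHECK below verifies shape_[kNHWC_H] == hidden_size * C4NUM.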
- int project_size = inputs_size == C7NUM ? inputs[C6NUM]->shape_[kNHWC_H] : hidden_size; - bool bidirectional = parameter->bidirectional_; - if (input->shape_size_ != DIMENSION_3D || weight_i->shape_size_ != DIMENSION_3D) { - return NNACL_ERR; + int out_size = hidden_size; + if (inputs_size == C7NUM) { + NNACL_CHECK_TRUE_RET(inputs[SEVENTH_INPUT]->shape_size_ == DIMENSION_3D, NNACL_INPUT_TENSOR_ERROR); + out_size = inputs[SEVENTH_INPUT]->shape_[kNHWC_H]; } + bool bidirectional = parameter->bidirectional_; int bidirection = bidirectional ? C2NUM : C1NUM; NNACL_CHECK_TRUE_RET(weight_i->shape_[kNHWC_N] == bidirection && weight_i->shape_[kNHWC_H] == hidden_size * C4NUM && weight_i->shape_[kNHWC_W] == input_size, NNACL_ERR); NNACL_CHECK_TRUE_RET(weight_g->shape_[kNHWC_N] == bidirection && weight_g->shape_[kNHWC_H] == hidden_size * C4NUM && - weight_g->shape_[kNHWC_W] == project_size, + weight_g->shape_[kNHWC_W] == out_size, NNACL_ERR); NNACL_CHECK_TRUE_RET(bias->shape_[kNHWC_N] == bidirection && bias->shape_[kNHWC_H] == hidden_size * C8NUM, NNACL_ERR); - if (!bidirectional && cell->shape_size_ == DIMENSION_2D) { - NNACL_CHECK_TRUE_RET(cell->shape_[kNHWC_N] == batch && cell->shape_[kNHWC_H] == hidden_size, NNACL_ERR); + if (!bidirectional && hidden_init->shape_size_ == DIMENSION_2D) { + NNACL_CHECK_TRUE_RET(hidden_init->shape_[kNHWC_N] == batch && hidden_init->shape_[kNHWC_H] == out_size, NNACL_ERR); } else { - NNACL_CHECK_TRUE_RET( - cell->shape_[kNHWC_N] == bidirection && cell->shape_[kNHWC_H] == batch && cell->shape_[kNHWC_W] == project_size, - NNACL_ERR); + NNACL_CHECK_TRUE_RET(hidden_init->shape_size_ == DIMENSION_3D && hidden_init->shape_[kNHWC_N] == bidirection && + hidden_init->shape_[kNHWC_H] == batch && hidden_init->shape_[kNHWC_W] == out_size, + NNACL_ERR); + } + if (!bidirectional && cell_init->shape_size_ == DIMENSION_2D) { + NNACL_CHECK_TRUE_RET(cell_init->shape_[kNHWC_N] == batch && cell_init->shape_[kNHWC_H] == hidden_size, NNACL_ERR); + } else { + NNACL_CHECK_TRUE_RET(cell_init->shape_size_ == DIMENSION_3D && cell_init->shape_[kNHWC_N] == bidirection && + cell_init->shape_[kNHWC_H] == batch && cell_init->shape_[kNHWC_W] == hidden_size, + NNACL_ERR); } return NNACL_OK; } +int InferFirstOutputMindir(const TensorC *const *inputs, size_t inputs_size, TensorC *output, LstmParameter *param) { + for (size_t i = 0; i < inputs_size; ++i) { + if (inputs[i]->shape_size_ != C3NUM) { + return NNACL_INPUT_TENSOR_ERROR; + } + } + ShapeSet(output->shape_, &output->shape_size_, inputs[0]->shape_, inputs[0]->shape_size_); + int out_size = inputs[SECOND_INPUT]->shape_[THIRD_INPUT]; + output->shape_[THIRD_INPUT] = (param->bidirectional_ ? C2NUM : 1) * out_size; + return NNACL_OK; +} + +int InferFirstOutputNonMindir(const TensorC *const *inputs, size_t inputs_size, TensorC *output, LstmParameter *param) { + if (CheckInputShapeValid(inputs, inputs_size, param) != NNACL_OK) { + return NNACL_ERR; + } + ShapeSet(output->shape_, &output->shape_size_, inputs[0]->shape_, inputs[0]->shape_size_); + const TensorC *hidden_init = inputs[FIFTH_INPUT]; + int out_size = hidden_init->shape_[hidden_init->shape_size_ - 1]; + output->shape_[THIRD_INPUT] = out_size; + int direction = param->bidirectional_ ? 
C2NUM : C1NUM; + int ret = ShapeInsert(output->shape_, &output->shape_size_, 1, direction); + return ret; +} + int LstmInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **outputs, size_t outputs_size, OpParameter *parameter) { int check_ret = CheckAugmentWithMinSize(inputs, inputs_size, outputs, outputs_size, parameter, 4, 3); @@ -60,9 +100,8 @@ int LstmInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **o } const TensorC *input = inputs[0]; - const TensorC *weight_i = inputs[1]; TensorC *output = outputs[0]; - for (int i = 0; i < 3; i++) { + for (int i = 0; i < outputs_size; i++) { SetDataTypeFormat(outputs[i], input); } @@ -71,42 +110,31 @@ int LstmInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **o if (!InferFlag(inputs, inputs_size)) { return NNACL_INFER_INVALID; } - int dir_multiplier = param->bidirectional_ ? 2 : 1; - int out_shape[MAX_SHAPE_SIZE]; - size_t out_shape_size = 0; - int hidden_size = 1; - int project_size = 1; - ShapeSet(out_shape, &out_shape_size, input->shape_, input->shape_size_); - if (inputs_size == DIMENSION_4D) { // if input from MINDIR - hidden_size = weight_i->shape_[THIRD_INPUT]; - project_size = hidden_size; - out_shape[THIRD_INPUT] = hidden_size * dir_multiplier; - } else { - if (CheckInputShapeValid(inputs, inputs_size, param) != NNACL_OK) { - return NNACL_ERR; + int hidden_size = 0; + int out_size = 0; + if (inputs_size == C4NUM) { + int ret = InferFirstOutputMindir(inputs, inputs_size, output, param); + if (ret != NNACL_OK) { + return ret; } - hidden_size = weight_i->shape_[1] / num_of_gates; - project_size = inputs_size == C7NUM ? inputs[C6NUM]->shape_[kNHWC_H] : hidden_size; - out_shape[THIRD_INPUT] = project_size; - if (param->bidirectional_) { - int ret = ShapeInsert(out_shape, &out_shape_size, 1, 2); - if (ret != NNACL_OK) { - return NNACL_ERR; - } - } else { - int ret = ShapeInsert(out_shape, &out_shape_size, 1, 1); - if (ret != NNACL_OK) { - return NNACL_ERR; - } + hidden_size = inputs[THIRD_INPUT]->shape_[THIRD_INPUT]; + out_size = inputs[SECOND_INPUT]->shape_[THIRD_INPUT]; + } else { + int ret = InferFirstOutputNonMindir(inputs, inputs_size, output, param); + if (ret != NNACL_OK) { + return ret; } + hidden_size = inputs[SIXTH_INPUT]->shape_[inputs[SIXTH_INPUT]->shape_size_ - 1]; + out_size = inputs[FIFTH_INPUT]->shape_[inputs[FIFTH_INPUT]->shape_size_ - 1]; } - SetShapeArray(output, out_shape, out_shape_size); + + int dir_multiplier = param->bidirectional_ ? 
C2NUM : C1NUM; int state_shape[MAX_SHAPE_SIZE]; size_t state_shape_size = 0; ShapeSet(state_shape, &state_shape_size, input->shape_, input->shape_size_); state_shape[FIRST_INPUT] = dir_multiplier; - state_shape[THIRD_INPUT] = project_size; + state_shape[THIRD_INPUT] = out_size; SetShapeArray(outputs[SECOND_INPUT], state_shape, state_shape_size); state_shape[THIRD_INPUT] = hidden_size; SetShapeArray(outputs[THIRD_INPUT], state_shape, state_shape_size); @@ -116,11 +144,9 @@ int LstmInferShape(const TensorC *const *inputs, size_t inputs_size, TensorC **o const size_t intermediate_states_shape_size = 1; int batch_size = input->shape_[SECOND_INPUT]; int seq_len = input->shape_[FIRST_INPUT]; - intermediate_states_shape[FIRST_INPUT] = no_of_recorde_values * batch_size * hidden_size * seq_len * dir_multiplier; - SetDataTypeFormat(outputs[FOURTH_INPUT], inputs[FIRST_INPUT]); + intermediate_states_shape[FIRST_INPUT] = + batch_size * seq_len * dir_multiplier * (out_size + no_of_recorde_values * hidden_size); SetShapeArray(outputs[FOURTH_INPUT], intermediate_states_shape, intermediate_states_shape_size); - - SetDataTypeFormat(outputs[FIFTH_INPUT], inputs[FIRST_INPUT]); SetShapeArray(outputs[FIFTH_INPUT], state_shape, state_shape_size); } diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/reshape_infer.c b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/reshape_infer.c index 287e9de3..3c192df7 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/reshape_infer.c +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/infer/reshape_infer.c @@ -33,12 +33,14 @@ int CalShape(const int *data, const TensorC *const *inputs, int *out_shape, size } ShapePush(out_shape, out_shape_size, data[i]); } - + if (size == 0) { + return NNACL_ERR; + } if ((int)(data[index]) == -1) { if (index >= MAX_SHAPE_SIZE) { return NNACL_ERR; } - out_shape[index] = size == 0 ? 0 : input_count / size; + out_shape[index] = input_count / size; } return NNACL_OK; } diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/intrinsics/ms_simd_instructions.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/intrinsics/ms_simd_instructions.h index 377993cd..6a933785 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/intrinsics/ms_simd_instructions.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/intrinsics/ms_simd_instructions.h @@ -308,7 +308,7 @@ static inline float simd_exp32_f32(float data) { #else data = MS_MAX32_F32(-88.0f, MS_MIN32_F32(88.0f, data)); // clamp(-88, 88) #endif - int integer = floor(data * 1.44269504088896341f + 0.5f); + int integer = data / param[0]; float decimal = data - integer * param[0]; fi int_exp; int_exp.i = (integer + 127) << 23; // Approximate calculation : (integer + 127) << 23 @@ -324,14 +324,19 @@ static inline void simd_exp32(float src, float *dst) { int i; } fi; static float param[] = {0.693147f, 1.0f / 120, 1.0f / 24, 1.0f / 6, 1.0f / 2, 1.0f}; // log(2.0f) - src = MS_MAX32_F32(-88.0f, MS_MIN32_F32(88.0f, src)); // clamp(-88.0f, 88.0f) + src = MS_MAX32_F32(-87.3365478515625f, MS_MIN32_F32(88.72283935546875f, src)); // clamp(logf(FLT_MIN), logf(FLT_MAX)) int integer = floor(src * 1.44269504088896341f + 0.5f); float decimal = src - integer * param[0]; fi int_exp; - int_exp.i = (integer + 127) << 23; // integer num approximate calculation : (x + 127) << 23 + const int shift = 23; + const int bias = 126; + const float factor = 2; + // 2^n * exp(r) should be counted 2 * 2^(n - 1) * exp(r), + // because n may be 128, and it is not representable by fp32. 
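+  // e.g. with src clamped to logf(FLT_MAX) ~= 88.7228f, integer = round(src / ln2) can reach 128;
+  // (128 + 127) << 23 would hit the Inf/NaN exponent code 255, hence bias = 126 plus the final multiply by factor = 2.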
+ int_exp.i = (integer + bias) << shift; // integer num 2^(n - 1) approximate calculation : ((x - 1) + 127) << 23 const float decimal_exp = 1.0f + decimal * (1.0f + decimal * (0.5f + decimal * (param[3] + decimal * (param[2] + decimal * param[1])))); - *dst = int_exp.f * decimal_exp; + *dst = factor * int_exp.f * decimal_exp; } // define (float/int) data diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/intrinsics/ms_simd_instructions_fp16.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/intrinsics/ms_simd_instructions_fp16.h index a29c4dbb..94ed4b89 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/intrinsics/ms_simd_instructions_fp16.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/intrinsics/ms_simd_instructions_fp16.h @@ -94,9 +94,13 @@ static inline float16x4_t ms_vcvt_f16_f32(float32x4_t in) { #define MS_FLOAT16X8 float16x8_t #define MS_FLOAT16X4 float16x4_t +#define MS_FLOAT16X4X4 float16x4x4_t +#define MS_FLOAT16X4X2 float16x4x2_t #define MS_MOVQ_F16 vmovq_n_f16 #define MS_STQ_F16(ptr, val) vst1q_f16(ptr, val) #define MS_ST_F16 vst1_f16 +#define MS_ST2_F16 vst2_f16 +#define MS_ST4_F16 vst4_f16 #define MS_MINQ_F16 vminq_f16 #define MS_MAXQ_F16 vmaxq_f16 #define MS_LDQ_F16(ptr) vld1q_f16(ptr) diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/intrinsics/ms_simd_neon_instructions.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/intrinsics/ms_simd_neon_instructions.h index c4bc34d9..fb38b452 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/intrinsics/ms_simd_neon_instructions.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/intrinsics/ms_simd_neon_instructions.h @@ -25,6 +25,8 @@ #define MS128_F32_GETI(src, i) src[i] #define MS_FLOAT32X4 float32x4_t #define MS_FLOAT128_F32 float32x4_t +#define MS_FLOAT32X4X2 float32x4x2_t +#define MS_FLOAT32X4X4 float32x4x4_t #define MS_INT32X4 int32x4_t #define MS_INT128_EPI32 int32x4_t #define MS_UINT32X4 uint32x4_t @@ -222,29 +224,30 @@ static inline MS_FLOAT32X4 VexpFp32(MS_FLOAT32X4 input) { {1.0f / 6, 1.0f / 6, 1.0f / 6, 1.0f / 6}, {0.5f, 0.5f, 0.5f, 0.5f}, {1.0f, 1.0f, 1.0f, 1.0f}, - {1.44269504088896341f, 1.44269504088896341f, 1.44269504088896341f, 1.44269504088896341f}}; + {1.44269504088896341f, 1.44269504088896341f, 1.44269504088896341f, 1.44269504088896341f}, + {2.0f, 2.0f, 2.0f, 2.0f}}; static MS_FLOAT32X4 negative_flag = {-0.0f, -0.0f, -0.0f, -0.0f}; MS_INT32X4 integer = MS_CVTQPS_EPI32(MS_FMADD128_F32(input, param[6], MS_OR128_F32(MS_AND128_F32(input, negative_flag), param[4]))); MS_FLOAT32X4 decimal = MS_SUBQ_F32(input, MS_MULQ_F32(MS_CVTQEPI32_PS(integer), param[0])); - MS_INT32X4 int_exp = MS_SLLIQ_EPI32(MS_ADDQ_EPI32(integer, MS_MOVQ_EPI32(127)), 23); + MS_INT32X4 int_exp = MS_SLLIQ_EPI32(MS_ADDQ_EPI32(integer, MS_MOVQ_EPI32(126)), 23); MS_FLOAT32X4 tmp = MS_MULQ_F32(decimal, (MS_ADDQ_F32(param[2], MS_MULQ_F32(decimal, param[1])))); tmp = MS_MULQ_F32(decimal, MS_ADDQ_F32(param[4], MS_MULQ_F32(decimal, MS_ADDQ_F32(param[3], tmp)))); MS_FLOAT32X4 decimal_exp = MS_ADDQ_F32(param[5], MS_MULQ_F32(decimal, MS_ADDQ_F32(param[5], tmp))); - return MS_MULQ_F32(decimal_exp, MS_CAST128_F32_S32(int_exp)); + return MS_MULQ_F32(param[7], MS_MULQ_F32(decimal_exp, MS_CAST128_F32_S32(int_exp))); } static inline void simd_exp128(MS_FLOAT32X4 input, float *dst) { - static MS_FLOAT32X4 maxv = {88.0f, 88.0f, 88.0f, 88.0f}; - static MS_FLOAT32X4 minv = {-88.0f, -88.0f, -88.0f, -88.0f}; + static MS_FLOAT32X4 maxv = {88.72283935546875f, 88.72283935546875f, 88.72283935546875f, 88.72283935546875f}; + 
static MS_FLOAT32X4 minv = {-87.3365478515625f, -87.3365478515625f, -87.3365478515625f, -87.3365478515625f}; input = MS_MAXQ_F32(minv, MS_MINQ_F32(input, maxv)); MS_STQ_F32(dst, VexpFp32(input)); } static inline MS_FLOAT32X4 simd_exp128_f32(MS_FLOAT32X4 input) { - static MS_FLOAT32X4 maxv = {88.0f, 88.0f, 88.0f, 88.0f}; - static MS_FLOAT32X4 minv = {-88.0f, -88.0f, -88.0f, -88.0f}; + static MS_FLOAT32X4 maxv = {88.72283935546875f, 88.72283935546875f, 88.72283935546875f, 88.72283935546875f}; + static MS_FLOAT32X4 minv = {-87.3365478515625f, -87.3365478515625f, -87.3365478515625f, -87.3365478515625f}; input = MS_MAXQ_F32(minv, MS_MINQ_F32(input, maxv)); return VexpFp32(input); } @@ -286,18 +289,6 @@ static inline MS_FLOAT32X4 MS_TANHX4_F32(MS_FLOAT32X4 src) { return res; } -static inline MS_FLOAT128_F32 SIMD_SIGN128_F32(MS_FLOAT128_F32 src) { - MS_FLOAT128_F32 abs_src = MS_ABS128_F32(src); - MS_FLOAT128_F32 src_tmp = MS_OR128_F32(src, MS_MOV128_F32(1.0f)); - MS_FLOAT128_F32 sign = MS_DIV128_F32(abs_src, src_tmp); - return sign; -} - -static inline MS_FLOAT128_F32 SIMD_SIGNABS128_F32(MS_FLOAT128_F32 src, MS_FLOAT128_F32 abs_src) { - MS_FLOAT128_F32 src_tmp = MS_OR128_F32(src, MS_MOV128_F32(1.0f)); - return MS_DIV128_F32(abs_src, src_tmp); -} - #define MS_TANH128_F32 MS_TANHX4_F32 static inline MS_FLOAT32X4 MS128_ERF_F32(MS_FLOAT32X4 src) { diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/lstm_parameter.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/lstm_parameter.h index 9ecd8409..5baf10fa 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/lstm_parameter.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/lstm_parameter.h @@ -25,6 +25,7 @@ typedef struct LstmParameter { int input_size_; int hidden_size_; int project_size_; + int output_size_; int seq_len_; int batch_; // other parameter @@ -36,6 +37,8 @@ typedef struct LstmParameter { int input_col_align_; int state_row_align_; int state_col_align_; + int proj_col_align_; + bool has_bias_; } LstmParameter; #endif // NNACL_LSTM_PARAMETER_H_ diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h index 895f7e3d..bd0d152c 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/nnacl/op_base.h @@ -562,6 +562,7 @@ enum PrimType { PrimType_Inner_CustomMaskedFill = 10014, PrimType_Inner_CustomTensorScatterMax = 10015, PrimType_Inner_CustomIsInf = 10016, + PrimType_Inner_CustomGatherDGradV2 = 10017, PrimType_InnerOpMax, PrimType_InnerOpMin = PrimType_Inner_ToFormat }; diff --git a/mindspore/core/mindrt/src/thread/threadpool.cc b/mindspore/core/mindrt/src/thread/threadpool.cc index 2301be8c..342ffb7f 100644 --- a/mindspore/core/mindrt/src/thread/threadpool.cc +++ b/mindspore/core/mindrt/src/thread/threadpool.cc @@ -53,7 +53,7 @@ Worker::~Worker() { void Worker::CreateThread() { thread_ = std::make_unique(&Worker::Run, this); } void Worker::ReinitAfterFork() { - THREAD_INFO("worker %ld recreate thread after fork in child process", worker_id_); + THREAD_INFO("worker %zu recreate thread after fork in child process", worker_id_); if (cond_var_ != nullptr) { (void)cond_var_.release(); cond_var_ = std::make_unique(); diff --git a/mindspore/core/ops/base_operator.h b/mindspore/core/ops/base_operator.h index 811a6000..23652e8e 100644 --- a/mindspore/core/ops/base_operator.h +++ b/mindspore/core/ops/base_operator.h @@ -75,7 +75,7 @@ class MIND_API OperatorRegisterHelper { public: 
OperatorRegisterHelper(const std::string &kname, const OperatorDefineFunc &fn) { OperatorRegister::GetInstance().SetOperatorMap(kname, fn); - (void)id_; // make compiler happy on macos + // (void)id_; // make compiler happy on macos } ~OperatorRegisterHelper() = default; diff --git a/mindspore/core/ops/grad/gather_d_grad_v2.cc b/mindspore/core/ops/grad/gather_d_grad_v2.cc index 3ce5f887..c999ca88 100644 --- a/mindspore/core/ops/grad/gather_d_grad_v2.cc +++ b/mindspore/core/ops/grad/gather_d_grad_v2.cc @@ -75,6 +75,11 @@ TypePtr GatherDGradV2InferType(const PrimitivePtr &prim, const std::vectorGetAttr(kDim); + return GetValue(value_ptr); +} + AbstractBasePtr GatherDGradV2Infer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive, const std::vector &input_args) { auto infer_type = GatherDGradV2InferType(primitive, input_args); diff --git a/mindspore/core/ops/grad/gather_d_grad_v2.h b/mindspore/core/ops/grad/gather_d_grad_v2.h index 94274e3b..40a6e412 100644 --- a/mindspore/core/ops/grad/gather_d_grad_v2.h +++ b/mindspore/core/ops/grad/gather_d_grad_v2.h @@ -25,6 +25,7 @@ class MIND_API GatherDGradV2 : public BaseOperator { public: MIND_API_BASE_MEMBER(GatherDGradV2); GatherDGradV2() : BaseOperator(kNameGatherDGradV2) { InitIOName({"x", "dim", "index", "grad"}, {"output"}); } + int64_t get_dim() const; }; MIND_API abstract::AbstractBasePtr GatherDGradV2Infer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive, diff --git a/mindspore/core/ops/grad/lstm_grad.cc b/mindspore/core/ops/grad/lstm_grad.cc index d51c4882..c25e0379 100644 --- a/mindspore/core/ops/grad/lstm_grad.cc +++ b/mindspore/core/ops/grad/lstm_grad.cc @@ -98,15 +98,22 @@ void LSTMGrad::set_zoneout_hidden(float zoneout_hidden) { float LSTMGrad::get_zoneout_hidden() const { return GetValue(this->GetAttr(kZoneoutHidden)); } +void LSTMGrad::set_proj_size(const int64_t proj_size) { + (void)CheckAndConvertUtils::CheckInteger(kProjection_size, proj_size, kGreaterThan, 0, this->name()); + (void)AddAttr(kProjection_size, api::MakeValue(proj_size)); +} +int64_t LSTMGrad::get_proj_size() const { return GetValue(GetAttr(kProjection_size)); } + void LSTMGrad::Init(const int64_t input_size, const int64_t hidden_size, const int64_t num_layers, const bool has_bias, - const float dropout, const bool bidirectional, const float zoneout_cell, - const float zoneout_hidden) { + const float dropout, const bool bidirectional, const float zoneout_cell, const float zoneout_hidden, + const int64_t proj_size) { this->set_input_size(input_size); this->set_hidden_size(hidden_size); this->set_num_layers(num_layers); this->set_has_bias(has_bias); this->set_dropout(dropout); this->set_bidirectional(bidirectional); + this->set_proj_size(proj_size); if (bidirectional) { constexpr int k2Directions = 2; this->set_num_directions(k2Directions); diff --git a/mindspore/core/ops/grad/lstm_grad.h b/mindspore/core/ops/grad/lstm_grad.h index 73272d55..f6eba32c 100644 --- a/mindspore/core/ops/grad/lstm_grad.h +++ b/mindspore/core/ops/grad/lstm_grad.h @@ -31,7 +31,7 @@ class MIND_API LSTMGrad : public BaseOperator { LSTMGrad() : BaseOperator(kNameLSTMGrad) {} void Init(const int64_t input_size, const int64_t hidden_size, const int64_t num_layers, const bool has_bias, const float dropout, const bool bidirectional = false, const float zoneout_cell = 0.0f, - const float zoneout_hidden = 0.0f); + const float zoneout_hidden = 0.0f, const int64_t proj_size = 0); void set_input_size(const int64_t input_size); int64_t get_input_size() const; void 
set_hidden_size(const int64_t hidden_size); @@ -51,6 +51,8 @@ class MIND_API LSTMGrad : public BaseOperator { void set_zoneout_hidden(float zoneout_hidden); float get_zoneout_hidden() const; int64_t get_good_ld(const int64_t dim, const int64_t type_size); + void set_proj_size(const int64_t proj_size); + int64_t get_proj_size() const; }; } // namespace ops } // namespace mindspore diff --git a/mindspore/core/ops/grad/lstm_grad_data.cc b/mindspore/core/ops/grad/lstm_grad_data.cc index 573d26f4..2b25282c 100644 --- a/mindspore/core/ops/grad/lstm_grad_data.cc +++ b/mindspore/core/ops/grad/lstm_grad_data.cc @@ -91,15 +91,23 @@ void LSTMGradData::set_zoneout_hidden(float zoneout_hidden) { float LSTMGradData::get_zoneout_hidden() const { return GetValue(this->GetAttr(kZoneoutHidden)); } +void LSTMGradData::set_proj_size(const int64_t proj_size) { + (void)CheckAndConvertUtils::CheckInteger(kProjection_size, proj_size, kGreaterThan, 0, this->name()); + (void)AddAttr(kProjection_size, api::MakeValue(proj_size)); +} + +int64_t LSTMGradData::get_proj_size() const { return GetValue(GetAttr(kProjection_size)); } + void LSTMGradData::Init(const int64_t input_size, const int64_t hidden_size, const int64_t num_layers, const bool has_bias, const float dropout, const bool bidirectional, const float zoneout_cell, - const float zoneout_hidden) { + const float zoneout_hidden, const int64_t proj_size) { this->set_input_size(input_size); this->set_hidden_size(hidden_size); this->set_num_layers(num_layers); this->set_has_bias(has_bias); this->set_dropout(dropout); this->set_bidirectional(bidirectional); + this->set_proj_size(proj_size); if (bidirectional) { constexpr int k2Directions = 2; this->set_num_directions(k2Directions); diff --git a/mindspore/core/ops/grad/lstm_grad_data.h b/mindspore/core/ops/grad/lstm_grad_data.h index adcf2ee7..f93e3260 100644 --- a/mindspore/core/ops/grad/lstm_grad_data.h +++ b/mindspore/core/ops/grad/lstm_grad_data.h @@ -32,7 +32,7 @@ class MIND_API LSTMGradData : public BaseOperator { LSTMGradData() : BaseOperator(kNameLSTMGradData) {} void Init(const int64_t input_size, const int64_t hidden_size, const int64_t num_layers, const bool has_bias, const float dropout, const bool bidirectional = false, const float zoneout_cell = 0.0f, - const float zoneout_hidden = 0.0f); + const float zoneout_hidden = 0.0f, const int64_t proj_size = 0); void set_input_size(const int64_t input_size); int64_t get_input_size() const; void set_hidden_size(const int64_t hidden_size); @@ -52,6 +52,8 @@ class MIND_API LSTMGradData : public BaseOperator { void set_zoneout_hidden(float zoneout_hidden); float get_zoneout_hidden() const; int64_t get_good_ld(const int64_t dim, const int64_t type_size); + void set_proj_size(const int64_t proj_size); + int64_t get_proj_size() const; }; MIND_API abstract::AbstractBasePtr LstmGradDataInfer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive, const std::vector &input_args); diff --git a/mindspore/core/ops/grad/lstm_grad_weight.cc b/mindspore/core/ops/grad/lstm_grad_weight.cc index 22b519c3..ce0aca94 100644 --- a/mindspore/core/ops/grad/lstm_grad_weight.cc +++ b/mindspore/core/ops/grad/lstm_grad_weight.cc @@ -88,15 +88,23 @@ void LSTMGradWeight::set_zoneout_hidden(float zoneout_hidden) { float LSTMGradWeight::get_zoneout_hidden() const { return GetValue(this->GetAttr(kZoneoutHidden)); } +void LSTMGradWeight::set_proj_size(const int64_t proj_size) { + (void)CheckAndConvertUtils::CheckInteger(kProjection_size, proj_size, kGreaterThan, 0, this->name()); + 
(void)AddAttr(kProjection_size, api::MakeValue(proj_size)); +} + +int64_t LSTMGradWeight::get_proj_size() const { return GetValue(GetAttr(kProjection_size)); } + void LSTMGradWeight::Init(const int64_t input_size, const int64_t hidden_size, const int64_t num_layers, const bool has_bias, const float dropout, const bool bidirectional, const float zoneout_cell, - const float zoneout_hidden) { + const float zoneout_hidden, const int64_t proj_size) { this->set_input_size(input_size); this->set_hidden_size(hidden_size); this->set_num_layers(num_layers); this->set_has_bias(has_bias); this->set_dropout(dropout); this->set_bidirectional(bidirectional); + this->set_proj_size(proj_size); if (bidirectional) { constexpr int k2Directions = 2; this->set_num_directions(k2Directions); diff --git a/mindspore/core/ops/grad/lstm_grad_weight.h b/mindspore/core/ops/grad/lstm_grad_weight.h index c2ca6b5e..add816d3 100644 --- a/mindspore/core/ops/grad/lstm_grad_weight.h +++ b/mindspore/core/ops/grad/lstm_grad_weight.h @@ -32,7 +32,7 @@ class MIND_API LSTMGradWeight : public BaseOperator { LSTMGradWeight() : BaseOperator(kNameLSTMGradWeight) {} void Init(const int64_t input_size, const int64_t hidden_size, const int64_t num_layers, const bool has_bias, const float dropout, const bool bidirectional = false, const float zoneout_cell = 0.0f, - const float zoneout_hidden = 0.0f); + const float zoneout_hidden = 0.0f, const int64_t proj_size = 0); void set_input_size(const int64_t input_size); int64_t get_input_size() const; void set_hidden_size(const int64_t hidden_size); @@ -52,6 +52,8 @@ class MIND_API LSTMGradWeight : public BaseOperator { void set_zoneout_hidden(float zoneout_hidden); float get_zoneout_hidden() const; int64_t get_good_ld(const int64_t dim, const int64_t type_size); + void set_proj_size(const int64_t proj_size); + int64_t get_proj_size() const; }; MIND_API abstract::AbstractBasePtr LstmGradWeightInfer(const abstract::AnalysisEnginePtr &, const PrimitivePtr &primitive, diff --git a/mindspore/core/ops/lstm.cc b/mindspore/core/ops/lstm.cc index 43b9241c..937207df 100644 --- a/mindspore/core/ops/lstm.cc +++ b/mindspore/core/ops/lstm.cc @@ -68,6 +68,7 @@ abstract::TupleShapePtr LSTMInferShape(const PrimitivePtr &primitive, const std: int64_t input_x_size = GetValue(primitive->GetAttr(kInput_size)); int64_t num_layers = GetValue(primitive->GetAttr(kNumLayers)); bool bidirectional = GetValue(primitive->GetAttr(kBidirectional)); + int64_t proj_size = GetValue(primitive->GetAttr(kProjection_size)); int64_t num_directions = 1; if (bidirectional) { num_directions = 2; @@ -90,7 +91,8 @@ abstract::TupleShapePtr LSTMInferShape(const PrimitivePtr &primitive, const std: (void)CheckAndConvertUtils::CheckInteger("h_shape[1]", h_input_shape[1], kEqual, x_input_shape[1], prim_name); } - std::vector y_shape = {x_input_shape[0], x_input_shape[1], hidden_size * num_directions}; + auto real_hidden_size = proj_size > 0 ? 
proj_size : hidden_size; + std::vector y_shape = {x_input_shape[0], x_input_shape[1], real_hidden_size * num_directions}; std::vector h_shape = {h_input_shape}; std::vector c_shape = {c_input_shape}; std::vector reverse_shape = {1, 1}; @@ -135,6 +137,11 @@ void LSTM::set_hidden_size(const int64_t hidden_size) { (void)AddAttr(kHidden_size, api::MakeValue(hidden_size)); } int64_t LSTM::get_hidden_size() const { return GetValue(GetAttr(kHidden_size)); } +void LSTM::set_proj_size(const int64_t proj_size) { + (void)CheckAndConvertUtils::CheckInteger(kProjection_size, proj_size, kGreaterThan, 0, this->name()); + (void)AddAttr(kProjection_size, api::MakeValue(proj_size)); +} +int64_t LSTM::get_proj_size() const { return GetValue(GetAttr(kProjection_size)); } void LSTM::set_num_layers(const int64_t num_layers) { (void)CheckAndConvertUtils::CheckInteger(kNumLayers, num_layers, kGreaterThan, 0, this->name()); (void)AddAttr(kNumLayers, api::MakeValue(num_layers)); diff --git a/mindspore/core/ops/lstm.h b/mindspore/core/ops/lstm.h index 4d3c8756..e32c5781 100644 --- a/mindspore/core/ops/lstm.h +++ b/mindspore/core/ops/lstm.h @@ -51,6 +51,12 @@ class MIND_API LSTM : public BaseOperator { /// /// \return hidden_size. int64_t get_hidden_size() const; + /// \brief Set proj_size. + void set_proj_size(const int64_t proj_size); + /// \brief Get proj_size. + /// + /// \return proj_size. + int64_t get_proj_size() const; /// \brief Set num_layers. void set_num_layers(const int64_t num_layers); /// \brief Get num_layers. diff --git a/mindspore/core/ops/op_name.h b/mindspore/core/ops/op_name.h index ce68079f..ad9066e7 100644 --- a/mindspore/core/ops/op_name.h +++ b/mindspore/core/ops/op_name.h @@ -268,6 +268,7 @@ constexpr auto kWindowSize = "window_size"; constexpr auto kPaddings = "paddings"; constexpr auto kInput_size = "input_size"; constexpr auto kHidden_size = "hidden_size"; +constexpr auto kProjection_size = "proj_size"; constexpr auto kChannelShared = "channel_shared"; constexpr auto kSlope = "slope"; constexpr auto kBase = "base"; diff --git a/mindspore/lite/BUILD.gn b/mindspore/lite/BUILD.gn index f7e465e2..9318d54e 100644 --- a/mindspore/lite/BUILD.gn +++ b/mindspore/lite/BUILD.gn @@ -602,6 +602,8 @@ all_train_sources = [ "src/train/optimizer/fusion/matmul_activation_fusion_pass.cc", "src/train/optimizer/fusion/reshape_gather_reshape_fusion_pass.cc", "src/train/optimizer/fusion/gru_fusion_pass.cc", + "src/train/optimizer/fusion/matmul_add_fusion_pass.cc", + "src/train/optimizer/fusion/matmul_matmul_add_fusion_pass.cc", "src/common/storage.cc", "tools/converter/optimizer.cc", "tools/converter/legacy_optimizer/fusion/fusion_pass.cc", @@ -646,6 +648,7 @@ fp32_train_kernel_sources = [ "src/litert/kernel/cpu/fp32_grad/convolution.cc", "src/litert/kernel/cpu/fp32_grad/convolution_grad_filter.cc", "src/litert/kernel/cpu/fp32_grad/convolution_grad_input.cc", + "src/litert/kernel/cpu/fp32_grad/custom_gather_d_grad_v2_fp32.cc", "src/litert/kernel/cpu/fp32_grad/deconvolution_grad_filter.cc", "src/litert/kernel/cpu/fp32_grad/dropout.cc", "src/litert/kernel/cpu/fp32_grad/dropout_grad.cc", diff --git a/mindspore/lite/CMakeLists.txt b/mindspore/lite/CMakeLists.txt index 1faf2f38..f2b5809f 100644 --- a/mindspore/lite/CMakeLists.txt +++ b/mindspore/lite/CMakeLists.txt @@ -977,7 +977,7 @@ if(MSLITE_MINDDATA_IMPLEMENT STREQUAL "lite" OR MSLITE_MINDDATA_IMPLEMENT STREQU endif() add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/src/common/ops) -if(ANDROID_NDK_TOOLCHAIN_INCLUDED OR TARGET_OHOS_LITE OR TARGET_HIMIX) 
+if(ANDROID_NDK_TOOLCHAIN_INCLUDED OR TARGET_OHOS_LITE OR TARGET_HIMIX OR TARGET_OHOS) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/tools/converter/micro/coder) endif() diff --git a/mindspore/lite/schema/inner/ops_generated.h b/mindspore/lite/schema/inner/ops_generated.h index c4fd8c15..6c861aa5 100644 --- a/mindspore/lite/schema/inner/ops_generated.h +++ b/mindspore/lite/schema/inner/ops_generated.h @@ -11338,6 +11338,7 @@ struct LSTMT : public flatbuffers::NativeTable { float dropout = 0.0f; float zoneout_cell = 0.0f; float zoneout_hidden = 0.0f; + int64_t proj_size = 0; }; struct LSTM FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { @@ -11355,7 +11356,8 @@ struct LSTM FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { VT_NUM_DIRECTIONS = 14, VT_DROPOUT = 16, VT_ZONEOUT_CELL = 18, - VT_ZONEOUT_HIDDEN = 20 + VT_ZONEOUT_HIDDEN = 20, + VT_PROJ_SIZE = 22 }; bool bidirectional() const { return GetField(VT_BIDIRECTIONAL, 0) != 0; @@ -11411,6 +11413,12 @@ struct LSTM FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { bool mutate_zoneout_hidden(float _zoneout_hidden) { return SetField(VT_ZONEOUT_HIDDEN, _zoneout_hidden, 0.0f); } + int64_t proj_size() const { + return GetField(VT_PROJ_SIZE, 0); + } + bool mutate_proj_size(int64_t _proj_size) { + return SetField(VT_PROJ_SIZE, _proj_size, 0); + } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyField(verifier, VT_BIDIRECTIONAL) && @@ -11422,6 +11430,7 @@ struct LSTM FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { VerifyField(verifier, VT_DROPOUT) && VerifyField(verifier, VT_ZONEOUT_CELL) && VerifyField(verifier, VT_ZONEOUT_HIDDEN) && + VerifyField(verifier, VT_PROJ_SIZE) && verifier.EndTable(); } LSTMT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; @@ -11460,6 +11469,9 @@ struct LSTMBuilder { void add_zoneout_hidden(float zoneout_hidden) { fbb_.AddElement(LSTM::VT_ZONEOUT_HIDDEN, zoneout_hidden, 0.0f); } + void add_proj_size(int64_t proj_size) { + fbb_.AddElement(LSTM::VT_PROJ_SIZE, proj_size, 0); + } explicit LSTMBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); @@ -11481,8 +11493,10 @@ inline flatbuffers::Offset CreateLSTM( int64_t num_directions = 0, float dropout = 0.0f, float zoneout_cell = 0.0f, - float zoneout_hidden = 0.0f) { + float zoneout_hidden = 0.0f, + int64_t proj_size = 0) { LSTMBuilder builder_(_fbb); + builder_.add_proj_size(proj_size); builder_.add_num_directions(num_directions); builder_.add_num_layers(num_layers); builder_.add_hidden_size(hidden_size); @@ -23571,6 +23585,7 @@ inline void LSTM::UnPackTo(LSTMT *_o, const flatbuffers::resolver_function_t *_r { auto _e = dropout(); _o->dropout = _e; } { auto _e = zoneout_cell(); _o->zoneout_cell = _e; } { auto _e = zoneout_hidden(); _o->zoneout_hidden = _e; } + { auto _e = proj_size(); _o->proj_size = _e; } } inline flatbuffers::Offset LSTM::Pack(flatbuffers::FlatBufferBuilder &_fbb, const LSTMT* _o, const flatbuffers::rehasher_function_t *_rehasher) { @@ -23590,6 +23605,7 @@ inline flatbuffers::Offset CreateLSTM(flatbuffers::FlatBufferBuilder &_fbb auto _dropout = _o->dropout; auto _zoneout_cell = _o->zoneout_cell; auto _zoneout_hidden = _o->zoneout_hidden; + auto _proj_size = _o->proj_size; return mindspore::schema::CreateLSTM( _fbb, _bidirectional, @@ -23600,7 +23616,8 @@ inline flatbuffers::Offset CreateLSTM(flatbuffers::FlatBufferBuilder &_fbb _num_directions, _dropout, _zoneout_cell, - _zoneout_hidden); + _zoneout_hidden, + _proj_size); } 
inline LSTMGradT *LSTMGrad::UnPack(const flatbuffers::resolver_function_t *_resolver) const { diff --git a/mindspore/lite/schema/ops.fbs b/mindspore/lite/schema/ops.fbs index 76caf810..920c0d31 100644 --- a/mindspore/lite/schema/ops.fbs +++ b/mindspore/lite/schema/ops.fbs @@ -688,6 +688,7 @@ table LSTM { dropout: float; zoneout_cell: float = 0; zoneout_hidden: float = 0; + proj_size: long = 0; } table LSTMGrad { diff --git a/mindspore/lite/schema/ops_generated.h b/mindspore/lite/schema/ops_generated.h index 2f792706..8d387e9d 100644 --- a/mindspore/lite/schema/ops_generated.h +++ b/mindspore/lite/schema/ops_generated.h @@ -7046,7 +7046,8 @@ struct LSTM FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { VT_NUM_DIRECTIONS = 14, VT_DROPOUT = 16, VT_ZONEOUT_CELL = 18, - VT_ZONEOUT_HIDDEN = 20 + VT_ZONEOUT_HIDDEN = 20, + VT_PROJ_SIZE = 22 }; bool bidirectional() const { return GetField(VT_BIDIRECTIONAL, 0) != 0; @@ -7075,6 +7076,9 @@ struct LSTM FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { float zoneout_hidden() const { return GetField(VT_ZONEOUT_HIDDEN, 0.0f); } + int64_t proj_size() const { + return GetField(VT_PROJ_SIZE, 0); + } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyField(verifier, VT_BIDIRECTIONAL) && @@ -7086,6 +7090,7 @@ struct LSTM FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { VerifyField(verifier, VT_DROPOUT) && VerifyField(verifier, VT_ZONEOUT_CELL) && VerifyField(verifier, VT_ZONEOUT_HIDDEN) && + VerifyField(verifier, VT_PROJ_SIZE) && verifier.EndTable(); } }; @@ -7121,6 +7126,9 @@ struct LSTMBuilder { void add_zoneout_hidden(float zoneout_hidden) { fbb_.AddElement(LSTM::VT_ZONEOUT_HIDDEN, zoneout_hidden, 0.0f); } + void add_proj_size(int64_t proj_size) { + fbb_.AddElement(LSTM::VT_PROJ_SIZE, proj_size, 0); + } explicit LSTMBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); @@ -7142,8 +7150,10 @@ inline flatbuffers::Offset CreateLSTM( int64_t num_directions = 0, float dropout = 0.0f, float zoneout_cell = 0.0f, - float zoneout_hidden = 0.0f) { + float zoneout_hidden = 0.0f, + int64_t proj_size = 0) { LSTMBuilder builder_(_fbb); + builder_.add_proj_size(proj_size); builder_.add_num_directions(num_directions); builder_.add_num_layers(num_layers); builder_.add_hidden_size(hidden_size); diff --git a/mindspore/lite/src/CMakeLists.txt b/mindspore/lite/src/CMakeLists.txt index de1781cd..469bcb6b 100644 --- a/mindspore/lite/src/CMakeLists.txt +++ b/mindspore/lite/src/CMakeLists.txt @@ -337,6 +337,8 @@ set(TRAIN_SRC ${CMAKE_CURRENT_SOURCE_DIR}/train/optimizer/common/fusion_utils.cc ${CMAKE_CURRENT_SOURCE_DIR}/train/optimizer/fusion/gru_fusion_pass.cc ${CMAKE_CURRENT_SOURCE_DIR}/train/optimizer/fusion/matmul_activation_fusion_pass.cc + ${CMAKE_CURRENT_SOURCE_DIR}/train/optimizer/fusion/matmul_add_fusion_pass.cc + ${CMAKE_CURRENT_SOURCE_DIR}/train/optimizer/fusion/matmul_matmul_add_fusion_pass.cc ${CMAKE_CURRENT_SOURCE_DIR}/train/optimizer/fusion/reshape_gather_reshape_fusion_pass.cc ${TOOLS_DIR}/converter/optimizer.cc ${TOOLS_DIR}/converter/legacy_optimizer/fusion/fusion_pass.cc diff --git a/mindspore/lite/src/common/ops/ops_def.cc b/mindspore/lite/src/common/ops/ops_def.cc index e5c7f5ca..baa2497a 100644 --- a/mindspore/lite/src/common/ops/ops_def.cc +++ b/mindspore/lite/src/common/ops/ops_def.cc @@ -688,6 +688,7 @@ OP_ATTR(num_directions, long) OP_ATTR(dropout, float) OP_ATTR_WITH_VALUE(zoneout_cell, float, 0) OP_ATTR_WITH_VALUE(zoneout_hidden, float, 0) 
+OP_ATTR_WITH_VALUE(proj_size, long, 0) OP_SCHEMA_DEF_END(LSTM) OP_SCHEMA_DEF(LSTMGrad) diff --git a/mindspore/lite/src/common/ops/populate/custom_populate.cc b/mindspore/lite/src/common/ops/populate/custom_populate.cc index 13957ed7..6c490130 100644 --- a/mindspore/lite/src/common/ops/populate/custom_populate.cc +++ b/mindspore/lite/src/common/ops/populate/custom_populate.cc @@ -22,6 +22,7 @@ #include "nnacl/custom_masked_fill_parameter.h" #include "nnacl/custom_is_inf_parameter.h" #include "nnacl/custom_tensor_scatter_max_parameter.h" +#include "nnacl/custom_gather_d_grad_v2_parameter.h" using mindspore::schema::PrimitiveType_Custom; namespace mindspore { @@ -128,6 +129,33 @@ OpParameter *CreateCustomMaskedFillParameter() { return reinterpret_cast(param); } +OpParameter *CreateCustomGatherDGradV2Parameter(const schema::Custom *value) { + if (value->attr()->size() < 1) { + return nullptr; + } + auto *param = static_cast(malloc(sizeof(CustomGatherGradV2Parameter))); + if (param == nullptr) { + MS_LOG(ERROR) << "malloc CustomGruParameter failed."; + return nullptr; + } + + std::string dim_str; + auto attrs = value->attr(); + for (size_t i = 0; i < attrs->size(); i++) { + auto attr = attrs->Get(i); + if (attr->name()->str() == "dim") { + auto data = attr->data(); + dim_str = std::string(reinterpret_cast(data->Data()), data->size()); + break; + } + } + + memset(param, 0, sizeof(CustomGatherGradV2Parameter)); + param->dim = std::stoi(dim_str.c_str()); + param->op_parameter_.type_ = PrimType_Inner_CustomGatherDGradV2; + return reinterpret_cast(param); +} + OpParameter *PopulateCustomParameter(const void *prim) { MS_CHECK_TRUE_RET(prim != nullptr, nullptr); auto primitive = static_cast(prim); @@ -167,6 +195,8 @@ OpParameter *PopulateCustomParameter(const void *prim) { return CreateCustomGruParameter(); } else if (type == "CastGatherReduceFusion") { return CreateParam(PrimType_Inner_CastGatherReduceFusion); + } else if (type == "GatherDGradV2") { + return CreateCustomGatherDGradV2Parameter(value); } else if (type == "ThirdPartyModel") { auto *param = static_cast(malloc(sizeof(CustomParameter))); if (param == nullptr) { diff --git a/mindspore/lite/src/common/ops/populate/lstm_populate.cc b/mindspore/lite/src/common/ops/populate/lstm_populate.cc index 522da7ad..b3a85b64 100644 --- a/mindspore/lite/src/common/ops/populate/lstm_populate.cc +++ b/mindspore/lite/src/common/ops/populate/lstm_populate.cc @@ -37,8 +37,12 @@ OpParameter *PopulateLstmParameter(const void *prim) { param->op_parameter_.type_ = primitive->value_type(); param->bidirectional_ = value->bidirectional(); + param->has_bias_ = value->has_bias(); + param->input_size_ = value->input_size(); + param->hidden_size_ = value->hidden_size(); param->zoneout_cell_ = value->zoneout_cell(); param->zoneout_hidden_ = value->zoneout_hidden(); + param->project_size_ = value->proj_size(); return reinterpret_cast(param); } diff --git a/mindspore/lite/src/common/prim_util.cc b/mindspore/lite/src/common/prim_util.cc index 5ded05e9..7263775a 100644 --- a/mindspore/lite/src/common/prim_util.cc +++ b/mindspore/lite/src/common/prim_util.cc @@ -29,11 +29,25 @@ static std::set kTensorListOps = { schema::PrimitiveType_TensorListReserve, schema::PrimitiveType_TensorListSetItem, schema::PrimitiveType_TensorListStack}; -static const char *const kInnerOpNames[C10NUM] = {"Inner_ToFormat", "Inner_GltextureToOpencl", - "Inner_Identity", "Inner_ShapeFusion", - "Inner_GraphKernel", "Inner_SplitReduceConcatFusion", - "Inner_EncoderLayer", "Inner_DecoderLayer", - 
"Inner_UsePastEmbedding", "Inner_CustomGru"}; +static const char *const kInnerOpNames[C20NUM] = {"Inner_ToFormat", + "Inner_GltextureToOpencl", + "Inner_Identity", + "Inner_ShapeFusion", + "Inner_GraphKernel", + "Inner_SplitReduceConcatFusion", + "Inner_EncoderLayer", + "PrimType_Inner_FseDecode", + "Inner_DecoderLayer", + "Inner_UsePastEmbedding", + "Inner_CustomGru", + "PrimType_Inner_CastGatherReduceFusion", + "PrimType_Inner_ReduceConcatFusion", + "PrimType_Inner_ThirdPartyModel", + "PrimType_Inner_CustomMaskedFill", + "PrimType_Inner_CustomTensorScatterMax", + "PrimType_Inner_CustomIsInf", + "PrimType_Inner_CustomGatherDGradV2"}; + int GetPrimitiveType(const void *primitive, int schema_version) { if (primitive == nullptr) { return -1; diff --git a/mindspore/lite/src/litert/kernel/cpu/BUILD.gn b/mindspore/lite/src/litert/kernel/cpu/BUILD.gn index 65065b5b..7b813314 100644 --- a/mindspore/lite/src/litert/kernel/cpu/BUILD.gn +++ b/mindspore/lite/src/litert/kernel/cpu/BUILD.gn @@ -85,6 +85,9 @@ cpu_kernel_sources = [ "fp32/invert_permutation_fp32.cc", "fp32/l2_norm_fp32.cc", "fp32/lstm_fp32.cc", + "fp32/lstm_fp32_base.cc", + "fp32/lstm_mindir_fp32.cc", + "fp32/lstm_non_mindir_fp32.cc", "fp32/matmul_fp32_arm32.cc", "fp32/matmul_fp32_arm64.cc", "fp32/matmul_fp32_avx512.cc", @@ -174,6 +177,9 @@ fp16_kernel_sources = [ "fp16/instance_norm_fp16.cc", "fp16/layout_transform_fp16.cc", "fp16/lstm_fp16.cc", + "fp16/lstm_fp16_base.cc", + "fp16/lstm_mindir_fp16.cc", + "fp16/lstm_non_mindir_fp16.cc", "fp16/matmul_base_fp16.cc", "fp16/matmul_fp16.cc", "fp16/power_fp16.cc", diff --git a/mindspore/lite/src/litert/kernel/cpu/fp16/gru_fp16.cc b/mindspore/lite/src/litert/kernel/cpu/fp16/gru_fp16.cc index 232bbe44..89945e1c 100644 --- a/mindspore/lite/src/litert/kernel/cpu/fp16/gru_fp16.cc +++ b/mindspore/lite/src/litert/kernel/cpu/fp16/gru_fp16.cc @@ -100,10 +100,10 @@ int GruFp16CPUKernel::InitInputWeightBias() { } if (weight_g->data_type() == kNumberTypeFloat32) { PackLstmWeightFp32ToFp16(weight_g_ptr_, reinterpret_cast(weight_g->data()), weight_batch_, - gru_param_->input_size_, gru_param_->hidden_size_, gru_param_->input_col_align_); + gru_param_->input_size_, gru_param_->hidden_size_, gru_param_->input_col_align_, nullptr); } else if (weight_g->data_type() == kNumberTypeFloat16) { PackLstmWeightFp16(weight_g_ptr_, reinterpret_cast(weight_g->data()), weight_batch_, - gru_param_->input_size_, gru_param_->hidden_size_, gru_param_->input_col_align_); + gru_param_->input_size_, gru_param_->hidden_size_, gru_param_->input_col_align_, nullptr); } else { MS_LOG(ERROR) << "Unsupported data type of weight_g tensor for gru."; return RET_ERROR; @@ -120,10 +120,10 @@ int GruFp16CPUKernel::InitInputWeightBias() { memset(input_bias_, 0, weight_batch_ * gru_param_->input_col_align_ * sizeof(float16_t)); if (bias->data_type() == kNumberTypeFloat32) { PackLstmBiasFp32ToFp16(input_bias_, reinterpret_cast(bias->data()), weight_batch_, - gru_param_->hidden_size_, gru_param_->input_col_align_, gru_param_->bidirectional_); + gru_param_->hidden_size_, gru_param_->input_col_align_, gru_param_->bidirectional_, nullptr); } else if (bias->data_type() == kNumberTypeFloat16) { PackLstmBiasFp16(input_bias_, reinterpret_cast(bias->data()), weight_batch_, gru_param_->hidden_size_, - gru_param_->input_col_align_, gru_param_->bidirectional_); + gru_param_->input_col_align_, gru_param_->bidirectional_, nullptr); } else { MS_LOG(ERROR) << "Unsupported data type of bias tensor for gru."; return RET_ERROR; @@ -148,10 +148,10 @@ int 
GruFp16CPUKernel::InitStateWeightBias() { if (!is_vec_) { if (weight_r->data_type() == kNumberTypeFloat32) { PackLstmWeightFp32ToFp16(weight_r_ptr_, reinterpret_cast(weight_r->data()), weight_batch_, - gru_param_->hidden_size_, gru_param_->hidden_size_, gru_param_->state_col_align_); + gru_param_->hidden_size_, gru_param_->hidden_size_, gru_param_->state_col_align_, nullptr); } else if (weight_r->data_type() == kNumberTypeFloat16) { PackLstmWeightFp16(weight_r_ptr_, reinterpret_cast(weight_r->data()), weight_batch_, - gru_param_->hidden_size_, gru_param_->hidden_size_, gru_param_->state_col_align_); + gru_param_->hidden_size_, gru_param_->hidden_size_, gru_param_->state_col_align_, nullptr); } else { MS_LOG(ERROR) << "Unsupported data type of weight_r tensor for gru."; return RET_ERROR; @@ -179,11 +179,11 @@ int GruFp16CPUKernel::InitStateWeightBias() { if (bias->data_type() == kNumberTypeFloat32) { auto state_bias_data = reinterpret_cast(bias->data()) + gate_num * gru_param_->hidden_size_; PackLstmBiasFp32ToFp16(state_bias_, state_bias_data, weight_batch_, gru_param_->hidden_size_, - gru_param_->state_col_align_, gru_param_->bidirectional_); + gru_param_->state_col_align_, gru_param_->bidirectional_, nullptr); } else if (bias->data_type() == kNumberTypeFloat16) { auto state_bias_data = reinterpret_cast(bias->data()) + gate_num * gru_param_->hidden_size_; PackLstmBiasFp16(state_bias_, state_bias_data, weight_batch_, gru_param_->hidden_size_, - gru_param_->state_col_align_, gru_param_->bidirectional_); + gru_param_->state_col_align_, gru_param_->bidirectional_, nullptr); } else { MS_LOG(ERROR) << "Unsupported data type of bias tensor for gru."; return RET_ERROR; diff --git a/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_fp16.cc b/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_fp16.cc index b583358a..bd99b791 100644 --- a/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_fp16.cc +++ b/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_fp16.cc @@ -1,5 +1,5 @@ /** - * Copyright 2021 Huawei Technologies Co., Ltd + * Copyright 2021-2023 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
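Editor's note: the rewrite of lstm_fp16.cc below replaces the monolithic LstmFp16CPUKernel with a shared base class (lstm_fp16_base) plus two concrete kernels, and the registered creator chooses between them purely by the number of input tensors. A simplified, illustrative sketch of that decision follows; only the class and tensor layouts come from the patch, everything else is invented for the example.

#include <cstddef>

enum class LstmFp16KernelKind { kMindir, kNonMindir };

// MindIR-exported graphs pack weight_ih, weight_hh and bias into one tensor, so the
// kernel sees 4 inputs {x, h_init, c_init, weight_bias}; the non-MindIR packaging keeps
// them separate ({x, weight_ih, weight_hh, bias, h_init, c_init[, weight_project]}) and
// is handled by the non-MindIR kernel.
LstmFp16KernelKind SelectLstmFp16Kernel(std::size_t input_tensor_count) {
  constexpr std::size_t kMindirInputTensorNum = 4;
  return input_tensor_count == kMindirInputTensorNum ? LstmFp16KernelKind::kMindir
                                                     : LstmFp16KernelKind::kNonMindir;
}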
@@ -16,13 +16,9 @@ #include "src/litert/kernel/cpu/fp16/lstm_fp16.h" #include -#include -#include "schema/model_generated.h" +#include "src/litert/kernel/cpu/fp16/lstm_mindir_fp16.h" +#include "src/litert/kernel/cpu/fp16/lstm_non_mindir_fp16.h" #include "src/litert/kernel_registry.h" -#include "include/errorcode.h" -#include "nnacl/fp16/lstm_fp16.h" -#include "nnacl/fp16/cast_fp16.h" -#include "nnacl/errorcode.h" using mindspore::kernel::KERNEL_ARCH; using mindspore::lite::KernelRegistrar; @@ -31,389 +27,34 @@ using mindspore::lite::RET_OK; using mindspore::schema::PrimitiveType_LSTM; namespace mindspore::kernel { -void LstmFp16CPUKernel::FreeTmpBuffer() { - if (weight_i_ptr_ != nullptr) { - free(weight_i_ptr_); - weight_i_ptr_ = nullptr; - } - if (input_bias_ != nullptr) { - free(input_bias_); - input_bias_ = nullptr; - } - if (weight_h_ptr_ != nullptr) { - free(weight_h_ptr_); - weight_h_ptr_ = nullptr; - } - if (state_bias_ != nullptr) { - free(state_bias_); - state_bias_ = nullptr; - } - if (weight_project_ptr_ != nullptr) { - free(weight_project_ptr_); - weight_project_ptr_ = nullptr; - } - if (project_bias_ != nullptr) { - free(project_bias_); - project_bias_ = nullptr; - } -} - -void LstmFp16CPUKernel::FreeRunBuffer() { - ms_context_->allocator->Free(buffer_[packed_input_index]); - ms_context_->allocator->Free(buffer_[input_gate_index]); - if (!is_vec_) { - ms_context_->allocator->Free(buffer_[packed_state_index]); - } - ms_context_->allocator->Free(buffer_[state_gate_index]); - if (!(lstm_param_->zoneout_cell_ >= -FLT_EPSILON && lstm_param_->zoneout_cell_ <= FLT_EPSILON)) { - ms_context_->allocator->Free(buffer_[cell_state_index]); - } - if (!(lstm_param_->zoneout_hidden_ >= -FLT_EPSILON && lstm_param_->zoneout_hidden_ <= FLT_EPSILON)) { - ms_context_->allocator->Free(buffer_[hidden_state_index]); - } -} - -int LstmFp16CPUKernel::InitParam() { - auto input = in_tensors_.front(); - std::vector in_shape = input->shape(); - lstm_param_->seq_len_ = in_shape.at(0); - lstm_param_->batch_ = in_shape.at(1); - lstm_param_->input_size_ = in_shape.at(kNHWC_W); - - auto weight_i = in_tensors_.at(1); - std::vector w_shape = weight_i->shape(); - NNACL_CHECK_ZERO_RETURN_ERR(gate_num); - lstm_param_->hidden_size_ = w_shape.at(1) / gate_num; - - auto weight_h = in_tensors_.at(C2NUM); - auto h_shape = weight_h->shape(); - lstm_param_->project_size_ = h_shape.back(); - - const int twice = 2; - lstm_param_->output_step_ = lstm_param_->bidirectional_ ? twice * lstm_param_->batch_ * lstm_param_->hidden_size_ - : lstm_param_->batch_ * lstm_param_->hidden_size_; - weight_batch_ = lstm_param_->bidirectional_ ? twice * gate_num : gate_num; - lstm_param_->input_row_align_ = UP_ROUND(lstm_param_->seq_len_ * lstm_param_->batch_, C16NUM); - lstm_param_->input_col_align_ = UP_ROUND(lstm_param_->hidden_size_, C8NUM); - - is_vec_ = lstm_param_->batch_ == 1; - lstm_param_->state_row_align_ = is_vec_ ? lstm_param_->batch_ : UP_ROUND(lstm_param_->batch_, C16NUM); - lstm_param_->state_col_align_ = is_vec_ ? 
lstm_param_->hidden_size_ : UP_ROUND(lstm_param_->hidden_size_, C8NUM); - return RET_OK; -} - -int LstmFp16CPUKernel::InitInputWeightBias() { - // malloc and init input * weight right matrix buffer - // input -- row: seq_len * batch; col: input_size - // weight -- row: hidden_size; col: input_size, need transpose - // result -- row: seq_len * batch; col: hidden_size - auto weight_i = in_tensors_.at(1); - auto weight_i_data = weight_i->data(); - CHECK_NULL_RETURN(weight_i_data); - weight_i_ptr_ = reinterpret_cast( - malloc(weight_batch_ * lstm_param_->input_col_align_ * lstm_param_->input_size_ * sizeof(float16_t))); - if (weight_i_ptr_ == nullptr) { - MS_LOG(ERROR) << "LstmFp16CPUKernel malloc weight_i_ptr_ error."; - return RET_ERROR; - } - if (weight_i->data_type() == kNumberTypeFloat32) { - PackLstmWeightFp32ToFp16(weight_i_ptr_, reinterpret_cast(weight_i_data), weight_batch_, - lstm_param_->input_size_, lstm_param_->hidden_size_, lstm_param_->input_col_align_); - } else if (weight_i->data_type() == kNumberTypeFloat16) { - PackLstmWeightFp16(weight_i_ptr_, reinterpret_cast(weight_i_data), weight_batch_, - lstm_param_->input_size_, lstm_param_->hidden_size_, lstm_param_->input_col_align_); +namespace { +constexpr size_t kMindirInputTensorNum = 4; +} // namespace + +LiteKernel *LstmFp16KernelCreator(const std::vector &inputs, const std::vector &outputs, + OpParameter *parameter, const lite::InnerContext *ctx, const kernel::KernelKey &desc) { + if (parameter == nullptr) { + MS_LOG(ERROR) << "parameter is nullptr."; + return nullptr; + } + if (desc.data_type == kTypeUnknown) { + MS_LOG(WARNING) << "desc data_type is unknown."; + } + LiteKernel *kernel{nullptr}; + if (inputs.size() == kMindirInputTensorNum) { + kernel = new (std::nothrow) + LstmMindirFp16CPUKernel(parameter, inputs, outputs, static_cast(ctx)); } else { - MS_LOG(ERROR) << "Unsupported data type of weight_i tensor for lstm."; - return RET_ERROR; - } - - // input bias - auto bias = in_tensors_.at(FOURTH_INPUT); - auto bias_data = bias->data(); - CHECK_NULL_RETURN(bias_data); - input_bias_ = - reinterpret_cast(malloc(weight_batch_ * lstm_param_->input_col_align_ * sizeof(float16_t))); - if (input_bias_ == nullptr) { - MS_LOG(ERROR) << "LstmFp16CPUKernel malloc input_bias_ error."; - return RET_ERROR; - } - memset(input_bias_, 0, weight_batch_ * lstm_param_->input_col_align_ * sizeof(float16_t)); - if (bias->data_type() == kNumberTypeFloat32) { - PackLstmBiasFp32ToFp16(input_bias_, reinterpret_cast(bias_data), weight_batch_, lstm_param_->hidden_size_, - lstm_param_->input_col_align_, lstm_param_->bidirectional_); - } else if (bias->data_type() == kNumberTypeFloat16) { - PackLstmBiasFp16(input_bias_, reinterpret_cast(bias_data), weight_batch_, lstm_param_->hidden_size_, - lstm_param_->input_col_align_, lstm_param_->bidirectional_); - } else { - MS_LOG(ERROR) << "Unsupported data type of bias tensor for lstm."; - return RET_ERROR; - } - return RET_OK; -} - -int LstmFp16CPUKernel::InitStateWeightBias() { - // malloc and init state * weight right matrix buffer, state * weight will be executed seq_len_ times. 
- // state -- row: batch; col: hidden_size - // weight -- row: hidden_size; col: hidden_size, need transpose - // result -- row: batch; col: hidden_size - auto weight_h = in_tensors_.at(THIRD_INPUT); - auto weight_h_data = weight_h->data(); - CHECK_NULL_RETURN(weight_h_data); - weight_h_ptr_ = reinterpret_cast( - malloc(weight_batch_ * lstm_param_->state_col_align_ * lstm_param_->project_size_ * sizeof(float16_t))); - if (weight_h_ptr_ == nullptr) { - MS_LOG(ERROR) << "LstmFp16CPUKernel malloc weight_h_ptr_ error."; - return RET_ERROR; - } - - if (!is_vec_) { - if (weight_h->data_type() == kNumberTypeFloat32) { - PackLstmWeightFp32ToFp16(weight_h_ptr_, reinterpret_cast(weight_h_data), weight_batch_, - lstm_param_->project_size_, lstm_param_->hidden_size_, lstm_param_->state_col_align_); - } else if (weight_h->data_type() == kNumberTypeFloat16) { - PackLstmWeightFp16(weight_h_ptr_, reinterpret_cast(weight_h_data), weight_batch_, - lstm_param_->project_size_, lstm_param_->hidden_size_, lstm_param_->state_col_align_); - } else { - MS_LOG(ERROR) << "Unsupported data type of weight_h tensor for lstm."; - return RET_ERROR; - } - } else { - if (weight_h->data_type() == kNumberTypeFloat32) { - Float32ToFloat16(reinterpret_cast(weight_h_data), weight_h_ptr_, weight_h->ElementsNum()); - } else if (weight_h->data_type() == kNumberTypeFloat16) { - memcpy(weight_h_ptr_, reinterpret_cast(weight_h_data), weight_h->Size()); - } else { - MS_LOG(ERROR) << "Unsupported data type of weight_h tensor for lstm."; - return RET_ERROR; - } - } - - // state bias - auto bias = in_tensors_.at(FOURTH_INPUT); - auto bias_data = bias->data(); - CHECK_NULL_RETURN(bias_data); - state_bias_ = - reinterpret_cast(malloc(weight_batch_ * lstm_param_->state_col_align_ * sizeof(float16_t))); - if (state_bias_ == nullptr) { - MS_LOG(ERROR) << "LstmFp16CPUKernel malloc state_bias_ error."; - return RET_ERROR; - } - memset(state_bias_, 0, weight_batch_ * lstm_param_->state_col_align_ * sizeof(float16_t)); - if (bias->data_type() == kNumberTypeFloat32) { - auto state_bias_data = reinterpret_cast(bias_data) + gate_num * lstm_param_->hidden_size_; - PackLstmBiasFp32ToFp16(state_bias_, state_bias_data, weight_batch_, lstm_param_->hidden_size_, - lstm_param_->state_col_align_, lstm_param_->bidirectional_); - } else if (bias->data_type() == kNumberTypeFloat16) { - auto state_bias_data = reinterpret_cast(bias_data) + gate_num * lstm_param_->hidden_size_; - PackLstmBiasFp16(state_bias_, state_bias_data, weight_batch_, lstm_param_->hidden_size_, - lstm_param_->state_col_align_, lstm_param_->bidirectional_); - } else { - MS_LOG(ERROR) << "Unsupported data type of bias tensor for lstm."; - return RET_ERROR; - } - return RET_OK; -} - -int LstmFp16CPUKernel::InitProjectWeight() { - if (in_tensors_.size() < C7NUM) { - return RET_OK; - } - auto weight_pro = in_tensors_.at(SEVENTH_INPUT); - auto shape = weight_pro->shape(); - if (shape.size() != C3NUM) { - MS_LOG(ERROR) << "Project-weight's shape must be 3D."; - return RET_ERROR; - } - auto weight_pro_data = weight_pro->data(); - CHECK_NULL_RETURN(weight_pro_data); - int batch = lstm_param_->bidirectional_ ? C2NUM : C1NUM; - if (shape[0] != batch) { - MS_LOG(ERROR) << "Project-weight's shape[0] must be 1(bidirectional=false) or 2(bidirectional=true)."; - return RET_ERROR; + kernel = new (std::nothrow) + LstmNonMindirFp16CPUKernel(parameter, inputs, outputs, static_cast(ctx)); } - int pro_col_align = is_vec_ ? 
lstm_param_->project_size_ : UP_ROUND(lstm_param_->project_size_, C8NUM); - weight_project_ptr_ = - reinterpret_cast(malloc(batch * lstm_param_->hidden_size_ * pro_col_align * sizeof(float16_t))); - if (weight_project_ptr_ == nullptr) { - MS_LOG(ERROR) << "LstmFp16CPUKernel malloc weight_project_ptr_ error."; - return RET_ERROR; - } - - if (!is_vec_) { - if (weight_pro->data_type() == kNumberTypeFloat32) { - PackLstmWeightFp32ToFp16(weight_project_ptr_, reinterpret_cast(weight_pro_data), batch, - lstm_param_->hidden_size_, lstm_param_->project_size_, pro_col_align); - } else if (weight_pro->data_type() == kNumberTypeFloat16) { - PackLstmWeightFp16(weight_project_ptr_, reinterpret_cast(weight_pro_data), batch, - lstm_param_->hidden_size_, lstm_param_->project_size_, pro_col_align); - } else { - MS_LOG(ERROR) << "Unsupported data type of weight_project tensor for lstm."; - return RET_ERROR; - } - } else { - if (weight_pro->data_type() == kNumberTypeFloat32) { - Float32ToFloat16(reinterpret_cast(weight_pro_data), weight_project_ptr_, weight_pro->ElementsNum()); - } else if (weight_pro->data_type() == kNumberTypeFloat16) { - memcpy(weight_project_ptr_, weight_pro_data, weight_pro->Size()); - } else { - MS_LOG(ERROR) << "Unsupported data type of weight_project tensor for lstm."; - return RET_ERROR; - } - } - size_t bias_size = UP_ROUND(lstm_param_->project_size_, C8NUM) * sizeof(float16_t); - project_bias_ = reinterpret_cast(malloc(bias_size)); - if (project_bias_ == nullptr) { - MS_LOG(ERROR) << "LstmFp16CPUKernel malloc project_bias_ error."; - return RET_ERROR; - } - (void)memset(project_bias_, 0, bias_size); - return RET_OK; -} - -int LstmFp16CPUKernel::Prepare() { - CHECK_LESS_RETURN(in_tensors_.size(), C6NUM); - for (size_t i = 0; i < in_tensors_.size(); i++) { - CHECK_NULL_RETURN(in_tensors_.at(i)); - } - CHECK_LESS_RETURN(out_tensors_.size(), C3NUM); - for (size_t i = 0; i < out_tensors_.size(); i++) { - CHECK_NULL_RETURN(out_tensors_.at(i)); - } - CHECK_NULL_RETURN(lstm_param_); - if (!InferShapeDone()) { - return RET_OK; - } - return ReSize(); -} - -int LstmFp16CPUKernel::ReSize() { - auto ret = InitParam(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Lstm fp16 InitParam error."; - return RET_ERROR; - } - - FreeTmpBuffer(); - ret = InitInputWeightBias(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Lstm fp16 InitInputWeightBias error."; - FreeTmpBuffer(); - return RET_ERROR; - } - - ret = InitStateWeightBias(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Lstm fp16 InitStateWeightBias error."; - FreeTmpBuffer(); - return RET_ERROR; - } - - ret = InitProjectWeight(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Lstm fp16 InitProjectWeight error."; - FreeTmpBuffer(); - return RET_ERROR; - } - return RET_OK; -} - -int LstmFp16CPUKernel::MallocRunBuffer() { - for (int i = 0; i < C7NUM; i++) { - buffer_[i] = nullptr; - } - buffer_[packed_input_index] = reinterpret_cast( - ms_context_->allocator->Malloc(lstm_param_->input_row_align_ * lstm_param_->input_size_ * sizeof(float16_t))); - if (buffer_[packed_input_index] == nullptr) { - MS_LOG(ERROR) << "LstmFp16CPUKernel malloc input * weight left matirx error."; - return RET_ERROR; - } - - buffer_[input_gate_index] = reinterpret_cast(ms_context_->allocator->Malloc( - gate_num * lstm_param_->seq_len_ * lstm_param_->batch_ * lstm_param_->hidden_size_ * sizeof(float16_t))); - if (buffer_[input_gate_index] == nullptr) { - MS_LOG(ERROR) << "LstmFp16CPUKernel malloc state * weight left matirx error."; - return RET_ERROR; - } - - if (!is_vec_) { - 
buffer_[packed_state_index] = reinterpret_cast( - ms_context_->allocator->Malloc(lstm_param_->state_row_align_ * lstm_param_->project_size_ * sizeof(float16_t))); - if (buffer_[packed_state_index] == nullptr) { - MS_LOG(ERROR) << "LstmFp16CPUKernel malloc state * weight left matirx error."; - return RET_ERROR; - } - } - - buffer_[state_gate_index] = reinterpret_cast( - ms_context_->allocator->Malloc(gate_num * lstm_param_->batch_ * lstm_param_->hidden_size_ * sizeof(float16_t))); - if (buffer_[state_gate_index] == nullptr) { - MS_LOG(ERROR) << "LstmFp16CPUKernel malloc state gate buffer_ error."; - return RET_ERROR; - } - - if (!(lstm_param_->zoneout_cell_ >= -FLT_EPSILON && lstm_param_->zoneout_cell_ <= FLT_EPSILON)) { - int buffer_size = lstm_param_->batch_ * lstm_param_->hidden_size_ * sizeof(float16_t); - buffer_[cell_state_index] = reinterpret_cast(ms_context_->allocator->Malloc(buffer_size)); - if (buffer_[cell_state_index] == nullptr) { - MS_LOG(ERROR) << "LstmFp16CPUKernel malloc state_buffer for cell error."; - return RET_ERROR; - } - } - if (!(lstm_param_->zoneout_hidden_ >= -FLT_EPSILON && lstm_param_->zoneout_hidden_ <= FLT_EPSILON)) { - int buffer_size = lstm_param_->batch_ * lstm_param_->project_size_ * sizeof(float16_t); - buffer_[hidden_state_index] = reinterpret_cast(ms_context_->allocator->Malloc(buffer_size)); - if (buffer_[hidden_state_index] == nullptr) { - MS_LOG(ERROR) << "LstmFp16CPUKernel malloc state_buffer for hidden error."; - return RET_ERROR; - } - } - if (!is_vec_ && in_tensors_.size() == C7NUM) { - buffer_[project_input_index] = reinterpret_cast( - ms_context_->allocator->Malloc(lstm_param_->state_row_align_ * lstm_param_->hidden_size_ * sizeof(float16_t))); - if (buffer_[project_input_index] == nullptr) { - MS_LOG(ERROR) << "LstmFp16CPUKernel malloc project_buffer for hidden error."; - return RET_ERROR; - } - } - return RET_OK; -} - -int LstmFp16CPUKernel::Run() { - auto input = in_tensors_.at(0); - auto input_ptr = reinterpret_cast(input->data()); - CHECK_NULL_RETURN(input_ptr); - auto output = out_tensors_.at(0); - auto output_ptr = reinterpret_cast(output->data()); - CHECK_NULL_RETURN(output_ptr); - - auto hidden_state = in_tensors_.at(FIFTH_INPUT); - CHECK_NULL_RETURN(hidden_state->data()); - auto cell_state = in_tensors_.at(SIXTH_INPUT); - CHECK_NULL_RETURN(cell_state->data()); - - auto output_hidden_state = out_tensors_[1]; - CHECK_NULL_RETURN(output_hidden_state->data()); - memcpy(output_hidden_state->data(), hidden_state->data(), hidden_state->ElementsNum() * sizeof(float16_t)); - auto output_cell_state = out_tensors_[THIRD_INPUT]; - CHECK_NULL_RETURN(output_cell_state->data()); - memcpy(output_cell_state->data(), cell_state->data(), cell_state->ElementsNum() * sizeof(float16_t)); - - auto ret = MallocRunBuffer(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "LstmFp16CPUKernel MallocRunBuffer error."; - FreeRunBuffer(); - return RET_ERROR; + if (kernel == nullptr) { + MS_LOG(ERROR) << "kernel: " << parameter->name_ << "is nullptr."; + free(parameter); + return nullptr; } - CHECK_NULL_RETURN(weight_i_ptr_); - CHECK_NULL_RETURN(weight_h_ptr_); - CHECK_NULL_RETURN(input_bias_); - CHECK_NULL_RETURN(state_bias_); - LstmFp16(output_ptr, input_ptr, weight_i_ptr_, weight_h_ptr_, input_bias_, state_bias_, weight_project_ptr_, - project_bias_, reinterpret_cast(output_hidden_state->data()), - reinterpret_cast(output_cell_state->data()), buffer_, lstm_param_); - FreeRunBuffer(); - return RET_OK; + return kernel; } -REG_KERNEL(kCPU, kNumberTypeFloat16, 
PrimitiveType_LSTM, LiteKernelCreator) +REG_KERNEL(kCPU, kNumberTypeFloat16, PrimitiveType_LSTM, LstmFp16KernelCreator) } // namespace mindspore::kernel diff --git a/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_fp16_base.cc b/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_fp16_base.cc new file mode 100644 index 00000000..767fdef3 --- /dev/null +++ b/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_fp16_base.cc @@ -0,0 +1,270 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/litert/kernel/cpu/fp16/lstm_fp16_base.h" +#include +#include "nnacl/fp16/lstm_fp16.h" + +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; + +namespace mindspore::kernel { +namespace { +constexpr int kGateNum = 4; +constexpr int kTempInputBufferIndex = 0; +constexpr int kTempInputGateBufferIndex = 1; +constexpr int kTempStateBufferIndex = 2; +constexpr int kTempStateGateBufferIndex = 3; +constexpr int kTempCellStateBufferIndex = 4; +constexpr int kTempHiddenStateBufferIndex = 5; +constexpr int kTempProjectInputBufferIndex = 6; +} // namespace + +LstmFp16BaseCPUKernel::~LstmFp16BaseCPUKernel() { FreePackBuffer(); } + +int LstmFp16BaseCPUKernel::Prepare() { + for (size_t i = 0; i < in_tensors_.size(); ++i) { + CHECK_NULL_RETURN(in_tensors_[i]); + } + CHECK_LESS_RETURN(out_tensors_.size(), C3NUM); + for (size_t i = 0; i < out_tensors_.size(); ++i) { + CHECK_NULL_RETURN(out_tensors_[i]); + } + CHECK_NULL_RETURN(lstm_param_); + if (!InferShapeDone()) { + return RET_OK; + } + return ReSize(); +} + +int LstmFp16BaseCPUKernel::ReSize() { + auto ret = InitParam(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "LstmFp16 InitParam failed."; + return RET_ERROR; + } + if (running_pack_) { + return RET_OK; + } + return PackWeightAndBias(); +} + +int LstmFp16BaseCPUKernel::Run() { + auto input_ptr = reinterpret_cast(in_tensors_[FIRST_INPUT]->data()); + CHECK_NULL_RETURN(input_ptr); + auto output_ptr = reinterpret_cast(out_tensors_[FIRST_INPUT]->data()); + CHECK_NULL_RETURN(output_ptr); + + auto hidden_init = in_tensors_[hidden_init_index_]->data(); + CHECK_NULL_RETURN(hidden_init); + auto cell_init = in_tensors_[cell_init_index_]->data(); + CHECK_NULL_RETURN(cell_init); + + auto output_hidden = out_tensors_[SECOND_INPUT]->data(); + CHECK_NULL_RETURN(output_hidden); + (void)memcpy(output_hidden, hidden_init, in_tensors_[hidden_init_index_]->ElementsNum() * sizeof(float16_t)); + auto output_cell = out_tensors_[THIRD_INPUT]->data(); + CHECK_NULL_RETURN(output_cell); + (void)memcpy(output_cell, cell_init, in_tensors_[cell_init_index_]->ElementsNum() * sizeof(float16_t)); + + if (running_pack_) { + auto ret = PackWeightAndBias(); + if (ret != lite::RET_OK) { + MS_LOG(ERROR) << "LstmFp16 PackWeightAndBias failed."; + return ret; + } + } + auto ret = MallocRunBuffer(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "LstmFp16CPUKernel MallocRunBuffer error."; + FreeRunBuffer(); + if (running_pack_) { + FreePackBuffer(); + } + return RET_ERROR; + } + 
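// (editor's note, not in the patch) This single call into the nnacl fp16 LSTM routine runs
// all time steps: the weight/bias tensors were packed in PackWeightAndBias() and the
// temporaries allocated above live in running_buffer_.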
LstmFp16(output_ptr, input_ptr, weight_i_ptr_, weight_h_ptr_, input_bias_, state_bias_, weight_project_ptr_, + project_bias_, reinterpret_cast(output_hidden), reinterpret_cast(output_cell), + running_buffer_, lstm_param_); + FreeRunBuffer(); + if (running_pack_) { + FreePackBuffer(); + } + return RET_OK; +} + +int LstmFp16BaseCPUKernel::InitParam() { + auto in_shape = in_tensors_[FIRST_INPUT]->shape(); + MS_CHECK_TRUE_MSG(in_shape.size() == C3NUM, lite::RET_INPUT_TENSOR_ERROR, + "The dims of LSTM's first input must be 3."); + lstm_param_->seq_len_ = in_shape[0]; + lstm_param_->batch_ = in_shape[1]; + lstm_param_->input_size_ = in_shape.back(); + + auto h_init_shape = in_tensors_.at(hidden_init_index_)->shape(); + auto c_init_shape = in_tensors_.at(cell_init_index_)->shape(); + lstm_param_->hidden_size_ = c_init_shape.back(); + lstm_param_->output_size_ = h_init_shape.back(); + + lstm_param_->output_step_ = lstm_param_->bidirectional_ ? C2NUM * lstm_param_->batch_ * lstm_param_->output_size_ + : lstm_param_->batch_ * lstm_param_->output_size_; + weight_segment_num_ = lstm_param_->bidirectional_ ? C2NUM * kGateNum : kGateNum; +#ifdef ENABLE_ARM64 + lstm_param_->input_row_align_ = UP_ROUND(lstm_param_->seq_len_ * lstm_param_->batch_, C1NUM); + lstm_param_->input_col_align_ = UP_ROUND(lstm_param_->hidden_size_, C4NUM); + + lstm_param_->state_row_align_ = UP_ROUND(lstm_param_->batch_, C1NUM); + lstm_param_->state_col_align_ = UP_ROUND(lstm_param_->hidden_size_, C4NUM); + lstm_param_->proj_col_align_ = UP_ROUND(lstm_param_->output_size_, C4NUM); + weight_need_pack_ = true; +#else + lstm_param_->input_row_align_ = UP_ROUND(lstm_param_->seq_len_ * lstm_param_->batch_, C16NUM); + lstm_param_->input_col_align_ = UP_ROUND(lstm_param_->hidden_size_, C8NUM); + + lstm_param_->state_row_align_ = + lstm_param_->batch_ == 1 ? lstm_param_->batch_ : UP_ROUND(lstm_param_->batch_, C16NUM); + lstm_param_->state_col_align_ = + lstm_param_->batch_ == 1 ? lstm_param_->hidden_size_ : UP_ROUND(lstm_param_->hidden_size_, C8NUM); + lstm_param_->proj_col_align_ = + lstm_param_->batch_ == 1 ? 
lstm_param_->output_size_ : UP_ROUND(lstm_param_->output_size_, C8NUM); + weight_need_pack_ = lstm_param_->batch_ != 1; +#endif + return RET_OK; +} + +int LstmFp16BaseCPUKernel::PackWeightAndBias() { + FreePackBuffer(); + auto ret = InitInputWeightBias(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "LstmFp16 InitInputWeightBias failed."; + FreePackBuffer(); + return RET_ERROR; + } + + ret = InitStateWeightBias(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "LstmFp16 InitStateWeightBias failed."; + FreePackBuffer(); + return RET_ERROR; + } + + ret = InitProjectWeight(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "LstmFp16 InitProjectWeight failed."; + FreePackBuffer(); + return RET_ERROR; + } + return RET_OK; +} + +void LstmFp16BaseCPUKernel::FreePackBuffer() { + for (auto buffer : pack_buffer_) { + if (buffer) { + free(buffer); + } + } + pack_buffer_.clear(); +} + +int LstmFp16BaseCPUKernel::MallocRunBuffer() { + for (int i = 0; i < C7NUM; i++) { + running_buffer_[i] = nullptr; + } + bool need_pack_input = true; +#ifdef ENABLE_ARM64 + need_pack_input = lstm_param_->seq_len_ * lstm_param_->batch_ >= C4NUM; +#endif + if (need_pack_input) { + running_buffer_[kTempInputBufferIndex] = reinterpret_cast( + ms_context_->allocator->Malloc(lstm_param_->input_row_align_ * lstm_param_->input_size_ * sizeof(float16_t))); + if (running_buffer_[kTempInputBufferIndex] == nullptr) { + MS_LOG(ERROR) << "LstmFp16CPUKernel malloc input * weight left matirx error."; + return RET_ERROR; + } + } + + running_buffer_[kTempInputGateBufferIndex] = reinterpret_cast(ms_context_->allocator->Malloc( + kGateNum * lstm_param_->seq_len_ * lstm_param_->batch_ * lstm_param_->hidden_size_ * sizeof(float16_t))); + if (running_buffer_[kTempInputGateBufferIndex] == nullptr) { + MS_LOG(ERROR) << "LstmFp16CPUKernel malloc state * weight left matirx error."; + return RET_ERROR; + } + + need_pack_input = lstm_param_->batch_ != 1; +#ifdef ENABLE_ARM64 + need_pack_input = lstm_param_->batch_ >= C4NUM; +#endif + if (need_pack_input) { + running_buffer_[kTempStateBufferIndex] = reinterpret_cast( + ms_context_->allocator->Malloc(lstm_param_->state_row_align_ * lstm_param_->output_size_ * sizeof(float16_t))); + if (running_buffer_[kTempStateBufferIndex] == nullptr) { + MS_LOG(ERROR) << "LstmFp16CPUKernel malloc state * weight left matirx error."; + return RET_ERROR; + } + } + + running_buffer_[kTempStateGateBufferIndex] = reinterpret_cast( + ms_context_->allocator->Malloc(kGateNum * lstm_param_->batch_ * lstm_param_->hidden_size_ * sizeof(float16_t))); + if (running_buffer_[kTempStateGateBufferIndex] == nullptr) { + MS_LOG(ERROR) << "LstmFp16CPUKernel malloc state gate buffer_ error."; + return RET_ERROR; + } + + if (!(lstm_param_->zoneout_cell_ >= -FLT_EPSILON && lstm_param_->zoneout_cell_ <= FLT_EPSILON)) { + int buffer_size = lstm_param_->batch_ * lstm_param_->hidden_size_ * sizeof(float16_t); + running_buffer_[kTempCellStateBufferIndex] = + reinterpret_cast(ms_context_->allocator->Malloc(buffer_size)); + if (running_buffer_[kTempCellStateBufferIndex] == nullptr) { + MS_LOG(ERROR) << "LstmFp16CPUKernel malloc state_buffer for cell error."; + return RET_ERROR; + } + } + if (!(lstm_param_->zoneout_hidden_ >= -FLT_EPSILON && lstm_param_->zoneout_hidden_ <= FLT_EPSILON)) { + int buffer_size = lstm_param_->batch_ * lstm_param_->output_size_ * sizeof(float16_t); + running_buffer_[kTempHiddenStateBufferIndex] = + reinterpret_cast(ms_context_->allocator->Malloc(buffer_size)); + if (running_buffer_[kTempHiddenStateBufferIndex] == nullptr) { + 
MS_LOG(ERROR) << "LstmFp16CPUKernel malloc state_buffer for hidden error."; + return RET_ERROR; + } + } + + if (need_pack_input && in_tensors_.size() == C7NUM) { + running_buffer_[kTempProjectInputBufferIndex] = reinterpret_cast( + ms_context_->allocator->Malloc(lstm_param_->state_row_align_ * lstm_param_->hidden_size_ * sizeof(float16_t))); + if (running_buffer_[kTempProjectInputBufferIndex] == nullptr) { + MS_LOG(ERROR) << "LstmFp16CPUKernel malloc project_buffer for hidden error."; + return RET_ERROR; + } + } + return RET_OK; +} + +void LstmFp16BaseCPUKernel::FreeRunBuffer() { + ms_context_->allocator->Free(running_buffer_[kTempInputBufferIndex]); + ms_context_->allocator->Free(running_buffer_[kTempInputGateBufferIndex]); + if (lstm_param_->batch_ != 1) { + ms_context_->allocator->Free(running_buffer_[kTempStateBufferIndex]); + } + ms_context_->allocator->Free(running_buffer_[kTempStateGateBufferIndex]); + if (!(lstm_param_->zoneout_cell_ >= -FLT_EPSILON && lstm_param_->zoneout_cell_ <= FLT_EPSILON)) { + ms_context_->allocator->Free(running_buffer_[kTempCellStateBufferIndex]); + } + if (!(lstm_param_->zoneout_hidden_ >= -FLT_EPSILON && lstm_param_->zoneout_hidden_ <= FLT_EPSILON)) { + ms_context_->allocator->Free(running_buffer_[kTempHiddenStateBufferIndex]); + } +} +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_fp16_base.h b/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_fp16_base.h new file mode 100644 index 00000000..0bcb9e94 --- /dev/null +++ b/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_fp16_base.h @@ -0,0 +1,68 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP16_LSTM_FP16_BASE_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP16_LSTM_FP16_BASE_H_ + +#include +#include "src/litert/lite_kernel.h" +#include "nnacl/lstm_parameter.h" + +namespace mindspore::kernel { +class LstmFp16BaseCPUKernel : public LiteKernel { + public: + LstmFp16BaseCPUKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const lite::InnerContext *ctx) + : LiteKernel(parameter, inputs, outputs, ctx) { + lstm_param_ = reinterpret_cast(op_parameter_); + } + + ~LstmFp16BaseCPUKernel() override; + + int Prepare() override; + int ReSize() override; + int Run() override; + + protected: + virtual int InitInputWeightBias() = 0; + virtual int InitStateWeightBias() = 0; + virtual int InitProjectWeight() = 0; + + bool running_pack_{false}; + bool weight_need_pack_{false}; + int hidden_init_index_{0}; + int cell_init_index_{0}; + int weight_segment_num_{0}; + float16_t *weight_i_ptr_{nullptr}; + float16_t *weight_h_ptr_{nullptr}; + float16_t *weight_project_ptr_{nullptr}; + float16_t *input_bias_{nullptr}; + float16_t *state_bias_{nullptr}; + float16_t *project_bias_{nullptr}; + LstmParameter *lstm_param_{nullptr}; + float16_t *running_buffer_[C7NUM] = {nullptr}; + std::vector pack_buffer_; + + private: + int PackWeightAndBias(); + int InitParam(); + void FreePackBuffer(); + void FreeRunBuffer(); + int MallocRunBuffer(); +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP16_LSTM_FP16_BASE_H_ diff --git a/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_mindir_fp16.cc b/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_mindir_fp16.cc new file mode 100644 index 00000000..cf4071eb --- /dev/null +++ b/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_mindir_fp16.cc @@ -0,0 +1,35 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/litert/kernel/cpu/fp16/lstm_mindir_fp16.h" + +namespace mindspore::kernel { +namespace { +constexpr size_t kMindirInputTensorNum = 4; +} // namespace + +int LstmMindirFp16CPUKernel::Prepare() { + CHECK_NOT_EQUAL_RETURN(in_tensors_.size(), kMindirInputTensorNum); + running_pack_ = trainable_ || !in_tensors_[FOURTH_INPUT]->IsConst(); + return LstmFp16BaseCPUKernel::Prepare(); +} + +int LstmMindirFp16CPUKernel::InitInputWeightBias() { return lite::RET_NOT_SUPPORT; } + +int LstmMindirFp16CPUKernel::InitStateWeightBias() { return lite::RET_NOT_SUPPORT; } + +int LstmMindirFp16CPUKernel::InitProjectWeight() { return lite::RET_NOT_SUPPORT; } +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_mindir_fp16.h b/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_mindir_fp16.h new file mode 100644 index 00000000..bd8500d0 --- /dev/null +++ b/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_mindir_fp16.h @@ -0,0 +1,56 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP16_LSTM_MINDIR_FP16_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP16_LSTM_MINDIR_FP16_H_ + +#include +#include "src/litert/kernel/cpu/fp16/lstm_fp16_base.h" + +namespace mindspore::kernel { +/* + * 1. LSTM without project, output_size = hidden_size + * h_init: second input, shape is [bidirectional, batch_size, hidden_size] + * c_init: third input, shape is [bidirectional, batch_size, hidden_size] + * weight_bias: forth input, weight_ih + weight_hh + bias, the gate order is IFGO + * + * 2. 
LSTM with project, output_size = project_size + * don't support + * h_init: second input, shape is [bidirectional, batch_size, hidden_size] + * c_init: third input, shape is [bidirectional, batch_size, hidden_size] + * weight_bias: forth input, weight_ih + weight_hh + proj + bias, the gate order is IFGO + */ +class LstmMindirFp16CPUKernel : public LstmFp16BaseCPUKernel { + public: + LstmMindirFp16CPUKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const lite::InnerContext *ctx) + : LstmFp16BaseCPUKernel(parameter, inputs, outputs, ctx) { + hidden_init_index_ = SECOND_INPUT; + cell_init_index_ = THIRD_INPUT; + } + + ~LstmMindirFp16CPUKernel() override = default; + + int Prepare() override; + + protected: + int InitInputWeightBias() override; + int InitStateWeightBias() override; + int InitProjectWeight() override; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP16_LSTM_MINDIR_FP16_H_ diff --git a/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_non_mindir_fp16.cc b/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_non_mindir_fp16.cc new file mode 100644 index 00000000..473fe9b0 --- /dev/null +++ b/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_non_mindir_fp16.cc @@ -0,0 +1,194 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/litert/kernel/cpu/fp16/lstm_non_mindir_fp16.h" +#include "nnacl/fp16/lstm_fp16.h" +#include "nnacl/fp16/cast_fp16.h" + +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_OK; + +namespace mindspore::kernel { +namespace { +constexpr int kGateNum = 4; +constexpr size_t kInputTensorNumMin = 6; +} // namespace + +int LstmNonMindirFp16CPUKernel::Prepare() { + CHECK_LESS_RETURN(in_tensors_.size(), kInputTensorNumMin); + running_pack_ = train_mode_; + for (size_t i = 1; i <= FOURTH_INPUT; ++i) { + running_pack_ = running_pack_ || !in_tensors_[i]->IsConst(); + } + return LstmFp16BaseCPUKernel::Prepare(); +} + +int LstmNonMindirFp16CPUKernel::InitInputWeightBias() { + // malloc and init input * weight right matrix buffer + // input -- row: seq_len * batch; col: input_size + // weight -- row: hidden_size; col: input_size, need transpose + // result -- row: seq_len * batch; col: hidden_size + auto weight_i = in_tensors_.at(1); + auto weight_i_data = weight_i->data(); + CHECK_NULL_RETURN(weight_i_data); + weight_i_ptr_ = reinterpret_cast( + malloc(weight_segment_num_ * lstm_param_->input_col_align_ * lstm_param_->input_size_ * sizeof(float16_t))); + MS_CHECK_TRUE_MSG(weight_i_ptr_ != nullptr, lite::RET_NULL_PTR, + "LstmNonMindirCPUKernel malloc weight_i_ptr_ failed."); + pack_buffer_.push_back(weight_i_ptr_); + if (weight_i->data_type() == kNumberTypeFloat32) { + PackLstmWeightFp32ToFp16(weight_i_ptr_, reinterpret_cast(weight_i_data), weight_segment_num_, + lstm_param_->input_size_, lstm_param_->hidden_size_, lstm_param_->input_col_align_, + nullptr); + } else if (weight_i->data_type() == kNumberTypeFloat16) { + PackLstmWeightFp16(weight_i_ptr_, reinterpret_cast(weight_i_data), weight_segment_num_, + lstm_param_->input_size_, lstm_param_->hidden_size_, lstm_param_->input_col_align_, nullptr); + } else { + MS_LOG(ERROR) << "Unsupported data type of weight_i tensor for lstm."; + return RET_ERROR; + } + + // input bias + auto bias = in_tensors_.at(FOURTH_INPUT); + auto bias_data = bias->data(); + CHECK_NULL_RETURN(bias_data); + input_bias_ = + reinterpret_cast(malloc(weight_segment_num_ * lstm_param_->input_col_align_ * sizeof(float16_t))); + MS_CHECK_TRUE_MSG(input_bias_ != nullptr, lite::RET_NULL_PTR, "LstmNonMindirCPUKernel malloc input_bias_ failed."); + pack_buffer_.push_back(input_bias_); + (void)memset(input_bias_, 0, weight_segment_num_ * lstm_param_->input_col_align_ * sizeof(float16_t)); + if (bias->data_type() == kNumberTypeFloat32) { + PackLstmBiasFp32ToFp16(input_bias_, reinterpret_cast(bias_data), weight_segment_num_, + lstm_param_->hidden_size_, lstm_param_->input_col_align_, lstm_param_->bidirectional_, + nullptr); + } else if (bias->data_type() == kNumberTypeFloat16) { + PackLstmBiasFp16(input_bias_, reinterpret_cast(bias_data), weight_segment_num_, + lstm_param_->hidden_size_, lstm_param_->input_col_align_, lstm_param_->bidirectional_, nullptr); + } else { + MS_LOG(ERROR) << "Unsupported data type of bias tensor for lstm."; + return RET_ERROR; + } + return RET_OK; +} + +int LstmNonMindirFp16CPUKernel::InitStateWeightBias() { + // malloc and init state * weight right matrix buffer, state * weight will be executed seq_len_ times. 
+ // state -- row: batch; col: hidden_size + // weight -- row: hidden_size; col: hidden_size, need transpose + // result -- row: batch; col: hidden_size + auto weight_h = in_tensors_.at(THIRD_INPUT); + auto weight_h_data = weight_h->data(); + CHECK_NULL_RETURN(weight_h_data); + weight_h_ptr_ = reinterpret_cast( + malloc(weight_segment_num_ * lstm_param_->state_col_align_ * lstm_param_->output_size_ * sizeof(float16_t))); + MS_CHECK_TRUE_MSG(weight_h_ptr_ != nullptr, lite::RET_NULL_PTR, + "LstmNonMindirCPUKernel malloc weight_h_ptr_ failed."); + + if (weight_need_pack_) { + if (weight_h->data_type() == kNumberTypeFloat32) { + PackLstmWeightFp32ToFp16(weight_h_ptr_, reinterpret_cast(weight_h_data), weight_segment_num_, + lstm_param_->output_size_, lstm_param_->hidden_size_, lstm_param_->state_col_align_, + nullptr); + } else if (weight_h->data_type() == kNumberTypeFloat16) { + PackLstmWeightFp16(weight_h_ptr_, reinterpret_cast(weight_h_data), weight_segment_num_, + lstm_param_->output_size_, lstm_param_->hidden_size_, lstm_param_->state_col_align_, nullptr); + } else { + MS_LOG(ERROR) << "Unsupported data type of weight_h tensor for lstm."; + return RET_ERROR; + } + } else { + if (weight_h->data_type() == kNumberTypeFloat32) { + Float32ToFloat16(reinterpret_cast(weight_h_data), weight_h_ptr_, weight_h->ElementsNum()); + } else if (weight_h->data_type() == kNumberTypeFloat16) { + (void)memcpy(weight_h_ptr_, reinterpret_cast(weight_h_data), weight_h->Size()); + } else { + MS_LOG(ERROR) << "Unsupported data type of weight_h tensor for lstm."; + return RET_ERROR; + } + } + + // state bias + auto bias = in_tensors_[FOURTH_INPUT]; + auto bias_data = bias->data(); + CHECK_NULL_RETURN(bias_data); + state_bias_ = + reinterpret_cast(malloc(weight_segment_num_ * lstm_param_->state_col_align_ * sizeof(float16_t))); + MS_CHECK_TRUE_MSG(state_bias_ != nullptr, lite::RET_NULL_PTR, "LstmNonMindirCPUKernel malloc state_bias_ failed."); + (void)memset(state_bias_, 0, weight_segment_num_ * lstm_param_->state_col_align_ * sizeof(float16_t)); + if (bias->data_type() == kNumberTypeFloat32) { + auto state_bias_data = reinterpret_cast(bias_data) + kGateNum * lstm_param_->hidden_size_; + PackLstmBiasFp32ToFp16(state_bias_, state_bias_data, weight_segment_num_, lstm_param_->hidden_size_, + lstm_param_->state_col_align_, lstm_param_->bidirectional_, nullptr); + } else if (bias->data_type() == kNumberTypeFloat16) { + auto state_bias_data = reinterpret_cast(bias_data) + kGateNum * lstm_param_->hidden_size_; + PackLstmBiasFp16(state_bias_, state_bias_data, weight_segment_num_, lstm_param_->hidden_size_, + lstm_param_->state_col_align_, lstm_param_->bidirectional_, nullptr); + } else { + MS_LOG(ERROR) << "Unsupported data type of bias tensor for lstm."; + return RET_ERROR; + } + return RET_OK; +} + +int LstmNonMindirFp16CPUKernel::InitProjectWeight() { + if (in_tensors_.size() < C7NUM) { + return RET_OK; + } + auto weight_pro = in_tensors_[SEVENTH_INPUT]; + auto shape = weight_pro->shape(); + MS_CHECK_TRUE_MSG(shape.size() == C3NUM, lite::RET_ERROR, "Project-weight's shape must be 3D."); + auto weight_pro_data = weight_pro->data(); + CHECK_NULL_RETURN(weight_pro_data); + int batch = lstm_param_->bidirectional_ ? 
C2NUM : C1NUM; + if (shape[0] != batch) { + MS_LOG(ERROR) << "Project-weight's shape[0] must be 1(bidirectional=false) or 2(bidirectional=true)."; + return RET_ERROR; + } + int pro_col_align = lstm_param_->proj_col_align_; + weight_project_ptr_ = + reinterpret_cast(malloc(batch * lstm_param_->hidden_size_ * pro_col_align * sizeof(float16_t))); + MS_CHECK_TRUE_MSG(weight_project_ptr_ != nullptr, lite::RET_NULL_PTR, + "LstmNonMindirCPUKernel malloc weight_project_ptr_ failed."); + + if (weight_need_pack_) { + if (weight_pro->data_type() == kNumberTypeFloat32) { + PackLstmWeightFp32ToFp16(weight_project_ptr_, reinterpret_cast(weight_pro_data), batch, + lstm_param_->hidden_size_, lstm_param_->output_size_, pro_col_align, nullptr); + } else if (weight_pro->data_type() == kNumberTypeFloat16) { + PackLstmWeightFp16(weight_project_ptr_, reinterpret_cast(weight_pro_data), batch, + lstm_param_->hidden_size_, lstm_param_->output_size_, pro_col_align, nullptr); + } else { + MS_LOG(ERROR) << "Unsupported data type of weight_project tensor for lstm."; + return RET_ERROR; + } + } else { + if (weight_pro->data_type() == kNumberTypeFloat32) { + Float32ToFloat16(reinterpret_cast(weight_pro_data), weight_project_ptr_, weight_pro->ElementsNum()); + } else if (weight_pro->data_type() == kNumberTypeFloat16) { + (void)memcpy(weight_project_ptr_, weight_pro_data, weight_pro->Size()); + } else { + MS_LOG(ERROR) << "Unsupported data type of weight_project tensor for lstm."; + return RET_ERROR; + } + } + size_t bias_size = UP_ROUND(lstm_param_->output_size_, C8NUM) * sizeof(float16_t); + project_bias_ = reinterpret_cast(malloc(bias_size)); + MS_CHECK_TRUE_MSG(project_bias_ != nullptr, lite::RET_NULL_PTR, + "LstmNonMindirCPUKernel malloc project_bias_ failed."); + (void)memset(project_bias_, 0, bias_size); + return RET_OK; +} +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_non_mindir_fp16.h b/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_non_mindir_fp16.h new file mode 100644 index 00000000..132ef1cf --- /dev/null +++ b/mindspore/lite/src/litert/kernel/cpu/fp16/lstm_non_mindir_fp16.h @@ -0,0 +1,59 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP16_LSTM_NON_MINDIR_FP16_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP16_LSTM_NON_MINDIR_FP16_H_ + +#include +#include "src/litert/kernel/cpu/fp16/lstm_fp16_base.h" + +namespace mindspore::kernel { +/* + * 1. LSTM without project, output_size = hidden_size + * weight_ih: second input, shape is [bidirectional, 4 * hidden_size, input_size] + * weight_hh: third input, shape is [bidirectional, 4 * hidden_size, hidden_size] + * bias: forth input, shape is [bidirectional, 8 * hidden_size] + * h_init: fifth input, shape is [bidirectional, batch_size, hidden_size] + * c_init: sixth input, shape is [bidirectional, batch_size, hidden_size] + * + * 2. 
LSTM with project, output_size = project_size + * weight_ih: second input, shape is [bidirectional, 4 * hidden_size, input_size] + * weight_hh: third input, shape is [bidirectional, 4 * hidden_size, project_size] + * bias: fourth input, shape is [bidirectional, 8 * hidden_size] + * h_init: fifth input, shape is [bidirectional, batch_size, project_size] + * c_init: sixth input, shape is [bidirectional, batch_size, hidden_size] + * weight_pro: seventh input, shape is [bidirectional, project_size, hidden_size] + */ +class LstmNonMindirFp16CPUKernel : public LstmFp16BaseCPUKernel { + public: + LstmNonMindirFp16CPUKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const lite::InnerContext *ctx) + : LstmFp16BaseCPUKernel(parameter, inputs, outputs, ctx) { + hidden_init_index_ = FIFTH_INPUT; + cell_init_index_ = SIXTH_INPUT; + } + + ~LstmNonMindirFp16CPUKernel() override = default; + + int Prepare() override; + + protected: + int InitInputWeightBias() override; + int InitStateWeightBias() override; + int InitProjectWeight() override; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP16_LSTM_NON_MINDIR_FP16_H_ diff --git a/mindspore/lite/src/litert/kernel/cpu/fp16/matmul_base_fp16.cc b/mindspore/lite/src/litert/kernel/cpu/fp16/matmul_base_fp16.cc index 8adb97b9..d6f94fd9 100644 --- a/mindspore/lite/src/litert/kernel/cpu/fp16/matmul_base_fp16.cc +++ b/mindspore/lite/src/litert/kernel/cpu/fp16/matmul_base_fp16.cc @@ -187,13 +187,13 @@ void MatmulBaseFP16CPUKernel::InitMatrixA(const void *src_ptr) { float16_t *dst = a_pack_ptr_ + i * params_->deep_ * params_->row_align_; if (params_->a_transpose_) { #ifdef ENABLE_ARM64 - RowMajor2RowNMajorFp16((const float16_t *)src, dst, params_->deep_, params_->row_); + RowMajor2RowNMajorFp16(src, dst, params_->deep_, params_->row_, src_data_type == kNumberTypeFloat32); #else RowMajor2Row12MajorFp16(src, dst, params_->deep_, params_->row_, src_data_type == kNumberTypeFloat32); #endif } else { #ifdef ENABLE_ARM64 - RowMajor2ColNMajorFp16((const float16_t *)src, dst, params_->row_, params_->deep_); + RowMajor2ColNMajorFp16(src, dst, params_->row_, params_->deep_, src_data_type == kNumberTypeFloat32); #else RowMajor2Col12MajorFp16(src, dst, params_->row_, params_->deep_, src_data_type == kNumberTypeFloat32); #endif diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_fp32.cc b/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_fp32.cc index 0b67f2c2..67f42265 100644 --- a/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_fp32.cc +++ b/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_fp32.cc @@ -1,5 +1,5 @@ /** - * Copyright 2020 Huawei Technologies Co., Ltd + * Copyright 2020-2023 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,14 +14,11 @@ * limitations under the License.
*/ -#include "src/litert/kernel/cpu/fp32/lstm_fp32.h" -#include #include -#include "schema/model_generated.h" +#include "src/litert//kernel/cpu/fp32/lstm_mindir_fp32.h" +#include "src/litert/kernel/cpu/fp32/lstm_non_mindir_fp32.h" #include "src/litert/kernel_registry.h" #include "include/errorcode.h" -#include "nnacl/fp32/pack_fp32.h" -#include "nnacl/fp32/matmul_fp32.h" using mindspore::kernel::KERNEL_ARCH; using mindspore::lite::KernelRegistrar; @@ -32,664 +29,31 @@ using mindspore::schema::PrimitiveType_LSTM; namespace mindspore::kernel { namespace { -constexpr int kOutputHiddenStatusIndex = 1; -constexpr int kOutputCellStatusIndex = 2; -} // namespace - -int LstmInputMulWeightRun(void *cdata, int task_id, float, float) { - auto kernel = reinterpret_cast(cdata); - CHECK_NULL_RETURN(kernel); - kernel->InputWeightMatMul(task_id); - return RET_OK; -} - -int LstmSequenceLoopRun(void *cdata, int task_id, float, float) { - auto kernel = reinterpret_cast(cdata); - CHECK_NULL_RETURN(kernel); - auto ret = kernel->DoSequenceLoop(task_id); - if (ret != RET_OK) { - MS_LOG(ERROR) << "LSTM: Do Sequence-loop failed."; - } - return ret; -} - -void LstmCPUKernel::FreeRunBuffer() { - for (auto data : buffer_running_malloc_) { - ms_context_->allocator->Free(data); - } - buffer_running_malloc_.clear(); -} - -int LstmCPUKernel::InitInputWeightBias() { - // malloc and init input * weight right matrix buffer - // input -- row: seq_len * batch; col: input_size - // weight -- row: hidden_size; col: input_size, need transpose - // result -- row: seq_len * batch; col: hidden_size - weight_i_ptr_ = reinterpret_cast(ms_context_->allocator->Malloc( - weight_batch_ * lstm_param_->input_col_align_ * lstm_param_->input_size_ * sizeof(float))); - if (weight_i_ptr_ == nullptr) { - MS_LOG(ERROR) << "LstmCPUKernel malloc weight_i_ptr_ error."; - return RET_ERROR; - } - buffer_running_malloc_.push_back(weight_i_ptr_); - int i_index = (in_tensors_.size() == mindir_input_tensors) ? combined_weights_index : onnx_weight_i_index; - const int *weights_order = (in_tensors_.size() == mindir_input_tensors) ? weights_order_IFOG : nullptr; - auto weight_i = in_tensors_.at(i_index); - auto weight_i_data = reinterpret_cast(weight_i->data()); - - CHECK_NULL_RETURN(weight_i_data); - int cw_size = (lstm_param_->input_size_ * lstm_param_->hidden_size_); - int hh_size = (lstm_param_->hidden_size_ * lstm_param_->hidden_size_); - int b_size = (lstm_param_->hidden_size_); - bool has_bias = (weight_batch_ * (cw_size + hh_size) < weight_i->ElementsNum()) ? true : false; - int stride = (gpu_orig_state_) ? gate_num * (cw_size + hh_size) : gate_num * (cw_size); - PackLstmWeightWithStride(weight_i_ptr_, weight_i_data, weight_batch_, lstm_param_->input_size_, - lstm_param_->hidden_size_, lstm_param_->input_col_align_, lstm_param_->bidirectional_, - stride, weights_order); - // input bias - input_bias_ = reinterpret_cast( - ms_context_->allocator->Malloc(weight_batch_ * lstm_param_->input_col_align_ * sizeof(float))); - if (input_bias_ == nullptr) { - MS_LOG(ERROR) << "LstmCPUKernel malloc input_bias_ error."; - return RET_ERROR; - } - memset(input_bias_, 0, weight_batch_ * lstm_param_->input_col_align_ * sizeof(float)); - buffer_running_malloc_.push_back(input_bias_); - - int offset = weight_batch_ * (cw_size + hh_size); - float *bias_data = (has_bias) ? weight_i_data + offset : nullptr; - int dir_mul = lstm_param_->bidirectional_ ? C2NUM : C1NUM; - int b_stride = (gpu_orig_state_) ? 
gate_num * (dir_mul * b_size) : gate_num * (b_size); - if (in_tensors_.size() > mindir_input_tensors) { - bias_data = reinterpret_cast(in_tensors_.at(onnx_bias_index)->data()); - CHECK_NULL_RETURN(bias_data); - PackLstmBias(input_bias_, bias_data, weight_batch_, lstm_param_->hidden_size_, lstm_param_->input_col_align_, - lstm_param_->bidirectional_, weights_order); - } else { - if (bias_data != nullptr) { - PackLstmBiasWithStride(input_bias_, bias_data, weight_batch_, lstm_param_->hidden_size_, - lstm_param_->input_col_align_, lstm_param_->bidirectional_, b_stride, weights_order); - } - } - return RET_OK; -} - -int LstmCPUKernel::InitStateWeightBias() { - // malloc and init state * weight right matrix buffer, state * weight will be executed seq_len_ times. - // state -- row: batch; col: hidden_size - // weight -- row: hidden_size; col: hidden_size, need transpose - // result -- row: batch; col: hidden_size - int weight_i_size = weight_batch_ * lstm_param_->hidden_size_ * lstm_param_->input_size_; - int h_index = (in_tensors_.size() == mindir_input_tensors) ? combined_weights_index : onnx_weight_h_index; - auto weight_h = in_tensors_.at(h_index); - auto weight_h_data = (reinterpret_cast(weight_h->data())); - - int cw_size = (lstm_param_->input_size_ * lstm_param_->hidden_size_); - int hh_size = (lstm_param_->hidden_size_ * lstm_param_->project_size_); - int b_size = (lstm_param_->hidden_size_); - int stride = (gpu_orig_state_) ? gate_num * (cw_size + hh_size) : gate_num * (hh_size); - - if (in_tensors_.size() == mindir_input_tensors) { - if (gpu_orig_state_) { - weight_h_data += gate_num * cw_size; - } else { - weight_h_data += weight_i_size; - } - } - CHECK_NULL_RETURN(weight_h_data); - if (!state_is_vec_) { - weight_h_ptr_ = reinterpret_cast(ms_context_->allocator->Malloc( - weight_batch_ * lstm_param_->state_col_align_ * lstm_param_->project_size_ * sizeof(float))); - if (weight_h_ptr_ == nullptr) { - MS_LOG(ERROR) << "LstmCPUKernel malloc weight_h_ptr_ error."; - return RET_ERROR; - } - buffer_running_malloc_.push_back(weight_h_ptr_); - const int *weights_order = (in_tensors_.size() == mindir_input_tensors) ? 
weights_order_IFOG : nullptr; - PackLstmWeightWithStride(weight_h_ptr_, weight_h_data, weight_batch_, lstm_param_->project_size_, - lstm_param_->hidden_size_, lstm_param_->state_col_align_, lstm_param_->bidirectional_, - stride, weights_order); - } else { -#ifdef ENABLE_AVX - weight_h_ptr_ = reinterpret_cast(ms_context_->allocator->Malloc( - weight_batch_ * lstm_param_->state_col_align_ * lstm_param_->project_size_ * sizeof(float))); - if (weight_h_ptr_ == nullptr) { - MS_LOG(ERROR) << "LstmCPUKernel malloc weight_h_ptr_ error."; - return RET_ERROR; - } - buffer_running_malloc_.push_back(weight_h_ptr_); - for (int i = 0; i < weight_batch_; i++) { - const float *src_batch = weight_h_data + i * lstm_param_->hidden_size_ * lstm_param_->project_size_; - float *dst_batch = weight_h_ptr_ + i * lstm_param_->state_col_align_ * lstm_param_->project_size_; - RowMajor2Col32Major(src_batch, dst_batch, lstm_param_->hidden_size_, lstm_param_->project_size_); - } -#else - weight_h_ptr_ = weight_h_data; -#endif - } - - // state bias - int weight_h_size = weight_batch_ * lstm_param_->hidden_size_ * lstm_param_->hidden_size_; - int bias_size = weight_batch_ * lstm_param_->hidden_size_; - state_bias_ = reinterpret_cast( - ms_context_->allocator->Malloc(weight_batch_ * lstm_param_->state_col_align_ * sizeof(float))); - if (state_bias_ == nullptr) { - MS_LOG(ERROR) << "LstmCPUKernel malloc state_bias_ error."; - return RET_ERROR; - } - memset(state_bias_, 0, weight_batch_ * lstm_param_->state_col_align_ * sizeof(float)); - buffer_running_malloc_.push_back(state_bias_); - // if ONNX, secend bias is also present order IOFG - if (in_tensors_.size() > mindir_input_tensors) { - float *state_bias = - reinterpret_cast(in_tensors_.at(onnx_bias_index)->data()) + gate_num * lstm_param_->hidden_size_; - CHECK_NULL_RETURN(state_bias); - PackLstmBias(state_bias_, state_bias, weight_batch_, lstm_param_->hidden_size_, lstm_param_->state_col_align_, - lstm_param_->bidirectional_, nullptr); - } else if (weight_h->ElementsNum() - weight_i_size - weight_h_size - C2NUM * bias_size == 0) { - // mindir from device "GPU", secend bias is also present order IFOG - int dir_mul = lstm_param_->bidirectional_ ? C2NUM : C1NUM; - int bias_offset = (gpu_orig_state_) ? gate_num * ((dir_mul - C1NUM) * cw_size + dir_mul * hh_size + b_size) - : weight_h_size + bias_size; - float *state_bias = weight_h_data + bias_offset; - int b_stride = (gpu_orig_state_) ? gate_num * (b_size * C2NUM) : gate_num * b_size; - PackLstmBiasWithStride(state_bias_, state_bias, weight_batch_, lstm_param_->hidden_size_, - lstm_param_->state_col_align_, lstm_param_->bidirectional_, b_stride, weights_order_IFOG); - } - return RET_OK; -} - -int LstmCPUKernel::InitProjectWeight() { - if (in_tensors_.size() < C7NUM) { - return RET_OK; - } - auto weight_pro = in_tensors_.at(SEVENTH_INPUT); - auto shape = weight_pro->shape(); - if (shape.size() != C3NUM) { - MS_LOG(ERROR) << "Project-weight's shape must be 3D."; - return RET_ERROR; - } - auto weight_pro_data = reinterpret_cast(weight_pro->data()); - CHECK_NULL_RETURN(weight_pro_data); - int batch = lstm_param_->bidirectional_ ? 
C2NUM : C1NUM; - if (shape[0] != batch) { - MS_LOG(ERROR) << "Project-weight's shape[0] must be 1(bidirectional=false) or 2(bidirectional=true)."; - return RET_ERROR; - } - int col_align = UP_ROUND(lstm_param_->project_size_, col_tile_); - if (!state_is_vec_) { - weight_project_ptr_ = reinterpret_cast( - ms_context_->allocator->Malloc(batch * lstm_param_->hidden_size_ * col_align * sizeof(float))); - if (weight_project_ptr_ == nullptr) { - MS_LOG(ERROR) << "LstmCPUKernel malloc weight_project_ptr_ error."; - return RET_ERROR; - } - buffer_running_malloc_.push_back(weight_project_ptr_); - PackLstmWeightWithStride(weight_project_ptr_, weight_pro_data, batch, lstm_param_->hidden_size_, - lstm_param_->project_size_, col_align, lstm_param_->bidirectional_, - lstm_param_->hidden_size_ * lstm_param_->project_size_, nullptr); - } else { -#ifdef ENABLE_AVX - weight_project_ptr_ = reinterpret_cast( - ms_context_->allocator->Malloc(batch * lstm_param_->hidden_size_ * col_align * sizeof(float))); - if (weight_project_ptr_ == nullptr) { - MS_LOG(ERROR) << "LstmCPUKernel malloc weight_project_ptr_ error."; - return RET_ERROR; - } - buffer_running_malloc_.push_back(weight_project_ptr_); - for (int i = 0; i < batch; ++i) { - const float *src_batch = weight_pro_data + i * lstm_param_->hidden_size_ * lstm_param_->project_size_; - float *dst_batch = weight_project_ptr_ + i * lstm_param_->hidden_size_ * col_align; - RowMajor2Col32Major(src_batch, dst_batch, lstm_param_->project_size_, lstm_param_->hidden_size_); - } -#else - weight_project_ptr_ = weight_pro_data; -#endif - } - return RET_OK; -} - -int LstmCPUKernel::InitParam() { - auto input = in_tensors_.front(); - std::vector in_shape = input->shape(); - lstm_param_->seq_len_ = in_shape.at(FIRST_INPUT); - lstm_param_->batch_ = in_shape.at(SECOND_INPUT); - lstm_param_->input_size_ = in_shape.at(THIRD_INPUT); - - auto weight_i = in_tensors_.at(onnx_weight_i_index); - std::vector w_shape = weight_i->shape(); - if (in_tensors_.size() == mindir_input_tensors) { - hidden_state_input_index_ = mindir_hidden_state_input_index; - cell_state_input_index_ = mindir_cell_state_input_index; - lstm_param_->hidden_size_ = w_shape.at(THIRD_INPUT); - lstm_param_->project_size_ = lstm_param_->hidden_size_; - } else { - lstm_param_->hidden_size_ = w_shape.at(SECOND_INPUT) / gate_num; - auto weight_h = in_tensors_[THIRD_INPUT]; - auto h_shape = weight_h->shape(); - lstm_param_->project_size_ = h_shape.back(); - } - - lstm_param_->output_step_ = lstm_param_->bidirectional_ ? C2NUM * lstm_param_->batch_ * lstm_param_->hidden_size_ - : lstm_param_->batch_ * lstm_param_->hidden_size_; - weight_batch_ = lstm_param_->bidirectional_ ? C2NUM * gate_num : gate_num; - state_is_vec_ = lstm_param_->batch_ == 1; - // determine FB origin - gpu_orig_state_ = false; - if (in_tensors_.size() == mindir_input_tensors) { - gpu_orig_state_ = gpu_orig_cfg_; - auto weight_t = in_tensors_.at(combined_weights_index); - int cw_size = (lstm_param_->input_size_ * lstm_param_->hidden_size_); - int hh_size = (lstm_param_->hidden_size_ * lstm_param_->hidden_size_); - int b_size = (lstm_param_->hidden_size_); - bool has_bias = (weight_batch_ * (cw_size + hh_size) < weight_t->ElementsNum()) ? true : false; - // if bias exist we can determine the gpu_orig_state_ - if (has_bias) { - gpu_orig_state_ = - (weight_batch_ * (cw_size + hh_size + C2NUM * b_size) == weight_t->ElementsNum()) ? 
true : false; - } - } - -#ifdef ENABLE_AVX - row_tile_ = C6NUM; - col_tile_ = C16NUM; -#elif defined(ENABLE_ARM32) - row_tile_ = C12NUM; - col_tile_ = C4NUM; -#elif defined(ENABLE_SSE) - row_tile_ = C4NUM; - col_tile_ = C8NUM; -#else - row_tile_ = C12NUM; - col_tile_ = C8NUM; -#endif - lstm_param_->input_row_align_ = UP_ROUND(lstm_param_->seq_len_ * lstm_param_->batch_, row_tile_); - lstm_param_->input_col_align_ = UP_ROUND(lstm_param_->hidden_size_, col_tile_); - input_thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(lstm_param_->input_col_align_, col_tile_)); - MS_CHECK_FALSE(input_thread_count_ == 0, RET_ERROR); - input_thread_stride_ = UP_DIV(UP_DIV(lstm_param_->input_col_align_, col_tile_), input_thread_count_); - - state_row_tile_ = row_tile_; - state_col_tile_ = col_tile_; -#ifdef ENABLE_AVX - if (state_is_vec_) { - state_row_tile_ = 1; - state_col_tile_ = C8NUM; - } -#endif - - lstm_param_->state_row_align_ = state_is_vec_ ? 1 : UP_ROUND(lstm_param_->batch_, state_row_tile_); -#ifdef ENABLE_AVX - lstm_param_->state_col_align_ = UP_ROUND(lstm_param_->hidden_size_, state_col_tile_); -#else - lstm_param_->state_col_align_ = - state_is_vec_ ? lstm_param_->hidden_size_ : UP_ROUND(lstm_param_->hidden_size_, state_col_tile_); -#endif - return RET_OK; +constexpr size_t kMindirInputTensorNum = 4; } - -int LstmCPUKernel::Prepare() { - CHECK_LESS_RETURN(in_tensors_.size(), mindir_input_tensors); - for (size_t i = 0; i < in_tensors_.size(); i++) { - CHECK_NULL_RETURN(in_tensors_.at(i)); - } - CHECK_LESS_RETURN(out_tensors_.size(), DIMENSION_3D); - for (size_t i = 0; i < out_tensors_.size(); i++) { - CHECK_NULL_RETURN(out_tensors_.at(i)); - } - CHECK_NULL_RETURN(lstm_param_); - if (!InferShapeDone()) { - return RET_OK; - } - return ReSize(); -} - -int LstmCPUKernel::ReSize() { - auto ret = InitParam(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "LstmCPUKernel InitParam error."; - return RET_ERROR; +LiteKernel *LstmFp32KernelCreator(const std::vector &inputs, const std::vector &outputs, + OpParameter *parameter, const lite::InnerContext *ctx, const kernel::KernelKey &desc) { + if (parameter == nullptr) { + MS_LOG(ERROR) << "parameter is nullptr."; + return nullptr; } - - return RET_OK; -} - -int LstmCPUKernel::MallocRunBuffer(bool is_double) { - bool need_zone = lstm_param_->zoneout_cell_ < -FLT_EPSILON || lstm_param_->zoneout_cell_ > FLT_EPSILON; - size_t whole_size = 0; - std::vector segments; - int scale = is_double ? C2NUM : 1; - size_t segment = gate_num * lstm_param_->seq_len_ * lstm_param_->batch_ * - lstm_param_->hidden_size_; // 0: input * weight for result matrix - segments.push_back(segment); - whole_size += segment * scale; - - segment = state_is_vec_ - ? 0 - : lstm_param_->state_row_align_ * lstm_param_->project_size_; // 1: state * weight for left matirx - segments.push_back(segment); - whole_size += segment * scale; - - segment = gate_num * lstm_param_->batch_ * lstm_param_->hidden_size_; // 2: state gate buffer - segments.push_back(segment); - whole_size += segment * scale; - - segment = need_zone ? lstm_param_->batch_ * lstm_param_->hidden_size_ : 0; // 3: state_buffer for cell - segments.push_back(segment); - whole_size += segment * scale; - - segment = need_zone ? 
lstm_param_->batch_ * lstm_param_->project_size_ : 0; // 4: state_buffer for hidden - segments.push_back(segment); - whole_size += segment * scale; - - segment = 0; -#ifdef ENABLE_AVX - bool output_need_packed = lstm_param_->hidden_size_ % state_col_tile_; - if (state_is_vec_ && output_need_packed) { // vec matmul need to malloc dst - int out_channel = lstm_param_->hidden_size_; - int oc_block_num = UP_DIV(out_channel, state_col_tile_); - MS_ASSERT(ms_context_->allocator != nullptr); - segment = lstm_param_->batch_ * oc_block_num * state_col_tile_; // 5: tmp output data + if (desc.data_type == kTypeUnknown) { + MS_LOG(WARNING) << "desc data_type is unknown."; } -#endif - segments.push_back(segment); - whole_size += segment * scale; - - if (in_tensors_.size() == C7NUM) { - segment = state_is_vec_ ? 0 : lstm_param_->state_row_align_ * lstm_param_->hidden_size_ * scale; - segments.push_back(segment); // 6: project-layer input - whole_size += segment; - segment = 0; -#ifdef ENABLE_AVX - segment = - output_need_packed ? lstm_param_->batch_ * UP_ROUND(lstm_param_->project_size_, state_col_tile_) * scale : 0; -#endif - segments.push_back(segment); // 7: project-layer output - whole_size += segment; + LiteKernel *kernel{nullptr}; + if (inputs.size() == kMindirInputTensorNum) { + kernel = new (std::nothrow) + LstmMindirFp32CPUKernel(parameter, inputs, outputs, static_cast(ctx)); } else { - (void)segments.insert(segments.end(), C2NUM, 0); - } - - segment = 0; - if (!(in_tensors_.size() > mindir_input_tensors)) { - segment = lstm_param_->batch_ * lstm_param_->hidden_size_; - } - segments.push_back(segment); - whole_size += segment * scale; - - segment = - lstm_param_->input_row_align_ * lstm_param_->input_size_; // input * weight for left matrix, which only once - whole_size += segment; - - auto whole_memory = reinterpret_cast(ms_context_->allocator->Malloc(whole_size * sizeof(float))); - MS_CHECK_TRUE_MSG(whole_memory != nullptr, RET_ERROR, "LSTM: malloc failed."); - buffer_running_malloc_.push_back(whole_memory); - MS_ASSERT(segments.size() == C9NUM); - auto Allocate = [&whole_memory, &segments](float **buffer) mutable { - for (int i = 0; i < C9NUM; ++i) { - buffer[i] = nullptr; - if (segments[i] == 0) { - continue; - } - buffer[i] = whole_memory; - whole_memory += segments[i]; - } - }; - Allocate(buffer_forward_); - if (is_double) { - Allocate(buffer_backward_); - } - packed_input_ = whole_memory; - return RET_OK; -} - -void LstmCPUKernel::InputWeightMatMul(int task_id) const { - int current_start_oc = task_id * input_thread_stride_ * col_tile_; - int current_rest_oc = 0; - current_rest_oc = lstm_param_->hidden_size_ - current_start_oc; - int cur_oc = MSMIN(input_thread_stride_ * col_tile_, current_rest_oc); - if (cur_oc <= 0) { - return; - } - - auto b = weight_loop_ + current_start_oc * lstm_param_->input_size_; - auto c = gate_loop_ + current_start_oc; - auto bias = (bias_loop_ == nullptr) ? 
nullptr : bias_loop_ + current_start_oc; - MatMulOpt(packed_input_, b, c, bias, ActType_No, lstm_param_->input_size_, - lstm_param_->seq_len_ * lstm_param_->batch_, cur_oc, lstm_param_->hidden_size_, OutType_Nhwc); -} - -int LstmCPUKernel::DoSequenceLoop(int task_id) { - if (task_id == 0) { - LstmForwardLoop(buffer_forward_); - return RET_OK; - } - if (task_id == 1) { - LstmBackwardLoop(buffer_backward_); - return RET_OK; - } - return RET_ERROR; -} - -int LstmCPUKernel::LstmPreProcessWithInput(const float *weight_i, const float *input_bias, float *dst) { - for (int i = 0; i < gate_num; i++) { - weight_loop_ = weight_i + lstm_param_->input_size_ * lstm_param_->input_col_align_ * i; - bias_loop_ = input_bias + lstm_param_->input_col_align_ * i; - gate_loop_ = dst + lstm_param_->seq_len_ * lstm_param_->batch_ * lstm_param_->hidden_size_ * i; - auto ret = ParallelLaunch(this->ms_context_, LstmInputMulWeightRun, this, input_thread_count_); - if (ret != RET_OK) { - return RET_ERROR; - } - } - return RET_OK; -} - -void LstmCPUKernel::LstmUnidirectional(float *output, const float *weight_h, const float *state_bias, - float *hidden_state, float *cell_state, const float *weight_project, - float *intermediate_states, float *buffer[], bool is_backward) { - float *gate = buffer[input_gate_index]; - float *input_gate = gate; - float *forget_gate = gate + lstm_param_->seq_len_ * lstm_param_->batch_ * lstm_param_->hidden_size_ * C2NUM; - float *cell_gate = gate + lstm_param_->seq_len_ * lstm_param_->batch_ * lstm_param_->hidden_size_ * C3NUM; - float *output_gate = gate + lstm_param_->seq_len_ * lstm_param_->batch_ * lstm_param_->hidden_size_; - float *tmp = buffer[tmp_hidden_output_index]; - int dir_mult = lstm_param_->bidirectional_ ? C2NUM : C1NUM; - for (int t = 0; t < lstm_param_->seq_len_; t++) { - int real_t = is_backward ? 
lstm_param_->seq_len_ - t - C1NUM : t; - float *input_gate_t = input_gate + lstm_param_->batch_ * lstm_param_->hidden_size_ * real_t; - float *forget_gate_t = forget_gate + lstm_param_->batch_ * lstm_param_->hidden_size_ * real_t; - float *cell_gate_t = cell_gate + lstm_param_->batch_ * lstm_param_->hidden_size_ * real_t; - float *output_gate_t = output_gate + lstm_param_->batch_ * lstm_param_->hidden_size_ * real_t; - // if ONNX - if (in_tensors_.size() > mindir_input_tensors) { - // Sequence, DirMul, Batch, Hidden - float *output_ptr = output + real_t * lstm_param_->output_step_; - - LstmStepUnit(output_ptr, input_gate_t, forget_gate_t, cell_gate_t, output_gate_t, weight_h, state_bias, - weight_project, hidden_state, cell_state, buffer, lstm_param_); - } else { - // Sequence, Batch, DirMul, Hidden - LstmStepUnit(tmp, input_gate_t, forget_gate_t, cell_gate_t, output_gate_t, weight_h, state_bias, nullptr, - hidden_state, cell_state, buffer, lstm_param_); - int seq_offset = real_t * lstm_param_->batch_ * dir_mult * lstm_param_->hidden_size_; - for (int b = 0; b < lstm_param_->batch_; b++) { - int batch_offset = b * dir_mult * lstm_param_->hidden_size_; - float *output_ptr = output + seq_offset + batch_offset; - memcpy(output_ptr, tmp + b * lstm_param_->hidden_size_, lstm_param_->hidden_size_ * sizeof(float)); - } - } - if (intermediate_states) { - RecordStates(hidden_state, cell_state, input_gate_t, output_gate_t, forget_gate_t, cell_gate_t, - intermediate_states, real_t); - } - } -} - -void LstmCPUKernel::RecordStates(const float *hidden_state, float *cell_state, float *input_gate, - const float *output_gate, float *forget_gate, const float *cell_gate, - float *intermediate_states, int step) { - float *states = intermediate_states; - auto state_size = lstm_param_->batch_ * lstm_param_->hidden_size_; - if (state_size < 0) { - MS_LOG(ERROR) << "state size should be greater than or equal to zero."; - return; - } - auto stride = step * lstm_param_->output_step_; - auto seq_stride = lstm_param_->seq_len_ * lstm_param_->output_step_; - memcpy(states + stride, hidden_state, state_size * sizeof(float)); - stride += seq_stride; - memcpy(states + stride, cell_state, state_size * sizeof(float)); - stride += seq_stride; - memcpy(states + stride, input_gate, state_size * sizeof(float)); - stride += seq_stride; - memcpy(states + stride, output_gate, state_size * sizeof(float)); - stride += seq_stride; - memcpy(states + stride, forget_gate, state_size * sizeof(float)); - stride += seq_stride; - memcpy(states + stride, cell_gate, state_size * sizeof(float)); -} - -void LstmCPUKernel::LstmForwardLoop(float *buffer[]) { - auto *output = reinterpret_cast(out_tensors_.at(0)->data()); - auto *hidden_state = reinterpret_cast(out_tensors_.at(1)->data()); - auto *cell_state = reinterpret_cast(out_tensors_.at(C2NUM)->data()); - LstmUnidirectional(output, weight_h_ptr_, state_bias_, hidden_state, cell_state, weight_project_ptr_, - intermediate_states_, buffer, false); -} - -void LstmCPUKernel::LstmBackwardLoop(float *buffer[]) { - auto *output = reinterpret_cast(out_tensors_.at(0)->data()); - auto *hidden_state = reinterpret_cast(out_tensors_.at(1)->data()); - auto *cell_state = reinterpret_cast(out_tensors_.at(C2NUM)->data()); - const float *backward_weight_h = weight_h_ptr_ + gate_num * lstm_param_->state_col_align_ * lstm_param_->hidden_size_; - const float *backward_state_bias = state_bias_ + gate_num * lstm_param_->state_col_align_; - float *backward_output = output + lstm_param_->batch_ * 
lstm_param_->hidden_size_; - if (in_tensors_.size() == mindir_input_tensors) { - backward_output = output + lstm_param_->hidden_size_; - } - float *backward_cell_state = cell_state + lstm_param_->batch_ * lstm_param_->hidden_size_; - float *backward_hidden_state = hidden_state + lstm_param_->batch_ * lstm_param_->hidden_size_; - float *intermediate_states = nullptr; - if (intermediate_states_) { - intermediate_states = intermediate_states_ + lstm_param_->batch_ * lstm_param_->hidden_size_; - } - float *backward_weight_project = - weight_project_ptr_ - ? weight_project_ptr_ + lstm_param_->hidden_size_ * UP_ROUND(lstm_param_->project_size_, col_tile_) - : nullptr; - LstmUnidirectional(backward_output, backward_weight_h, backward_state_bias, backward_hidden_state, - backward_cell_state, backward_weight_project, intermediate_states, buffer, true); -} - -int LstmCPUKernel::ExecuteUnidirectionalOrSingleThread() { - auto ret = LstmPreProcessWithInput(weight_i_ptr_, input_bias_, buffer_forward_[input_gate_index]); - if (ret != RET_OK) { - MS_LOG(ERROR) << "LSTM Forward: Input-MatMul running failed."; - return RET_ERROR; - } - LstmForwardLoop(buffer_forward_); - - // backward - if (lstm_param_->bidirectional_) { - const float *backward_weight_i = - weight_i_ptr_ + gate_num * lstm_param_->input_col_align_ * lstm_param_->input_size_; - const float *backward_input_bias = input_bias_ + gate_num * lstm_param_->input_col_align_; - ret = LstmPreProcessWithInput(backward_weight_i, backward_input_bias, buffer_forward_[input_gate_index]); - if (ret != RET_OK) { - MS_LOG(ERROR) << "LSTM Backward: Input-MatMul running failed."; - return RET_ERROR; - } - LstmBackwardLoop(buffer_forward_); - } - return RET_OK; -} - -int LstmCPUKernel::ExecuteBidirectionalWithMultiThread() { - auto ret = LstmPreProcessWithInput(weight_i_ptr_, input_bias_, buffer_forward_[input_gate_index]); - if (ret != RET_OK) { - MS_LOG(ERROR) << "LSTM Forward: Input-MatMul running failed."; - return RET_ERROR; - } - const float *backward_weight_i = weight_i_ptr_ + gate_num * lstm_param_->input_col_align_ * lstm_param_->input_size_; - const float *backward_input_bias = input_bias_ + gate_num * lstm_param_->input_col_align_; - ret = LstmPreProcessWithInput(backward_weight_i, backward_input_bias, buffer_backward_[input_gate_index]); - if (ret != RET_OK) { - MS_LOG(ERROR) << "LSTM Backward: Input-MatMul running failed."; - return RET_ERROR; - } - ret = ParallelLaunch(this->ms_context_, LstmSequenceLoopRun, this, C2NUM); - if (ret != RET_OK) { - MS_LOG(ERROR) << "LSTM: Do sequence-loop failed."; - } - return ret; -} - -int LstmCPUKernel::Run() { - auto input = in_tensors_.at(0); - auto output = out_tensors_.at(0); - CHECK_NULL_RETURN(input); - CHECK_NULL_RETURN(output); - auto input_ptr = reinterpret_cast(input->data()); - CHECK_NULL_RETURN(input_ptr); - auto output_ptr = reinterpret_cast(output->data()); - CHECK_NULL_RETURN(output_ptr); - - auto hidden_state = in_tensors_.at(hidden_state_input_index_); - CHECK_NULL_RETURN(hidden_state->data()); - auto cell_state = in_tensors_.at(cell_state_input_index_); - CHECK_NULL_RETURN(cell_state->data()); - - auto output_hidden_state = out_tensors_[kOutputHiddenStatusIndex]; - CHECK_NULL_RETURN(output_hidden_state->data()); - (void)memcpy(output_hidden_state->data(), hidden_state->data(), hidden_state->ElementsNum() * sizeof(float)); - auto output_cell_state = out_tensors_[kOutputCellStatusIndex]; - CHECK_NULL_RETURN(output_cell_state->data()); - (void)memcpy(output_cell_state->data(), cell_state->data(), 
cell_state->ElementsNum() * sizeof(float)); - - auto ret = InitInputWeightBias(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "LstmCPUKernel InitInputWeightBias error."; - FreeRunBuffer(); - return RET_ERROR; - } - - ret = InitStateWeightBias(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "LstmCPUKernel InitStateWeightBias error."; - FreeRunBuffer(); - return RET_ERROR; + kernel = new (std::nothrow) + LstmNonMindirFp32CPUKernel(parameter, inputs, outputs, static_cast(ctx)); } - - ret = InitProjectWeight(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "LstmCPUKernel InitProjectWeight error."; - FreeRunBuffer(); - return RET_ERROR; - } - bool is_bidirectional_with_multi_thread = thread_num_ != 1 && lstm_param_->bidirectional_; - ret = MallocRunBuffer(is_bidirectional_with_multi_thread); - if (ret != RET_OK) { - MS_LOG(ERROR) << "LstmCPUKernel MallocRunBuffer Error."; - FreeRunBuffer(); - return RET_ERROR; - } - - PackLstmInput(input_ptr, packed_input_, lstm_param_->seq_len_ * lstm_param_->batch_, lstm_param_->input_size_); - if (IsTrain() && IsTrainable()) { - intermediate_states_ = reinterpret_cast(out_tensors_[out_intermediate_states_index]->data()); + if (kernel == nullptr) { + MS_LOG(ERROR) << "kernel: " << parameter->name_ << "is nullptr."; + free(parameter); + return nullptr; } - CHECK_NULL_RETURN(weight_h_ptr_); - CHECK_NULL_RETURN(weight_i_ptr_); - CHECK_NULL_RETURN(input_bias_); - CHECK_NULL_RETURN(state_bias_); - if (is_bidirectional_with_multi_thread) { - ret = ExecuteBidirectionalWithMultiThread(); - } else { - ret = ExecuteUnidirectionalOrSingleThread(); - } - FreeRunBuffer(); - return ret; + return kernel; } - -REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_LSTM, LiteKernelCreator) +REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_LSTM, LstmFp32KernelCreator) } // namespace mindspore::kernel diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_fp32_base.cc b/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_fp32_base.cc new file mode 100644 index 00000000..bd0f0e7d --- /dev/null +++ b/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_fp32_base.cc @@ -0,0 +1,398 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "src/litert/kernel/cpu/fp32/lstm_fp32_base.h" +#include +#include "include/errorcode.h" +#include "nnacl/fp32/pack_fp32.h" +#include "nnacl/fp32/matmul_fp32.h" + +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_MEMORY_FAILED; +using mindspore::lite::RET_OK; + +namespace mindspore::kernel { +namespace { +constexpr size_t kMindirInputTensorNum = 4; +constexpr int kGateNum = 4; +constexpr int kOutIntermediateStatesIndex = 3; +constexpr int kInputGateIndex = 0; +} // namespace + +int LstmSequenceLoopRun(void *cdata, int task_id, float, float) { + auto kernel = reinterpret_cast(cdata); + CHECK_NULL_RETURN(kernel); + auto ret = kernel->DoSequenceLoop(task_id); + if (ret != RET_OK) { + MS_LOG(ERROR) << "LSTM: Do Sequence-loop failed."; + } + return ret; +} + +int LstmFp32BaseCPUKernel::Prepare() { + MS_CHECK_TRUE_MSG(in_tensors_.size() == kMindirInputTensorNum || in_tensors_.size() >= C6NUM, + lite::RET_INPUT_TENSOR_ERROR, "Lstm's input-num is invalid."); + for (size_t i = 0; i < in_tensors_.size(); i++) { + CHECK_NULL_RETURN(in_tensors_.at(i)); + } + CHECK_LESS_RETURN(out_tensors_.size(), DIMENSION_3D); + for (size_t i = 0; i < out_tensors_.size(); i++) { + CHECK_NULL_RETURN(out_tensors_.at(i)); + } + CHECK_NULL_RETURN(lstm_param_); + if (!InferShapeDone()) { + return RET_OK; + } + return ReSize(); +} + +int LstmFp32BaseCPUKernel::ReSize() { + auto input = in_tensors_.front(); + std::vector in_shape = input->shape(); + MS_CHECK_TRUE_MSG(in_shape.size() == C3NUM, lite::RET_INPUT_TENSOR_ERROR, + "The dims of LSTM's first input must be 3."); + lstm_param_->seq_len_ = in_shape.at(FIRST_INPUT); + lstm_param_->batch_ = in_shape.at(SECOND_INPUT); + lstm_param_->input_size_ = in_shape.at(THIRD_INPUT); + + auto h_init_shape = in_tensors_.at(hidden_init_index_)->shape(); + auto c_init_shape = in_tensors_.at(cell_init_index_)->shape(); + lstm_param_->hidden_size_ = c_init_shape.back(); + lstm_param_->output_size_ = h_init_shape.back(); + + lstm_param_->output_step_ = lstm_param_->bidirectional_ ? C2NUM * lstm_param_->batch_ * lstm_param_->output_size_ + : lstm_param_->batch_ * lstm_param_->output_size_; + weight_segment_num_ = lstm_param_->bidirectional_ ? C2NUM * kGateNum : kGateNum; + +#ifdef ENABLE_AVX + row_tile_ = C6NUM; + col_tile_ = C16NUM; +#elif defined(ENABLE_ARM32) + row_tile_ = C12NUM; + col_tile_ = C4NUM; +#elif defined(ENABLE_SSE) + row_tile_ = C4NUM; + col_tile_ = C8NUM; +#else + row_tile_ = C12NUM; + col_tile_ = C8NUM; +#endif + lstm_param_->input_row_align_ = UP_ROUND(lstm_param_->seq_len_ * lstm_param_->batch_, row_tile_); + lstm_param_->input_col_align_ = UP_ROUND(lstm_param_->hidden_size_, col_tile_); + + state_row_tile_ = row_tile_; + state_col_tile_ = col_tile_; +#ifdef ENABLE_AVX + if (lstm_param_->batch_ == 1) { + state_row_tile_ = 1; + state_col_tile_ = C8NUM; + } +#endif + + lstm_param_->state_row_align_ = lstm_param_->batch_ == 1 ? 1 : UP_ROUND(lstm_param_->batch_, state_row_tile_); +#ifdef ENABLE_AVX + lstm_param_->state_col_align_ = UP_ROUND(lstm_param_->hidden_size_, state_col_tile_); + lstm_param_->proj_col_align_ = UP_ROUND(lstm_param_->output_size_, state_col_tile_); +#else + lstm_param_->state_col_align_ = + lstm_param_->batch_ == 1 ? lstm_param_->hidden_size_ : UP_ROUND(lstm_param_->hidden_size_, state_col_tile_); + lstm_param_->proj_col_align_ = + lstm_param_->batch_ == 1 ? 
lstm_param_->output_size_ : UP_ROUND(lstm_param_->output_size_, state_col_tile_); +#endif + return RET_OK; +} + +int LstmFp32BaseCPUKernel::Run() { + auto input = in_tensors_.at(FIRST_INPUT); + auto output = out_tensors_.at(FIRST_INPUT); + auto input_ptr = reinterpret_cast(input->data()); + CHECK_NULL_RETURN(input_ptr); + auto output_ptr = reinterpret_cast(output->data()); + CHECK_NULL_RETURN(output_ptr); + + auto hidden_state = in_tensors_.at(hidden_init_index_); + CHECK_NULL_RETURN(hidden_state->data()); + auto cell_state = in_tensors_.at(cell_init_index_); + CHECK_NULL_RETURN(cell_state->data()); + + auto output_hidden_state = out_tensors_[SECOND_INPUT]; + CHECK_NULL_RETURN(output_hidden_state->data()); + (void)memcpy(output_hidden_state->data(), hidden_state->data(), hidden_state->ElementsNum() * sizeof(float)); + auto output_cell_state = out_tensors_[THIRD_INPUT]; + CHECK_NULL_RETURN(output_cell_state->data()); + (void)memcpy(output_cell_state->data(), cell_state->data(), cell_state->ElementsNum() * sizeof(float)); + + auto ret = InitInputWeightBias(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "LstmCPUKernel InitInputWeightBias error."; + FreeRunBuffer(); + return RET_ERROR; + } + + ret = InitStateWeightBias(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "LstmCPUKernel InitStateWeightBias error."; + FreeRunBuffer(); + return RET_ERROR; + } + + ret = InitProjectWeight(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "LstmCPUKernel InitProjectWeight error."; + FreeRunBuffer(); + return RET_ERROR; + } + bool is_bidirectional_with_multi_thread = thread_num_ != 1 && lstm_param_->bidirectional_; + ret = MallocRunBuffer(is_bidirectional_with_multi_thread); + if (ret != RET_OK) { + MS_LOG(ERROR) << "LstmCPUKernel MallocRunBuffer Error."; + FreeRunBuffer(); + return RET_ERROR; + } + + PackLstmInput(input_ptr, packed_input_, lstm_param_->seq_len_ * lstm_param_->batch_, lstm_param_->input_size_); + if (IsTrain() && IsTrainable()) { + intermediate_states_ = reinterpret_cast(out_tensors_[kOutIntermediateStatesIndex]->data()); + } + CHECK_NULL_RETURN(weight_h_ptr_); + CHECK_NULL_RETURN(weight_i_ptr_); + CHECK_NULL_RETURN(input_bias_); + CHECK_NULL_RETURN(state_bias_); + if (is_bidirectional_with_multi_thread) { + ret = ExecuteBidirectionalWithMultiThread(); + } else { + ret = ExecuteUnidirectionalOrSingleThread(); + } + FreeRunBuffer(); + return ret; +} + +void LstmFp32BaseCPUKernel::FreeRunBuffer() { + for (auto data : running_buffer_) { + ms_context_->allocator->Free(data); + } + running_buffer_.clear(); +} + +int LstmFp32BaseCPUKernel::MallocRunBuffer(bool is_double) { + bool need_zone = lstm_param_->zoneout_cell_ < -FLT_EPSILON || lstm_param_->zoneout_cell_ > FLT_EPSILON; + size_t whole_size = 0; + std::vector segments; + int scale = is_double ? C2NUM : 1; + size_t segment = kGateNum * lstm_param_->seq_len_ * lstm_param_->batch_ * + lstm_param_->hidden_size_; // 0: input * weight for result matrix + segments.push_back(segment); + whole_size += segment * scale; + + segment = lstm_param_->batch_ == 1 + ? 0 + : lstm_param_->state_row_align_ * lstm_param_->output_size_; // 1: state * weight for left matrix + segments.push_back(segment); + whole_size += segment * scale; + + segment = kGateNum * lstm_param_->batch_ * lstm_param_->hidden_size_; // 2: state gate buffer + segments.push_back(segment); + whole_size += segment * scale; + + segment = need_zone ?
lstm_param_->batch_ * lstm_param_->hidden_size_ : 0; // 3: state_buffer for cell + segments.push_back(segment); + whole_size += segment * scale; + + segment = need_zone ? lstm_param_->batch_ * lstm_param_->output_size_ : 0; // 4: state_buffer for hidden + segments.push_back(segment); + whole_size += segment * scale; + + segment = 0; +#ifdef ENABLE_AVX + bool output_need_packed = lstm_param_->hidden_size_ % state_col_tile_; + if (lstm_param_->batch_ == 1 && output_need_packed) { // vec matmul need to malloc dst + int out_channel = lstm_param_->hidden_size_; + int oc_block_num = UP_DIV(out_channel, state_col_tile_); + MS_ASSERT(ms_context_->allocator != nullptr); + segment = lstm_param_->batch_ * oc_block_num * state_col_tile_; // 5: tmp output data + } +#endif + segments.push_back(segment); + whole_size += segment * scale; + + if (in_tensors_.size() == C7NUM || lstm_param_->project_size_ != 0) { + segment = lstm_param_->batch_ == 1 ? 0 : lstm_param_->state_row_align_ * lstm_param_->hidden_size_ * scale; + segments.push_back(segment); // 6: project-layer input + whole_size += segment; + segment = 0; +#ifdef ENABLE_AVX + segment = + output_need_packed ? lstm_param_->batch_ * UP_ROUND(lstm_param_->output_size_, state_col_tile_) * scale : 0; +#endif + segments.push_back(segment); // 7: project-layer output + whole_size += segment; + } else { + (void)segments.insert(segments.end(), C2NUM, 0); + } + + segment = 0; + if (in_tensors_.size() == kMindirInputTensorNum) { + segment = lstm_param_->batch_ * lstm_param_->output_size_; + } + segments.push_back(segment); + whole_size += segment * scale; + + segment = + lstm_param_->input_row_align_ * lstm_param_->input_size_; // input * weight for left matrix, which only once + whole_size += segment; + + auto whole_memory = reinterpret_cast(ms_context_->allocator->Malloc(whole_size * sizeof(float))); + MS_CHECK_TRUE_MSG(whole_memory != nullptr, RET_ERROR, "LSTM: malloc failed."); + running_buffer_.push_back(whole_memory); + MS_ASSERT(segments.size() == C9NUM); + auto Allocate = [&whole_memory, &segments](float **buffer) mutable { + for (int i = 0; i < C9NUM; ++i) { + buffer[i] = nullptr; + if (segments[i] == 0) { + continue; + } + buffer[i] = whole_memory; + whole_memory += segments[i]; + } + }; + Allocate(buffer_forward_); + if (is_double) { + Allocate(buffer_backward_); + } + packed_input_ = whole_memory; + return RET_OK; +} + +int LstmFp32BaseCPUKernel::ExecuteBidirectionalWithMultiThread() { + auto ret = LstmPreProcessWithInput(weight_i_ptr_, input_bias_, buffer_forward_[kInputGateIndex]); + if (ret != RET_OK) { + MS_LOG(ERROR) << "LSTM Forward: Input-MatMul running failed."; + return RET_ERROR; + } + const float *backward_weight_i = weight_i_ptr_ + kGateNum * lstm_param_->input_col_align_ * lstm_param_->input_size_; + const float *backward_input_bias = input_bias_ + kGateNum * lstm_param_->input_col_align_; + ret = LstmPreProcessWithInput(backward_weight_i, backward_input_bias, buffer_backward_[kInputGateIndex]); + if (ret != RET_OK) { + MS_LOG(ERROR) << "LSTM Backward: Input-MatMul running failed."; + return RET_ERROR; + } + ret = ParallelLaunch(this->ms_context_, LstmSequenceLoopRun, this, C2NUM); + if (ret != RET_OK) { + MS_LOG(ERROR) << "LSTM: Do sequence-loop failed."; + } + return ret; +} + +int LstmFp32BaseCPUKernel::ExecuteUnidirectionalOrSingleThread() { + auto ret = LstmPreProcessWithInput(weight_i_ptr_, input_bias_, buffer_forward_[kInputGateIndex]); + if (ret != RET_OK) { + MS_LOG(ERROR) << "LSTM Forward: Input-MatMul running failed."; + 
return RET_ERROR; + } + LstmForwardLoop(buffer_forward_); + + // backward + if (lstm_param_->bidirectional_) { + const float *backward_weight_i = + weight_i_ptr_ + kGateNum * lstm_param_->input_col_align_ * lstm_param_->input_size_; + const float *backward_input_bias = input_bias_ + kGateNum * lstm_param_->input_col_align_; + ret = LstmPreProcessWithInput(backward_weight_i, backward_input_bias, buffer_forward_[kInputGateIndex]); + if (ret != RET_OK) { + MS_LOG(ERROR) << "LSTM Backward: Input-MatMul running failed."; + return RET_ERROR; + } + LstmBackwardLoop(buffer_forward_); + } + return RET_OK; +} + +int LstmFp32BaseCPUKernel::LstmPreProcessWithInput(const float *weight_i, const float *input_bias, float *dst) { + const float *weight{nullptr}; + const float *bias{nullptr}; + float *gate{nullptr}; + int thread_num = MSMIN(op_parameter_->thread_num_, UP_DIV(lstm_param_->input_col_align_, col_tile_)); + MS_CHECK_FALSE(thread_num == 0, RET_ERROR); + int stride = UP_DIV(UP_DIV(lstm_param_->input_col_align_, col_tile_), thread_num); + auto MatMulCoreFunc = [this, &weight, &bias, &gate, &stride](void *, int task_id, float, float) { + int current_start_oc = task_id * stride * col_tile_; + int current_rest_oc = 0; + current_rest_oc = lstm_param_->hidden_size_ - current_start_oc; + int cur_oc = MSMIN(stride * col_tile_, current_rest_oc); + if (cur_oc <= 0) { + return RET_OK; + } + + auto b = weight + current_start_oc * lstm_param_->input_size_; + auto c = gate + current_start_oc; + auto bias_ = (bias == nullptr) ? nullptr : bias + current_start_oc; + MatMulOpt(packed_input_, b, c, bias_, ActType_No, lstm_param_->input_size_, + lstm_param_->seq_len_ * lstm_param_->batch_, cur_oc, lstm_param_->hidden_size_, OutType_Nhwc); + return RET_OK; + }; + for (int i = 0; i < kGateNum; i++) { + weight = weight_i + lstm_param_->input_size_ * lstm_param_->input_col_align_ * i; + bias = input_bias + lstm_param_->input_col_align_ * i; + gate = dst + lstm_param_->seq_len_ * lstm_param_->batch_ * lstm_param_->hidden_size_ * i; + auto ret = ParallelLaunch(this->ms_context_, MatMulCoreFunc, nullptr, thread_num); + if (ret != RET_OK) { + return RET_ERROR; + } + } + return RET_OK; +} + +int LstmFp32BaseCPUKernel::DoSequenceLoop(int task_id) { + if (task_id == 0) { + LstmForwardLoop(buffer_forward_); + return RET_OK; + } + if (task_id == 1) { + LstmBackwardLoop(buffer_backward_); + return RET_OK; + } + return RET_ERROR; +} + +void LstmFp32BaseCPUKernel::LstmForwardLoop(float *buffer[]) { + auto *output = reinterpret_cast(out_tensors_.at(FIRST_INPUT)->data()); + auto *hidden_state = reinterpret_cast(out_tensors_.at(SECOND_INPUT)->data()); + auto *cell_state = reinterpret_cast(out_tensors_.at(THIRD_INPUT)->data()); + LstmUnidirectional(output, weight_h_ptr_, state_bias_, hidden_state, cell_state, weight_project_ptr_, + intermediate_states_, buffer, false); +} + +void LstmFp32BaseCPUKernel::LstmBackwardLoop(float *buffer[]) { + auto *output = reinterpret_cast(out_tensors_.at(0)->data()); + auto *hidden_state = reinterpret_cast(out_tensors_.at(1)->data()); + auto *cell_state = reinterpret_cast(out_tensors_.at(C2NUM)->data()); + const float *backward_weight_h = weight_h_ptr_ + kGateNum * lstm_param_->state_col_align_ * lstm_param_->output_size_; + const float *backward_state_bias = state_bias_ + kGateNum * lstm_param_->state_col_align_; + float *backward_output = output + lstm_param_->batch_ * lstm_param_->output_size_; + if (in_tensors_.size() == kMindirInputTensorNum) { + backward_output = output + lstm_param_->output_size_; 
+ } + float *backward_cell_state = cell_state + lstm_param_->batch_ * lstm_param_->hidden_size_; + float *backward_hidden_state = hidden_state + lstm_param_->batch_ * lstm_param_->output_size_; + float *intermediate_states = nullptr; + if (intermediate_states_) { + intermediate_states = intermediate_states_ + lstm_param_->batch_ * lstm_param_->output_size_; + } + float *backward_weight_project = + weight_project_ptr_ ? weight_project_ptr_ + lstm_param_->hidden_size_ * lstm_param_->proj_col_align_ : nullptr; + LstmUnidirectional(backward_output, backward_weight_h, backward_state_bias, backward_hidden_state, + backward_cell_state, backward_weight_project, intermediate_states, buffer, true); +} +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_fp32_base.h b/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_fp32_base.h new file mode 100644 index 00000000..c3c10cea --- /dev/null +++ b/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_fp32_base.h @@ -0,0 +1,78 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP32_LSTM_FP32_BASE_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP32_LSTM_FP32_BASE_H_ + +#include +#include "src/litert/lite_kernel.h" +#include "nnacl/fp32/lstm_fp32.h" + +namespace mindspore::kernel { +class LstmFp32BaseCPUKernel : public LiteKernel { + public: + LstmFp32BaseCPUKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const lite::InnerContext *ctx) + : LiteKernel(parameter, inputs, outputs, ctx) { + lstm_param_ = reinterpret_cast(op_parameter_); + } + + ~LstmFp32BaseCPUKernel() override = default; + + int Prepare() override; + int ReSize() override; + int Run() override; + int DoSequenceLoop(int task_id); + + protected: + virtual int InitInputWeightBias() = 0; + virtual int InitStateWeightBias() = 0; + virtual int InitProjectWeight() = 0; + virtual void LstmUnidirectional(float *output, const float *weight_h, const float *state_bias, float *hidden_state, + float *cell_state, const float *weight_project, float *intermediate_states, + float *buffer[], bool is_backward) = 0; + + int hidden_init_index_{0}; + int cell_init_index_{0}; + int row_tile_{0}; + int col_tile_{0}; + int state_row_tile_{0}; + int state_col_tile_{0}; + int weight_segment_num_{0}; + float *weight_i_ptr_{nullptr}; + float *weight_h_ptr_{nullptr}; + float *weight_project_ptr_{nullptr}; + float *input_bias_{nullptr}; + float *state_bias_{nullptr}; + LstmParameter *lstm_param_{nullptr}; + std::vector running_buffer_; + + private: + void FreeRunBuffer(); + int MallocRunBuffer(bool is_double); + int ExecuteBidirectionalWithMultiThread(); + int ExecuteUnidirectionalOrSingleThread(); + int LstmPreProcessWithInput(const float *weight_i, const float *input_bias, float *dst); + void LstmForwardLoop(float *buffer[]); + void LstmBackwardLoop(float *buffer[]); + float *packed_input_{nullptr}; + float 
*intermediate_states_{nullptr}; + float *buffer_forward_[C9NUM] = {nullptr}; + float *buffer_backward_[C9NUM] = {nullptr}; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP32_LSTM_FP32_BASE_H_ diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_mindir_fp32.cc b/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_mindir_fp32.cc new file mode 100644 index 00000000..476d5940 --- /dev/null +++ b/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_mindir_fp32.cc @@ -0,0 +1,266 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/litert/kernel/cpu/fp32/lstm_mindir_fp32.h" +#include "nnacl/fp32/pack_fp32.h" + +namespace mindspore::kernel { +namespace { +constexpr int kInputGateIndex = 0; +constexpr int kTempHiddenOutputIndex = 8; +constexpr int kGateNum = 4; +constexpr int kWeightsIndex = 3; +const int kWeightsOrderMap[8] = {0, 2, 3, 1, 4, 6, 7, 5}; // IFGO order to IOFG order +} // namespace + +int LstmMindirFp32CPUKernel::ReSize() { + auto ret = LstmFp32BaseCPUKernel::ReSize(); + if (ret != lite::RET_OK) { + MS_LOG(ERROR) << "LstmMindirFp32CPUKernel resize failed."; + return ret; + } + // determine FB origin + gpu_orig_state_ = false; + auto weight_t = in_tensors_.at(kWeightsIndex); + MS_CHECK_INT_MUL_NOT_OVERFLOW(lstm_param_->hidden_size_, lstm_param_->input_size_, lite::RET_ERROR); + int hi_unit_size = lstm_param_->hidden_size_ * lstm_param_->input_size_; + MS_CHECK_INT_MUL_NOT_OVERFLOW(weight_segment_num_, hi_unit_size, lite::RET_ERROR); + int hi_whole_size = weight_segment_num_ * hi_unit_size; + MS_CHECK_INT_MUL_NOT_OVERFLOW(lstm_param_->hidden_size_, lstm_param_->output_size_, lite::RET_ERROR); + int hh_unit_size = lstm_param_->hidden_size_ * lstm_param_->output_size_; + MS_CHECK_INT_MUL_NOT_OVERFLOW(weight_segment_num_, hh_unit_size, lite::RET_ERROR); + int hh_whole_size = weight_segment_num_ * hh_unit_size; + int scale = lstm_param_->bidirectional_ ? C2NUM : C1NUM; + MS_CHECK_INT_MUL_NOT_OVERFLOW(lstm_param_->hidden_size_, lstm_param_->project_size_, lite::RET_ERROR); + int hp_unit_size = lstm_param_->hidden_size_ * lstm_param_->project_size_; + MS_CHECK_INT_MUL_NOT_OVERFLOW(scale, hp_unit_size, lite::RET_ERROR); + int hp_whole_size = scale * hp_unit_size; + MS_CHECK_INT_MUL_NOT_OVERFLOW(weight_segment_num_ * C2NUM, lstm_param_->hidden_size_, lite::RET_ERROR); + int bias_whole_size = weight_segment_num_ * C2NUM * lstm_param_->hidden_size_; + auto whole_size = weight_t->ElementsNum(); + bool has_bias = (hi_whole_size + hh_whole_size + hp_whole_size < whole_size) ? true : false; + // if bias exist we can determine the gpu_orig_state_ + if (has_bias) { + gpu_orig_state_ = (hi_whole_size + hh_whole_size + hp_whole_size + bias_whole_size == whole_size) ? 
true : false; + } else { + bias_whole_size = 0; + } + if (gpu_orig_state_) { + return lite::RET_OK; + } + bias_whole_size /= C2NUM; + if (hi_whole_size + hh_whole_size + hp_whole_size + bias_whole_size != whole_size) { + MS_LOG(ERROR) << "LstmMindir is invalid when original model exports from CPU."; + return lite::RET_INPUT_TENSOR_ERROR; + } + return lite::RET_OK; +} + +int LstmMindirFp32CPUKernel::InitInputWeightBias() { + // malloc and init input * weight right matrix buffer + // input -- row: seq_len * batch; col: input_size + // weight -- row: hidden_size; col: input_size, need transpose + // result -- row: seq_len * batch; col: hidden_size + weight_i_ptr_ = reinterpret_cast(ms_context_->allocator->Malloc( + weight_segment_num_ * lstm_param_->input_col_align_ * lstm_param_->input_size_ * sizeof(float))); + MS_CHECK_TRUE_MSG(weight_i_ptr_ != nullptr, lite::RET_NULL_PTR, "LstmMindirCPUKernel malloc weight_i_ptr_ failed."); + running_buffer_.push_back(weight_i_ptr_); + auto weight_data = reinterpret_cast(in_tensors_.at(kWeightsIndex)->data()); + CHECK_NULL_RETURN(weight_data); + + int hi_unit_size = lstm_param_->input_size_ * lstm_param_->hidden_size_; + int hh_unit_size = lstm_param_->hidden_size_ * lstm_param_->output_size_; + int stride = (gpu_orig_state_) ? kGateNum * (hi_unit_size + hh_unit_size) : kGateNum * hi_unit_size; + PackLstmWeightWithStride(weight_i_ptr_, weight_data, weight_segment_num_, lstm_param_->input_size_, + lstm_param_->hidden_size_, lstm_param_->input_col_align_, lstm_param_->bidirectional_, + stride, kWeightsOrderMap); + // input bias + auto bias_size = weight_segment_num_ * lstm_param_->input_col_align_ * sizeof(float); + input_bias_ = reinterpret_cast(ms_context_->allocator->Malloc(bias_size)); + MS_CHECK_TRUE_MSG(input_bias_ != nullptr, lite::RET_NULL_PTR, "LstmMindirCPUKernel malloc input_bias_ failed."); + memset(input_bias_, 0, bias_size); + running_buffer_.push_back(input_bias_); + if (!lstm_param_->has_bias_) { + return RET_OK; + } + int scale = lstm_param_->bidirectional_ ? C2NUM : C1NUM; + int offset = weight_segment_num_ * (hi_unit_size + hh_unit_size) + + scale * lstm_param_->project_size_ * lstm_param_->hidden_size_; + float *bias_data = weight_data + offset; + int b_stride = + (gpu_orig_state_) ? kGateNum * (scale * lstm_param_->hidden_size_) : kGateNum * (lstm_param_->hidden_size_); + PackLstmBiasWithStride(input_bias_, bias_data, weight_segment_num_, lstm_param_->hidden_size_, + lstm_param_->input_col_align_, lstm_param_->bidirectional_, b_stride, kWeightsOrderMap); + return RET_OK; +} + +int LstmMindirFp32CPUKernel::InitStateWeightBias() { + // malloc and init state * weight right matrix buffer, state * weight will be executed seq_len_ times. + // state -- row: batch; col: hidden_size + // weight -- row: hidden_size; col: hidden_size, need transpose + // result -- row: batch; col: hidden_size + auto weight_data = (reinterpret_cast(in_tensors_.at(kWeightsIndex)->data())); + CHECK_NULL_RETURN(weight_data); + + int hi_unit_size = lstm_param_->input_size_ * lstm_param_->hidden_size_; + int hh_unit_size = lstm_param_->hidden_size_ * lstm_param_->output_size_; + int stride = (gpu_orig_state_) ? kGateNum * (hi_unit_size + hh_unit_size) : kGateNum * hh_unit_size; + + auto weight_h_data = weight_data + (gpu_orig_state_ ? 
kGateNum * hi_unit_size : weight_segment_num_ * hi_unit_size); + + auto weight_unit_pack_size = sizeof(float) * lstm_param_->state_col_align_ * lstm_param_->output_size_; + auto weight_pack_size = weight_segment_num_ * weight_unit_pack_size; + weight_h_ptr_ = reinterpret_cast(ms_context_->allocator->Malloc(weight_pack_size)); + MS_CHECK_TRUE_MSG(weight_h_ptr_ != nullptr, lite::RET_NULL_PTR, "LstmMindirCPUKernel malloc weight_h_ptr_ failed."); + running_buffer_.push_back(weight_h_ptr_); + if (lstm_param_->batch_ != 1) { + PackLstmWeightWithStride(weight_h_ptr_, weight_h_data, weight_segment_num_, lstm_param_->output_size_, + lstm_param_->hidden_size_, lstm_param_->state_col_align_, lstm_param_->bidirectional_, + stride, kWeightsOrderMap); + } else { + for (int i = 0; i < weight_segment_num_; i++) { + const float *src_batch = weight_h_data + i * lstm_param_->hidden_size_ * lstm_param_->output_size_; + float *dst_batch = + weight_h_ptr_ + kWeightsOrderMap[i] * lstm_param_->state_col_align_ * lstm_param_->output_size_; +#ifdef ENABLE_AVX + RowMajor2Col32Major(src_batch, dst_batch, lstm_param_->hidden_size_, lstm_param_->output_size_); +#else + (void)memcpy(dst_batch, src_batch, weight_unit_pack_size); +#endif + } + } + + // state bias + auto bias_pack_size = weight_segment_num_ * lstm_param_->state_col_align_ * sizeof(float); + state_bias_ = reinterpret_cast(ms_context_->allocator->Malloc(bias_pack_size)); + MS_CHECK_TRUE_MSG(state_bias_ != nullptr, lite::RET_NULL_PTR, "LstmMindirCPUKernel malloc state_bias_ failed."); + memset(state_bias_, 0, bias_pack_size); + running_buffer_.push_back(state_bias_); + if (!lstm_param_->has_bias_ || !gpu_orig_state_) { + return RET_OK; + } + + int hi_whole_size = weight_segment_num_ * lstm_param_->hidden_size_ * lstm_param_->input_size_; + int hh_whole_size = weight_segment_num_ * lstm_param_->hidden_size_ * lstm_param_->output_size_; + int proj_size = + (lstm_param_->bidirectional_ ? C2NUM : C1NUM) * lstm_param_->project_size_ * lstm_param_->hidden_size_; + // mindir from device "GPU", secend bias is also present order IFOG + int bias_offset = hi_whole_size + hh_whole_size + proj_size + lstm_param_->hidden_size_ * kGateNum; + float *state_bias = weight_data + bias_offset; + int b_stride = kGateNum * lstm_param_->hidden_size_ * C2NUM; + PackLstmBiasWithStride(state_bias_, state_bias, weight_segment_num_, lstm_param_->hidden_size_, + lstm_param_->state_col_align_, lstm_param_->bidirectional_, b_stride, kWeightsOrderMap); + return RET_OK; +} + +int LstmMindirFp32CPUKernel::InitProjectWeight() { + if (lstm_param_->project_size_ == 0) { + return RET_OK; + } + auto weight_data = (reinterpret_cast(in_tensors_.at(kWeightsIndex)->data())); + CHECK_NULL_RETURN(weight_data); + int hi_whole_size = weight_segment_num_ * lstm_param_->hidden_size_ * lstm_param_->input_size_; + int hh_whole_size = weight_segment_num_ * lstm_param_->hidden_size_ * lstm_param_->output_size_; + auto weight_proj_data = weight_data + hi_whole_size + hh_whole_size; + int batch = lstm_param_->bidirectional_ ? 
C2NUM : C1NUM; + auto pack_size = batch * lstm_param_->hidden_size_ * lstm_param_->proj_col_align_ * sizeof(float); + if (lstm_param_->batch_ != 1) { + weight_project_ptr_ = reinterpret_cast(ms_context_->allocator->Malloc(pack_size)); + MS_CHECK_TRUE_MSG(weight_project_ptr_ != nullptr, lite::RET_NULL_PTR, + "LstmMindirCPUKernel malloc weight_project_ptr_ failed."); + running_buffer_.push_back(weight_project_ptr_); + PackLstmWeightWithStride(weight_project_ptr_, weight_proj_data, batch, lstm_param_->hidden_size_, + lstm_param_->output_size_, lstm_param_->proj_col_align_, lstm_param_->bidirectional_, + lstm_param_->hidden_size_ * lstm_param_->output_size_, nullptr); + } else { +#ifdef ENABLE_AVX + weight_project_ptr_ = reinterpret_cast(ms_context_->allocator->Malloc(pack_size)); + MS_CHECK_TRUE_MSG(weight_project_ptr_ != nullptr, lite::RET_NULL_PTR, + "LstmMindirCPUKernel malloc weight_project_ptr_ failed."); + running_buffer_.push_back(weight_project_ptr_); + for (int i = 0; i < batch; ++i) { + const float *src_batch = weight_proj_data + i * lstm_param_->hidden_size_ * lstm_param_->output_size_; + float *dst_batch = weight_project_ptr_ + i * lstm_param_->hidden_size_ * lstm_param_->proj_col_align_; + RowMajor2Col32Major(src_batch, dst_batch, lstm_param_->output_size_, lstm_param_->hidden_size_); + } +#else + weight_project_ptr_ = weight_proj_data; +#endif + } + return RET_OK; +} + +void LstmMindirFp32CPUKernel::LstmUnidirectional(float *output, const float *weight_h, const float *state_bias, + float *hidden_state, float *cell_state, const float *weight_project, + float *intermediate_states, float **buffer, bool is_backward) { + float *gate = buffer[kInputGateIndex]; + float *input_gate = gate; + float *forget_gate = gate + lstm_param_->seq_len_ * lstm_param_->batch_ * lstm_param_->hidden_size_ * C2NUM; + float *cell_gate = gate + lstm_param_->seq_len_ * lstm_param_->batch_ * lstm_param_->hidden_size_ * C3NUM; + float *output_gate = gate + lstm_param_->seq_len_ * lstm_param_->batch_ * lstm_param_->hidden_size_; + float *tmp = buffer[kTempHiddenOutputIndex]; + int dir_mult = lstm_param_->bidirectional_ ? C2NUM : C1NUM; + for (int t = 0; t < lstm_param_->seq_len_; t++) { + int real_t = is_backward ?
lstm_param_->seq_len_ - t - C1NUM : t; + float *input_gate_t = input_gate + lstm_param_->batch_ * lstm_param_->hidden_size_ * real_t; + float *forget_gate_t = forget_gate + lstm_param_->batch_ * lstm_param_->hidden_size_ * real_t; + float *cell_gate_t = cell_gate + lstm_param_->batch_ * lstm_param_->hidden_size_ * real_t; + float *output_gate_t = output_gate + lstm_param_->batch_ * lstm_param_->hidden_size_ * real_t; + // Sequence, Batch, DirMul, Hidden + LstmStepUnit(tmp, input_gate_t, forget_gate_t, cell_gate_t, output_gate_t, weight_h, state_bias, weight_project, + hidden_state, cell_state, buffer, lstm_param_); + int seq_offset = real_t * lstm_param_->batch_ * dir_mult * lstm_param_->output_size_; + for (int b = 0; b < lstm_param_->batch_; b++) { + int batch_offset = b * dir_mult * lstm_param_->output_size_; + float *output_ptr = output + seq_offset + batch_offset; + memcpy(output_ptr, tmp + b * lstm_param_->output_size_, lstm_param_->output_size_ * sizeof(float)); + } + if (intermediate_states) { + RecordStates(hidden_state, cell_state, input_gate_t, output_gate_t, forget_gate_t, cell_gate_t, + intermediate_states, real_t); + } + } +} + +void LstmMindirFp32CPUKernel::RecordStates(const float *hidden_state, float *cell_state, float *input_gate, + const float *output_gate, float *forget_gate, const float *cell_gate, + float *intermediate_states, int step) { + float *states = intermediate_states; + auto hidden_size = lstm_param_->batch_ * lstm_param_->output_size_; + auto state_size = lstm_param_->batch_ * lstm_param_->hidden_size_; + if (state_size < 0) { + MS_LOG(ERROR) << "state size should be greater than or equal to zero."; + return; + } + auto hidden_stride = step * lstm_param_->output_step_; + auto hidden_seq_stride = lstm_param_->seq_len_ * lstm_param_->output_step_; + auto other_output_step = lstm_param_->bidirectional_ ? C2NUM * lstm_param_->batch_ * lstm_param_->hidden_size_ + : lstm_param_->batch_ * lstm_param_->hidden_size_; + auto stride = step * other_output_step; + auto seq_stride = lstm_param_->seq_len_ * other_output_step; + memcpy(states + hidden_stride, hidden_state, hidden_size * sizeof(float)); + stride += hidden_seq_stride; + memcpy(states + stride, cell_state, state_size * sizeof(float)); + stride += seq_stride; + memcpy(states + stride, input_gate, state_size * sizeof(float)); + stride += seq_stride; + memcpy(states + stride, output_gate, state_size * sizeof(float)); + stride += seq_stride; + memcpy(states + stride, forget_gate, state_size * sizeof(float)); + stride += seq_stride; + memcpy(states + stride, cell_gate, state_size * sizeof(float)); +} +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_mindir_fp32.h b/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_mindir_fp32.h new file mode 100644 index 00000000..84cdd38e --- /dev/null +++ b/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_mindir_fp32.h @@ -0,0 +1,63 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP32_LSTM_MINDIR_FP32_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP32_LSTM_MINDIR_FP32_H_ + +#include +#include "src/litert/kernel/cpu/fp32/lstm_fp32_base.h" + +namespace mindspore::kernel { +/* + * 1. LSTM without project, output_size = hidden_size + * h_init: second input, shape is [bidirectional, batch_size, hidden_size] + * c_init: third input, shape is [bidirectional, batch_size, hidden_size] + * weight_bias: fourth input, weight_ih + weight_hh + bias, the gate order is IFGO + * + * 2. LSTM with project, output_size = project_size + * h_init: second input, shape is [bidirectional, batch_size, project_size] + * c_init: third input, shape is [bidirectional, batch_size, hidden_size] + * weight_bias: fourth input, weight_ih + weight_hh + proj + bias, the gate order is IFGO + */ +class LstmMindirFp32CPUKernel : public LstmFp32BaseCPUKernel { + public: + LstmMindirFp32CPUKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const lite::InnerContext *ctx) + : LstmFp32BaseCPUKernel(parameter, inputs, outputs, ctx) { + hidden_init_index_ = SECOND_INPUT; + cell_init_index_ = THIRD_INPUT; + } + + ~LstmMindirFp32CPUKernel() override = default; + + int ReSize() override; + + protected: + int InitInputWeightBias() override; + int InitStateWeightBias() override; + int InitProjectWeight() override; + void LstmUnidirectional(float *output, const float *weight_h, const float *state_bias, float *hidden_state, + float *cell_state, const float *weight_project, float *intermediate_states, float *buffer[], + bool is_backward) override; + + private: + void RecordStates(const float *hidden_state, float *cell_state, float *input_gate, const float *output_gate, + float *forget_gate, const float *cell_gate, float *intermediate_states, int step); + bool gpu_orig_state_{false}; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP32_LSTM_MINDIR_FP32_H_ diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_non_mindir_fp32.cc b/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_non_mindir_fp32.cc new file mode 100644 index 00000000..62f9f2b7 --- /dev/null +++ b/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_non_mindir_fp32.cc @@ -0,0 +1,173 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */ + +#include "src/litert/kernel/cpu/fp32/lstm_non_mindir_fp32.h" +#include "nnacl/fp32/pack_fp32.h" + +namespace mindspore::kernel { +namespace { +constexpr int kInputGateIndex = 0; +constexpr int kGateNum = 4; +constexpr int kWeightInputIndex = 1; +constexpr int kWeightHiddenindex = 2; +constexpr int kCombinedBiasIndex = 3; +} // namespace + +int LstmNonMindirFp32CPUKernel::InitInputWeightBias() { + // malloc and init input * weight right matrix buffer + // input -- row: seq_len * batch; col: input_size + // weight -- row: hidden_size; col: input_size, need transpose + // result -- row: seq_len * batch; col: hidden_size + weight_i_ptr_ = reinterpret_cast(ms_context_->allocator->Malloc( + weight_segment_num_ * lstm_param_->input_col_align_ * lstm_param_->input_size_ * sizeof(float))); + MS_CHECK_TRUE_MSG(weight_i_ptr_ != nullptr, lite::RET_NULL_PTR, + "LstmNonMindirCPUKernel malloc weight_i_ptr_ failed."); + running_buffer_.push_back(weight_i_ptr_); + auto weight_i = in_tensors_.at(kWeightInputIndex); + auto weight_i_data = reinterpret_cast(weight_i->data()); + CHECK_NULL_RETURN(weight_i_data); + + int stride = kGateNum * lstm_param_->input_size_ * lstm_param_->hidden_size_; + PackLstmWeightWithStride(weight_i_ptr_, weight_i_data, weight_segment_num_, lstm_param_->input_size_, + lstm_param_->hidden_size_, lstm_param_->input_col_align_, lstm_param_->bidirectional_, + stride, nullptr); + // input bias + input_bias_ = reinterpret_cast( + ms_context_->allocator->Malloc(weight_segment_num_ * lstm_param_->input_col_align_ * sizeof(float))); + MS_CHECK_TRUE_MSG(input_bias_ != nullptr, lite::RET_NULL_PTR, "LstmNonMindirCPUKernel malloc input_bias_ failed."); + memset(input_bias_, 0, weight_segment_num_ * lstm_param_->input_col_align_ * sizeof(float)); + running_buffer_.push_back(input_bias_); + auto bias_data = reinterpret_cast(in_tensors_.at(kCombinedBiasIndex)->data()); + CHECK_NULL_RETURN(bias_data); + PackLstmBias(input_bias_, bias_data, weight_segment_num_, lstm_param_->hidden_size_, lstm_param_->input_col_align_, + lstm_param_->bidirectional_, nullptr); + return RET_OK; +} + +int LstmNonMindirFp32CPUKernel::InitStateWeightBias() { + // malloc and init state * weight right matrix buffer, state * weight will be executed seq_len_ times. 
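Both the input-weight packing above and the state-weight packing that follows pad the output dimension of each weight segment up to the matmul tile width (input_col_align_ / state_col_align_, computed with UP_ROUND(dim, col_tile_)), zero-filling the tail so the GEMM kernels always consume full tiles. The snippet below is a minimal sketch of that padding idea only; kColTile, UpRound, and PackAligned are illustrative names, and the real PackLstmWeightWithStride additionally reorders gate segments and applies the bidirectional stride.

#include <algorithm>
#include <cstddef>
#include <vector>

constexpr int kColTile = 8;  // assumed tile width; the kernel derives the real value from col_tile_

inline int UpRound(int value, int tile) { return ((value + tile - 1) / tile) * tile; }

// Pad a row-major [rows x cols] weight segment so that `rows` becomes a multiple of the tile width.
std::vector<float> PackAligned(const std::vector<float> &src, int rows, int cols) {
  const int rows_aligned = UpRound(rows, kColTile);
  std::vector<float> dst(static_cast<std::size_t>(rows_aligned) * cols, 0.0f);  // padded tail rows stay zero
  for (int r = 0; r < rows; ++r) {
    std::copy_n(&src[static_cast<std::size_t>(r) * cols], cols, &dst[static_cast<std::size_t>(r) * cols]);
  }
  return dst;
}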
+ // state -- row: batch; col: hidden_size + // weight -- row: hidden_size; col: hidden_size, need transpose + // result -- row: batch; col: hidden_size + auto weight_h = in_tensors_.at(kWeightHiddenindex); + auto weight_h_data = reinterpret_cast(weight_h->data()); + CHECK_NULL_RETURN(weight_h_data); + + int stride = kGateNum * lstm_param_->hidden_size_ * lstm_param_->output_size_; + auto weight_pack_size = + weight_segment_num_ * lstm_param_->state_col_align_ * lstm_param_->output_size_ * sizeof(float); + if (lstm_param_->batch_ != 1) { + weight_h_ptr_ = reinterpret_cast(ms_context_->allocator->Malloc(weight_pack_size)); + MS_CHECK_TRUE_MSG(weight_h_ptr_ != nullptr, lite::RET_NULL_PTR, + "LstmNonMindirCPUKernel malloc weight_h_ptr_ failed."); + running_buffer_.push_back(weight_h_ptr_); + PackLstmWeightWithStride(weight_h_ptr_, weight_h_data, weight_segment_num_, lstm_param_->output_size_, + lstm_param_->hidden_size_, lstm_param_->state_col_align_, lstm_param_->bidirectional_, + stride, nullptr); + } else { +#ifdef ENABLE_AVX + weight_h_ptr_ = reinterpret_cast(ms_context_->allocator->Malloc(weight_pack_size)); + MS_CHECK_TRUE_MSG(weight_h_ptr_ != nullptr, lite::RET_NULL_PTR, + "LstmNonMindirCPUKernel malloc weight_h_ptr_ failed."); + running_buffer_.push_back(weight_h_ptr_); + for (int i = 0; i < weight_segment_num_; i++) { + const float *src_batch = weight_h_data + i * lstm_param_->hidden_size_ * lstm_param_->output_size_; + float *dst_batch = weight_h_ptr_ + i * lstm_param_->state_col_align_ * lstm_param_->output_size_; + RowMajor2Col32Major(src_batch, dst_batch, lstm_param_->hidden_size_, lstm_param_->output_size_); + } +#else + weight_h_ptr_ = weight_h_data; +#endif + } + + // state bias + auto bias_pack_size = weight_segment_num_ * lstm_param_->state_col_align_ * sizeof(float); + state_bias_ = reinterpret_cast(ms_context_->allocator->Malloc(bias_pack_size)); + MS_CHECK_TRUE_MSG(state_bias_ != nullptr, lite::RET_NULL_PTR, "LstmNonMindirCPUKernel malloc state_bias_ failed."); + memset(state_bias_, 0, bias_pack_size); + running_buffer_.push_back(state_bias_); + // if ONNX, secend bias is also present order IOFG + auto bias_data = reinterpret_cast(in_tensors_.at(kCombinedBiasIndex)->data()); + CHECK_NULL_RETURN(bias_data); + auto *state_bias = bias_data + kGateNum * lstm_param_->hidden_size_; + PackLstmBias(state_bias_, state_bias, weight_segment_num_, lstm_param_->hidden_size_, lstm_param_->state_col_align_, + lstm_param_->bidirectional_, nullptr); + return RET_OK; +} + +int LstmNonMindirFp32CPUKernel::InitProjectWeight() { + if (in_tensors_.size() < C7NUM) { + return RET_OK; + } + auto weight_pro = in_tensors_.at(SEVENTH_INPUT); + auto shape = weight_pro->shape(); + MS_CHECK_TRUE_MSG(shape.size() == C3NUM, lite::RET_ERROR, "Project-weight's shape must be 3D."); + auto weight_pro_data = reinterpret_cast(weight_pro->data()); + CHECK_NULL_RETURN(weight_pro_data); + int batch = lstm_param_->bidirectional_ ? 
C2NUM : C1NUM; + if (shape[0] != batch) { + MS_LOG(ERROR) << "Project-weight's shape[0] must be 1(bidirectional=false) or 2(bidirectional=true)."; + return lite::RET_ERROR; + } + int col_align = UP_ROUND(lstm_param_->output_size_, col_tile_); + auto pack_size = batch * lstm_param_->hidden_size_ * col_align * sizeof(float); + if (lstm_param_->batch_ != 1) { + weight_project_ptr_ = reinterpret_cast(ms_context_->allocator->Malloc(pack_size)); + MS_CHECK_TRUE_MSG(weight_project_ptr_ != nullptr, lite::RET_NULL_PTR, + "LstmNonMindirCPUKernel malloc weight_project_ptr_ failed."); + running_buffer_.push_back(weight_project_ptr_); + PackLstmWeightWithStride(weight_project_ptr_, weight_pro_data, batch, lstm_param_->hidden_size_, + lstm_param_->output_size_, col_align, lstm_param_->bidirectional_, + lstm_param_->hidden_size_ * lstm_param_->output_size_, nullptr); + } else { +#ifdef ENABLE_AVX + weight_project_ptr_ = reinterpret_cast(ms_context_->allocator->Malloc(pack_size)); + MS_CHECK_TRUE_MSG(weight_project_ptr_ != nullptr, lite::RET_NULL_PTR, + "LstmNonMindirCPUKernel malloc weight_project_ptr_ failed."); + running_buffer_.push_back(weight_project_ptr_); + for (int i = 0; i < batch; ++i) { + const float *src_batch = weight_pro_data + i * lstm_param_->hidden_size_ * lstm_param_->output_size_; + float *dst_batch = weight_project_ptr_ + i * lstm_param_->hidden_size_ * col_align; + RowMajor2Col32Major(src_batch, dst_batch, lstm_param_->output_size_, lstm_param_->hidden_size_); + } +#else + weight_project_ptr_ = weight_pro_data; +#endif + } + return RET_OK; +} + +void LstmNonMindirFp32CPUKernel::LstmUnidirectional(float *output, const float *weight_h, const float *state_bias, + float *hidden_state, float *cell_state, const float *weight_project, + float *intermediate_states, float **buffer, bool is_backward) { + float *gate = buffer[kInputGateIndex]; + float *input_gate = gate; + float *forget_gate = gate + lstm_param_->seq_len_ * lstm_param_->batch_ * lstm_param_->hidden_size_ * C2NUM; + float *cell_gate = gate + lstm_param_->seq_len_ * lstm_param_->batch_ * lstm_param_->hidden_size_ * C3NUM; + float *output_gate = gate + lstm_param_->seq_len_ * lstm_param_->batch_ * lstm_param_->hidden_size_; + for (int t = 0; t < lstm_param_->seq_len_; t++) { + int real_t = is_backward ? lstm_param_->seq_len_ - t - C1NUM : t; + float *input_gate_t = input_gate + lstm_param_->batch_ * lstm_param_->hidden_size_ * real_t; + float *forget_gate_t = forget_gate + lstm_param_->batch_ * lstm_param_->hidden_size_ * real_t; + float *cell_gate_t = cell_gate + lstm_param_->batch_ * lstm_param_->hidden_size_ * real_t; + float *output_gate_t = output_gate + lstm_param_->batch_ * lstm_param_->hidden_size_ * real_t; + // Sequence, DirMul, Batch, Hidden + float *output_ptr = output + real_t * lstm_param_->output_step_; + LstmStepUnit(output_ptr, input_gate_t, forget_gate_t, cell_gate_t, output_gate_t, weight_h, state_bias, + weight_project, hidden_state, cell_state, buffer, lstm_param_); + } +} +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_non_mindir_fp32.h b/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_non_mindir_fp32.h new file mode 100644 index 00000000..b16e9175 --- /dev/null +++ b/mindspore/lite/src/litert/kernel/cpu/fp32/lstm_non_mindir_fp32.h @@ -0,0 +1,61 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP32_LSTM_NON_MINDIR_FP32_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP32_LSTM_NON_MINDIR_FP32_H_ + +#include +#include "src/litert/kernel/cpu/fp32/lstm_fp32_base.h" + +namespace mindspore::kernel { +/* + * 1. LSTM without project, output_size = hidden_size + * weight_ih: second input, shape is [bidirectional, 4 * hidden_size, input_size] + * weight_hh: third input, shape is [bidirectional, 4 * hidden_size, hidden_size] + * bias: forth input, shape is [bidirectional, 8 * hidden_size] + * h_init: fifth input, shape is [bidirectional, batch_size, hidden_size] + * c_init: sixth input, shape is [bidirectional, batch_size, hidden_size] + * + * 2. LSTM with project, output_size = project_size + * weight_ih: second input, shape is [bidirectional, 4 * hidden_size, input_size] + * weight_hh: third input, shape is [bidirectional, 4 * hidden_size, project_size] + * bias: forth input, shape is [bidirectional, 8 * hidden_size] + * h_init: fifth input, shape is [bidirectional, batch_size, project_size] + * c_init: sixth input, shape is [bidirectional, batch_size, hidden_size] + * weight_pro: seventh input, shape is [bidirectional, project_size, hidden_size] + */ +class LstmNonMindirFp32CPUKernel : public LstmFp32BaseCPUKernel { + public: + LstmNonMindirFp32CPUKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const lite::InnerContext *ctx) + : LstmFp32BaseCPUKernel(parameter, inputs, outputs, ctx) { + hidden_init_index_ = FIFTH_INPUT; + cell_init_index_ = SIXTH_INPUT; + } + + ~LstmNonMindirFp32CPUKernel() override = default; + + protected: + int InitInputWeightBias() override; + int InitStateWeightBias() override; + int InitProjectWeight() override; + void LstmUnidirectional(float *output, const float *weight_h, const float *state_bias, float *hidden_state, + float *cell_state, const float *weight_project, float *intermediate_states, float *buffer[], + bool is_backward) override; +}; +} // namespace mindspore::kernel + +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP32_LSTM_NON_MINDIR_FP32_H_ diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32_grad/custom_gather_d_grad_v2_fp32.cc b/mindspore/lite/src/litert/kernel/cpu/fp32_grad/custom_gather_d_grad_v2_fp32.cc new file mode 100644 index 00000000..60d3f213 --- /dev/null +++ b/mindspore/lite/src/litert/kernel/cpu/fp32_grad/custom_gather_d_grad_v2_fp32.cc @@ -0,0 +1,147 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "src/litert/kernel/cpu/fp32_grad/custom_gather_d_grad_v2_fp32.h" +#include "src/litert//kernel_registry.h" +#include "include/errorcode.h" +#include "src/common/log_adapter.h" +#include "nnacl/custom_gather_d_grad_v2_parameter.h" + +using mindspore::lite::KernelRegistrar; +using mindspore::lite::RET_ERROR; +using mindspore::lite::RET_NOT_SUPPORT; +using mindspore::lite::RET_OK; + +namespace mindspore::kernel { +namespace { +constexpr size_t index_idx_{1}; +constexpr size_t grad_idx_{2}; +size_t get_element_num(const std::vector &shape) { + return std::accumulate(shape.begin(), shape.end(), static_cast(1), std::multiplies()); +} + +void GatherDGradCopyTask(size_t cur, std::vector *pos, float *input, int *index, const int &dim, float *output, + const std::vector &output_shape, const std::vector &out_cargo_size, + const std::vector &input_cargo_size) { + for (int i = 0; i < output_shape[cur]; ++i) { + (*pos)[cur] = i; + if (cur == output_shape.size() - 1) { + int input_offset = 0; + int out_offset = 0; + // out offset + for (size_t j = 0; j < output_shape.size(); ++j) { + out_offset += (*pos)[j] * out_cargo_size[j]; + } + // input offset + int cur_index = (*pos)[dim]; + (*pos)[dim] = index[out_offset]; + for (size_t j = 0; j < output_shape.size(); ++j) { + input_offset += (*pos)[j] * input_cargo_size[j]; + } + // do copy + input[input_offset] += output[out_offset]; + (*pos)[dim] = cur_index; + } else { + // CopyTask + GatherDGradCopyTask(cur + 1, pos, input, index, dim, output, output_shape, out_cargo_size, input_cargo_size); + } + } +} +} // namespace + +CustomGatherDGradV2CPUKernel::~CustomGatherDGradV2CPUKernel() {} + +int CustomGatherDGradV2CPUKernel::Prepare() { + CHECK_LESS_RETURN(in_tensors_.size(), C3NUM); + CHECK_LESS_RETURN(out_tensors_.size(), C1NUM); + if (InitParamter() != RET_OK) { + MS_LOG(ERROR) << "Init Built-in CustomGatherGradV2 Parameter failed." 
<< name_; + return RET_ERROR; + } + if (!InferShapeDone()) { + return RET_OK; + } + return ReSize(); +} + +int CustomGatherDGradV2CPUKernel::InitParamter() { + auto param = reinterpret_cast(op_parameter_); + axis_ = param->dim; + return RET_OK; +} + +int CustomGatherDGradV2CPUKernel::ReSize() { + index_shape_ = in_tensors_[index_idx_]->shape(); + grad_shape_ = in_tensors_[grad_idx_]->shape(); + output_shape_ = out_tensors_[0]->shape(); + if (grad_shape_.size() != index_shape_.size() || output_shape_.size() != index_shape_.size()) { + MS_LOG(ERROR) << "For '" << name_ << "', the dimension of grad and output must be the equal to the " + << "dimension of index: " << index_shape_.size() + << ", but got the dimension of grad: " << grad_shape_.size() + << ", the dimension of output: " << output_shape_.size(); + return RET_ERROR; + } + + return RET_OK; +} + +int CustomGatherDGradV2CPUKernel::Run() { + auto *index = reinterpret_cast(in_tensors_[index_idx_]->data()); + auto *grad = reinterpret_cast(in_tensors_[grad_idx_]->data()); + auto out = reinterpret_cast(out_tensors_[0]->data()); + int output_rank = output_shape_.size(); + if (axis_ >= output_rank || axis_ < -output_rank) { + MS_LOG(ERROR) << "For '" << name_ << "', the value of 'dim' must be in [" << -output_rank << ", " << output_rank + << "), but got: " << axis_; + } + if (axis_ < 0) { + axis_ = axis_ + output_rank; + } + + // check index + size_t index_size = get_element_num(index_shape_); + int max_index = output_shape_[axis_]; + for (size_t i = 0; i < index_size; ++i) { + if (index[i] >= max_index || index[i] < -max_index) { + MS_LOG(ERROR) << "For '" << name_ << "', the value of 'index' must be in [" << -max_index << ", " << max_index + << "), but got: " << index[i]; + } + if (index[i] < 0) { + index[i] = max_index + index[i]; + } + } + auto out_size = get_element_num(output_shape_); + memset(out, 0, out_size * sizeof(float)); + + // out_cargo_size + std::vector out_cargo_size = std::vector(output_shape_.size(), 1); + for (int i = static_cast(out_cargo_size.size()) - 2; i >= 0; --i) { + out_cargo_size[i] = output_shape_[i + 1] * out_cargo_size[i + 1]; + } + // grad_cargo_size + std::vector grad_cargo_size = std::vector(grad_shape_.size(), 1); + for (int i = static_cast(grad_cargo_size.size()) - 2; i >= 0; --i) { + grad_cargo_size[i] = grad_shape_[i + 1] * grad_cargo_size[i + 1]; + } + + // copy task + std::vector pos(index_shape_.size(), 0); + GatherDGradCopyTask(0, &pos, out, index, axis_, grad, index_shape_, grad_cargo_size, out_cargo_size); + return RET_OK; +} + +REG_KERNEL(kCPU, kNumberTypeFloat32, PrimType_Inner_CustomGatherDGradV2, + LiteKernelCreator) +} // namespace mindspore::kernel diff --git a/mindspore/lite/src/litert/kernel/cpu/fp32_grad/custom_gather_d_grad_v2_fp32.h b/mindspore/lite/src/litert/kernel/cpu/fp32_grad/custom_gather_d_grad_v2_fp32.h new file mode 100644 index 00000000..25666023 --- /dev/null +++ b/mindspore/lite/src/litert/kernel/cpu/fp32_grad/custom_gather_d_grad_v2_fp32.h @@ -0,0 +1,42 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP32_GRAD_CUSTOM_GATHER_D_GRAD_V2_H_ +#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP32_GRAD_CUSTOM_GATHER_D_GRAD_V2_H_ +#include +#include "src/litert/lite_kernel.h" + +namespace mindspore::kernel { +class CustomGatherDGradV2CPUKernel : public LiteKernel { + public: + CustomGatherDGradV2CPUKernel(OpParameter *parameter, const std::vector &inputs, + const std::vector &outputs, const lite::InnerContext *ctx) + : LiteKernel(parameter, inputs, outputs, ctx) {} + ~CustomGatherDGradV2CPUKernel() override; + int Prepare() override; + int ReSize() override; + int Run() override; + + private: + int InitParamter(); + + std::vector index_shape_; + std::vector grad_shape_; + std::vector output_shape_; + int axis_{0}; +}; +} // namespace mindspore::kernel +#endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_CPU_FP32_GRAD_CUSTOM_GATHER_D_GRAD_V2_H_ diff --git a/mindspore/lite/src/train/graph_fusion.cc b/mindspore/lite/src/train/graph_fusion.cc index 48c037b2..7982f818 100644 --- a/mindspore/lite/src/train/graph_fusion.cc +++ b/mindspore/lite/src/train/graph_fusion.cc @@ -25,6 +25,8 @@ #include "src/train/optimizer/fusion/reshape_gather_reshape_fusion_pass.h" #include "tools/converter/legacy_optimizer/graph/isolated_node_remove_pass.h" #include "tools/converter/legacy_optimizer/graph/subgraph_node_pass.h" +#include "src/train/optimizer/fusion/matmul_add_fusion_pass.h" +#include "src/train/optimizer/fusion/matmul_matmul_add_fusion_pass.h" namespace mindspore { namespace lite { @@ -52,7 +54,9 @@ STATUS GraphFusion::Run(schema::MetaGraphT *graph) { Optimizer fusion_optimizer; fusion_optimizer.AddPass(new (std::nothrow) ReshapeGatherReshapeFusionPass()); fusion_optimizer.AddPass(new (std::nothrow) MatMulBiasAddFusionPass()); + fusion_optimizer.AddPass(new (std::nothrow) MatMulAddFusionPass()); fusion_optimizer.AddPass(new (std::nothrow) MatMulActivationFusionPass()); + fusion_optimizer.AddPass(new (std::nothrow) MatMulMatMulAddFusionPass()); fusion_optimizer.AddPass(new (std::nothrow) IsolatedNodeRemovePass()); fusion_optimizer.AddPass(new (std::nothrow) SubgraphNodePass(old_nodes)); auto status = fusion_optimizer.Run(graph); diff --git a/mindspore/lite/src/train/optimizer/fusion/matmul_add_fusion_pass.cc b/mindspore/lite/src/train/optimizer/fusion/matmul_add_fusion_pass.cc new file mode 100644 index 00000000..34bed911 --- /dev/null +++ b/mindspore/lite/src/train/optimizer/fusion/matmul_add_fusion_pass.cc @@ -0,0 +1,127 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "src/train/optimizer/fusion/matmul_add_fusion_pass.h" +#include +#include +#include +#include +#include "schema/inner/model_generated.h" +#include "tools/common/meta_graph_utils.h" +namespace { +constexpr int kNumAddMatchPathLen = 2; +constexpr std::string_view MulName = "MATMUL"; +constexpr std::string_view AddName = "ADD"; +} // namespace +namespace mindspore { +namespace lite { +namespace { +int CalNewCnodeBias(const std::unique_ptr &add_weight_tensor, + const std::unique_ptr &matmul_bias_tensor) { + if (add_weight_tensor->dataType != kNumberTypeFloat32 || matmul_bias_tensor->dataType != kNumberTypeFloat32) { + MS_LOG(INFO) << "only support float32 data type"; + return RET_ERROR; + } + std::vector matmul_bias_shape = matmul_bias_tensor->dims; + std::vector add_weight_shape = add_weight_tensor->dims; + MS_CHECK_TRUE_RET(matmul_bias_shape == add_weight_shape, RET_ERROR); + auto add_weight_data = reinterpret_cast(add_weight_tensor->data.data()); + auto matmul_bias_data = reinterpret_cast(matmul_bias_tensor->data.data()); + int num = static_cast(matmul_bias_tensor->data.size() / sizeof(float)); + for (int i = 0; i < num; ++i) { + matmul_bias_data[i] += add_weight_data[i]; + } + return RET_OK; +} +} // namespace +STATUS MatMulAddFusionPass::Run(MetaGraphT *graph) { return FusionPass::Run(graph); } +STATUS MatMulAddFusionPass::DefinePattern() { + auto mul_op = std::make_shared(); + MS_CHECK_TRUE_RET(mul_op != nullptr, RET_NULL_PTR); + mul_op->id = MulName; + mul_op->types = {schema::PrimitiveType_MatMulFusion}; + auto add_op = std::make_shared(); + MS_CHECK_TRUE_RET(add_op != nullptr, RET_NULL_PTR); + add_op->id = AddName; + add_op->types = {schema::PrimitiveType_AddFusion}; + add_op->left = mul_op; + std::unique_ptr fusion_pattern(new (std::nothrow) FusionPattern("MatMulAddFusion")); + if (fusion_pattern == nullptr) { + MS_LOG(ERROR) << "new fusion_pattern failed"; + return RET_ERROR; + } + fusion_pattern->AddPatternOp(mul_op); + fusion_pattern->AddPatternOp(add_op); + fusion_pattern->Finish(); + this->patterns.emplace_back(fusion_pattern.release()); + return RET_OK; +} +STATUS MatMulAddFusionPass::DoFusion(MetaGraphT *graph, const std::string &pattern_name, + const std::unordered_map> &matched_path) { + MS_CHECK_TRUE_RET(graph != nullptr, RET_NULL_PTR); + if (matched_path.size() != kNumAddMatchPathLen) { + MS_LOG(ERROR) << "MatMul-Add-Fusion should have two NodeIndex in matchedPair"; + return RET_PARAM_INVALID; + } + auto mul_path_iter = matched_path.find(std::string(MulName)); + MS_CHECK_TRUE_RET(mul_path_iter != matched_path.end(), RET_NO_CHANGE); + auto &mul_path = mul_path_iter->second; + MS_CHECK_TRUE_RET(mul_path != nullptr, RET_NULL_PTR); + auto add_path_iter = matched_path.find(std::string(AddName)); + MS_CHECK_TRUE_RET(add_path_iter != matched_path.end(), RET_NO_CHANGE); + auto &add_path = add_path_iter->second; + MS_CHECK_TRUE_RET(add_path != nullptr, RET_NULL_PTR); + auto mul_index = mul_path->nodeIdx; + auto add_index = add_path->nodeIdx; + auto &mul_node = graph->nodes.at(mul_index); + MS_CHECK_TRUE_RET(mul_node != nullptr, RET_NULL_PTR); + auto &add_node = graph->nodes.at(add_index); + MS_CHECK_TRUE_RET(add_node != nullptr, RET_NULL_PTR); + if (mul_node->quantType == schema::QuantType_QUANT_ALL || mul_node->quantType == schema::QuantType_QUANT_DYNAMIC || + add_node->quantType == schema::QuantType_QUANT_ALL || add_node->quantType == schema::QuantType_QUANT_DYNAMIC) { + MS_LOG(DEBUG) << "cannot fusion."; + return RET_NO_CHANGE; + } + 
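The folding below rests on the identity (x * W + b_matmul) + b_add == x * W + (b_matmul + b_add): when the MatMul already owns a constant bias, CalNewCnodeBias above accumulates the Add's vector into it in place, and when it does not, the Add's tensor index is simply attached as the MatMul's third input. The following is a minimal, self-contained sketch of the accumulation, assuming float32 vectors of equal length; FoldBias is an illustrative name, not part of the pass.

#include <cstddef>
#include <vector>

// Fold the Add node's constant vector into the MatMul bias; both are assumed to be
// float32 and identically shaped, mirroring the checks done in CalNewCnodeBias.
std::vector<float> FoldBias(const std::vector<float> &matmul_bias, const std::vector<float> &add_bias) {
  std::vector<float> folded(matmul_bias.size());
  for (std::size_t i = 0; i < matmul_bias.size(); ++i) {
    folded[i] = matmul_bias[i] + add_bias[i];
  }
  return folded;
}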
MS_CHECK_TRUE_RET(mul_node->primitive != nullptr, RET_NULL_PTR); + auto matmul_type = mul_node->primitive->value.AsMatMulFusion(); + MS_CHECK_TRUE_RET(matmul_type->activation_type == ActivationType::ActivationType_NO_ACTIVATION, RET_NO_CHANGE); + auto add_param_shape = graph->allTensors.at(add_node->inputIndex.at(SECOND_INPUT))->dims; + MS_CHECK_TRUE_MSG(add_param_shape.size() == DIMENSION_1D, RET_NO_CHANGE, "only support bias with shape size of 1."); + if (mul_node->inputIndex.size() == C3NUM) { + auto &mul_bias_tensor = graph->allTensors.at(mul_node->inputIndex.at(THIRD_INPUT)); + if (mul_bias_tensor->data.data() == nullptr) { + MS_LOG(INFO) << mul_node->name << "'s bias is not const"; + return RET_NO_CHANGE; + } + auto &add_weight_tensor = graph->allTensors.at(add_node->inputIndex.at(SECOND_INPUT)); + if (CalNewCnodeBias(add_weight_tensor, mul_bias_tensor) != RET_OK) { + MS_LOG(INFO) << add_node->name << " failed to fusion with " << mul_node->name; + return RET_NO_CHANGE; + } + } + auto add_tensor_index = add_node->inputIndex.at(SECOND_INPUT); + if (mul_node->inputIndex.size() == C2NUM) { + mul_node->inputIndex.push_back(add_tensor_index); + } + mul_node->outputIndex = {add_node->outputIndex}; + // cannot delete node here, otherwise will destroy order in other pattern's node index + // make it an isolated node to be removed in IsolatedNodeRemovePass + add_node->inputIndex.clear(); + add_node->outputIndex.clear(); + return RET_OK; +} +MatMulAddFusionPass::~MatMulAddFusionPass() = default; +} // namespace lite +} // namespace mindspore diff --git a/mindspore/lite/src/train/optimizer/fusion/matmul_add_fusion_pass.h b/mindspore/lite/src/train/optimizer/fusion/matmul_add_fusion_pass.h new file mode 100644 index 00000000..8eb4ab2e --- /dev/null +++ b/mindspore/lite/src/train/optimizer/fusion/matmul_add_fusion_pass.h @@ -0,0 +1,37 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_LEGACY_OPTIMIZER_FUSION_MATMUL_ADD_FUSION_PASS_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_LEGACY_OPTIMIZER_FUSION_MATMUL_ADD_FUSION_PASS_H_ +#include +#include +#include +#include +#include +#include "tools/converter/legacy_optimizer/fusion/fusion_pass.h" +namespace mindspore { +namespace lite { +class MatMulAddFusionPass : public FusionPass { + public: + MatMulAddFusionPass() = default; + ~MatMulAddFusionPass() override; + STATUS DefinePattern() override; + STATUS DoFusion(MetaGraphT *graph, const std::string &pattern_name, + const std::unordered_map> &matched_path) override; + STATUS Run(MetaGraphT *graph) override; +}; +} // namespace lite +} // namespace mindspore +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_LEGACY_OPTIMIZER_FUSION_MATMUL_ADD_FUSION_PASS_H_ diff --git a/mindspore/lite/src/train/optimizer/fusion/matmul_matmul_add_fusion_pass.cc b/mindspore/lite/src/train/optimizer/fusion/matmul_matmul_add_fusion_pass.cc new file mode 100644 index 00000000..d1a63c2d --- /dev/null +++ b/mindspore/lite/src/train/optimizer/fusion/matmul_matmul_add_fusion_pass.cc @@ -0,0 +1,163 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/train/optimizer/fusion/matmul_matmul_add_fusion_pass.h" +#include +#include +#include +#include +#include "schema/inner/model_generated.h" +#include "tools/common/meta_graph_utils.h" +#include "src/train/optimizer/common/fusion_utils.h" +namespace { +constexpr std::string_view kFirstMatMulName = "MATMUL1"; +constexpr std::string_view kSecondMatMulName = "MATMUL2"; +constexpr std::string_view kAddName = "ADD"; +} // namespace +namespace mindspore { +namespace lite { +/* + * The subgraph such as the following. 
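+ * The rewrite relies on MatMul(x, B1) + MatMul(x, B2) == MatMul(x, B1 + B2), which holds when both
+ * MatMul nodes share the same first input and their constant right-hand matrices have identical
+ * shapes; CalNewMatMulNode below folds B2 into B1 in place under exactly those checks.
+ * The matched pattern and its replacement: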
+ * any any + * / \ | + * matmul matmul matmul + * \ / ----> | + * add any + * | + * any + */ +namespace { +int CalNewMatMulNode(MetaGraphT *graph, const std::unique_ptr &matmul_node1, + const std::unique_ptr &matmul_node2) { + auto &matrix_b_1 = graph->allTensors.at(matmul_node1->inputIndex.at(opt::kInputIndexOne)); + auto &matrix_b_2 = graph->allTensors.at(matmul_node2->inputIndex.at(opt::kInputIndexOne)); + if (matrix_b_1->dims != matrix_b_2->dims) { + MS_LOG(INFO) << "currently, matmul fusion only support the same shape tensor"; + return RET_ERROR; + } + if (matrix_b_1->dataType != kNumberTypeFloat32 || matrix_b_2->dataType != kNumberTypeFloat32) { + MS_LOG(INFO) << "only support float32 data type"; + return RET_ERROR; + } + auto matrix_b_1_data = reinterpret_cast(matrix_b_1->data.data()); + auto matrix_b_2_data = reinterpret_cast(matrix_b_2->data.data()); + int num_b = static_cast(matrix_b_1->data.size() / sizeof(float)); + for (int j = 0; j < num_b; ++j) { + matrix_b_1_data[j] += matrix_b_2_data[j]; + } + return RET_OK; +} +} // namespace +STATUS MatMulMatMulAddFusionPass::DefinePattern() { + auto matmul_op1 = std::make_shared(); + MS_CHECK_TRUE_RET(matmul_op1 != nullptr, RET_NULL_PTR); + matmul_op1->id = kFirstMatMulName; + matmul_op1->types = {schema::PrimitiveType_MatMulFusion}; + auto matmul_op2 = std::make_shared(); + MS_CHECK_TRUE_RET(matmul_op2 != nullptr, RET_NULL_PTR); + matmul_op2->id = kSecondMatMulName; + matmul_op2->types = {schema::PrimitiveType_MatMulFusion}; + auto add_op = std::make_shared(); + MS_CHECK_TRUE_RET(add_op != nullptr, RET_NULL_PTR); + add_op->id = kAddName; + add_op->types = {schema::PrimitiveType_AddFusion}; + add_op->left = matmul_op1; + add_op->right = matmul_op2; + auto fusion_pattern = std::make_unique("MatMulMatMulAddFusion"); + MS_CHECK_TRUE_MSG(fusion_pattern != nullptr, RET_NULL_PTR, "new fusion_pattern failed"); + fusion_pattern->AddPatternOp(matmul_op1); + fusion_pattern->AddPatternOp(matmul_op2); + fusion_pattern->AddPatternOp(add_op); + fusion_pattern->Finish(); + this->patterns.emplace_back(fusion_pattern.release()); + return RET_OK; +} + +STATUS MatMulMatMulAddFusionPass::DoFusion(MetaGraphT *graph, const std::string &pattern_name, + const std::unordered_map> &matched_path) { + MS_CHECK_TRUE_RET(graph != nullptr, RET_NULL_PTR); + if (matched_path.size() != opt::kMatchPathLenThree) { + MS_LOG(INFO) << "MatMul-MatMul-Add-Fusion should have three NodeIndex in matchedPair"; + return RET_PARAM_INVALID; + } + + size_t matmul_index1 = 0; + auto ret = opt::GetMatchNodeIndex(graph, matched_path, std::string(kFirstMatMulName), &matmul_index1); + MS_CHECK_TRUE_MSG(ret == RET_OK, ret, "cannot get matmul_index1"); + auto &matmul_node1 = graph->nodes.at(matmul_index1); + MS_CHECK_TRUE_MSG(matmul_node1 != nullptr, RET_NULL_PTR, "matmul_node1 is nullptr"); + size_t matmul_index2 = 0; + ret = opt::GetMatchNodeIndex(graph, matched_path, std::string(kSecondMatMulName), &matmul_index2); + MS_CHECK_TRUE_MSG(ret == RET_OK, ret, "cannot get matmul_index2"); + auto &matmul_node2 = graph->nodes.at(matmul_index2); + MS_CHECK_TRUE_MSG(matmul_node2 != nullptr, RET_NULL_PTR, "matmul_node2 is nullptr"); + MS_CHECK_TRUE_MSG(matmul_node1->inputIndex.size() > C1NUM && matmul_node2->inputIndex.size() > C1NUM, + RET_PARAM_INVALID, "matmul should have two input at least"); + if (matmul_node1->inputIndex.size() < matmul_node2->inputIndex.size()) { + matmul_node1.swap(matmul_node2); + } + size_t add_index = 0; + ret = opt::GetMatchNodeIndex(graph, matched_path, 
std::string(kAddName), &add_index); + MS_CHECK_TRUE_MSG(ret == RET_OK, ret, "cannot get add_index"); + auto &add_node = graph->nodes.at(add_index); + MS_CHECK_TRUE_MSG(add_node != nullptr, RET_NULL_PTR, "add_node is nullptr"); + + if (matmul_node1->quantType == schema::QuantType_QUANT_ALL || + matmul_node1->quantType == schema::QuantType_QUANT_DYNAMIC || + matmul_node2->quantType == schema::QuantType_QUANT_ALL || + matmul_node2->quantType == schema::QuantType_QUANT_DYNAMIC || + add_node->quantType == schema::QuantType_QUANT_ALL || add_node->quantType == schema::QuantType_QUANT_DYNAMIC) { + MS_LOG(DEBUG) << "cannot fusion with quant node"; + return RET_NO_CHANGE; + } + MS_CHECK_TRUE_RET(matmul_node1->primitive != nullptr, RET_NULL_PTR); + auto matmul_type1 = matmul_node1->primitive->value.AsMatMulFusion()->activation_type; + MS_CHECK_TRUE_RET(matmul_node2->primitive != nullptr, RET_NULL_PTR); + auto matmul_type2 = matmul_node2->primitive->value.AsMatMulFusion()->activation_type; + MS_CHECK_TRUE_RET(add_node->primitive != nullptr, RET_NULL_PTR); + auto add_type = add_node->primitive->value.AsAddFusion()->activation_type; + MS_CHECK_TRUE_RET(matmul_type1 == ActivationType::ActivationType_NO_ACTIVATION && + matmul_type2 == ActivationType::ActivationType_NO_ACTIVATION && + add_type == ActivationType::ActivationType_NO_ACTIVATION, + RET_NO_CHANGE); + + if (matmul_node1->inputIndex.at(FIRST_INPUT) != matmul_node2->inputIndex.at(FIRST_INPUT)) { + MS_LOG(INFO) << "matmul should have the same first input"; + return RET_NO_CHANGE; + } + auto &matmul_left_b = graph->allTensors[matmul_node1->inputIndex.at(SECOND_INPUT)]; + auto &matmul_right_b = graph->allTensors[matmul_node2->inputIndex.at(SECOND_INPUT)]; + if (matmul_left_b->data.empty() || matmul_right_b->data.empty()) { + return RET_NO_CHANGE; + } + if (CalNewMatMulNode(graph, matmul_node1, matmul_node2) != RET_OK) { + MS_LOG(INFO) << "failed to fusion two matmul"; + return RET_NO_CHANGE; + } + + matmul_node1->outputIndex = {add_node->outputIndex}; + // cannot delete node here, otherwise will destroy order in other pattern's node index + // make it an isolated node to be removed in IsolatedNodeRemovePass + matmul_node2->inputIndex.clear(); + matmul_node2->outputIndex.clear(); + add_node->inputIndex.clear(); + add_node->outputIndex.clear(); + return RET_OK; +} + +MatMulMatMulAddFusionPass::~MatMulMatMulAddFusionPass() = default; +} // namespace lite +} // namespace mindspore diff --git a/mindspore/lite/src/train/optimizer/fusion/matmul_matmul_add_fusion_pass.h b/mindspore/lite/src/train/optimizer/fusion/matmul_matmul_add_fusion_pass.h new file mode 100644 index 00000000..9ee6d711 --- /dev/null +++ b/mindspore/lite/src/train/optimizer/fusion/matmul_matmul_add_fusion_pass.h @@ -0,0 +1,43 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_SRC_TRAIN_OPTIMIZER_FUSION_MATMUL_MATMUL_ADD_FUSION_PASS_H_ +#define MINDSPORE_LITE_SRC_TRAIN_OPTIMIZER_FUSION_MATMUL_MATMUL_ADD_FUSION_PASS_H_ + +#include +#include +#include +#include +#include +#include "tools/converter/legacy_optimizer/fusion/fusion_pass.h" + +namespace mindspore { +namespace lite { +class MatMulMatMulAddFusionPass : public FusionPass { + public: + MatMulMatMulAddFusionPass() = default; + + ~MatMulMatMulAddFusionPass() override; + + STATUS DefinePattern() override; + + STATUS DoFusion(MetaGraphT *graph, const std::string &pattern_name, + const std::unordered_map> &matched_path) override; +}; +} // namespace lite +} // namespace mindspore + +#endif // MINDSPORE_LITE_SRC_TRAIN_OPTIMIZER_FUSION_MATMUL_MATMUL_ADD_FUSION_PASS_H_ diff --git a/mindspore/lite/src/train/train_export.cc b/mindspore/lite/src/train/train_export.cc index 7534ed2f..5bace006 100644 --- a/mindspore/lite/src/train/train_export.cc +++ b/mindspore/lite/src/train/train_export.cc @@ -151,11 +151,18 @@ int TrainExport::QuantTensorData(schema::TensorT *dest_tensor, const lite::Tenso return RET_OK; } -std::unique_ptr TrainExport::CreateTensor(const mindspore::lite::Tensor *tensor, - schema::Tensor *scTensor, int preferred_dim, - const int tensor_quant_type) { +std::unique_ptr TrainExport::CreateTensor( + const mindspore::lite::Tensor *tensor, const std::vector const_folded_output, + schema::Tensor *scTensor, int preferred_dim, const int tensor_quant_type) { auto tensorT = std::make_unique(); - tensorT->nodeType = scTensor->nodeType(); + bool const_fold = false; + if (quant_type_ == QT_NONE && !const_folded_output.empty() && + std::find(const_folded_output.begin(), const_folded_output.end(), tensor) != const_folded_output.end()) { + tensorT->nodeType = NodeType_ValueNode; + const_fold = true; + } else { + tensorT->nodeType = scTensor->nodeType(); + } tensorT->dims = tensor->shape(); tensorT->format = static_cast(tensor->format()); tensorT->name = tensor->tensor_name(); @@ -163,7 +170,8 @@ std::unique_ptr TrainExport::CreateTensor(const mindspore::lite tensorT->offset = 0; tensorT->dataType = tensor->data_type(); tensorT->enableHuffmanCode = false; - if ((tensorT->nodeType == NodeType_ValueNode) && (scTensor->data() != nullptr) && (scTensor->data()->size() > 0)) { + if (((tensorT->nodeType == NodeType_ValueNode) && (scTensor->data() != nullptr) && (scTensor->data()->size() > 0)) || + const_fold) { if (NeedQuantization(tensor, tensor_quant_type)) { auto ret = QuantTensorData(tensorT.get(), tensor, preferred_dim); if (ret != RET_OK) { @@ -392,6 +400,7 @@ int TrainExport::KeepGraphInputsInOrder(const Model *model) { return RET_OK; } int TrainExport::ExportTensor(const Model *model, const std::vector &tensors, int offset, + const std::vector const_folded_output, const std::vector> &map_index, const std::vector &output_names, const std::set &out_set) { std::vector in_tensors; @@ -401,6 +410,7 @@ int TrainExport::ExportTensor(const Model *model, const std::vector ordered_output_names; for (auto index : map_index) { auto id = index.first; size_t pid = id - static_cast(offset); @@ -408,7 +418,8 @@ int TrainExport::ExportTensor(const Model *model, const std::vectorgraph_.all_tensors_.at(pid); auto preferred_dim = WeightDecoder::GetPreferredDim(in_tensors, index.second.op_parameter, index.second.input_index, tensor->shape(), model->graph_.version_); - auto tensorT = CreateTensor(tensor, scTensor, preferred_dim, index.second.op_parameter->quant_type_); + auto tensorT = + CreateTensor(tensor, 
const_folded_output, scTensor, preferred_dim, index.second.op_parameter->quant_type_); if (tensorT == nullptr) { MS_LOG(ERROR) << "error in tensor creation"; return RET_ERROR; @@ -423,21 +434,27 @@ int TrainExport::ExportTensor(const Model *model, const std::vectortensor_name()) != output_names.end()) { - meta_graph_->outputIndex.push_back(remap_[id]); - if (!meta_graph_->subGraph.empty()) { - meta_graph_->subGraph[0]->outputIndices.push_back(remap_[id]); - } + ordered_output_names[tensor->tensor_name()] = remap_[id]; } meta_graph_->allTensors.emplace_back(std::move(tensorT)); if (!meta_graph_->subGraph.empty()) { meta_graph_->subGraph[0]->tensorIndices.push_back(meta_graph_->allTensors.size() - 1); } } + for (auto &output_name : output_names) { + if (ordered_output_names.find(output_name) != ordered_output_names.end()) { + meta_graph_->outputIndex.push_back(ordered_output_names[output_name]); + if (!meta_graph_->subGraph.empty()) { + meta_graph_->subGraph[0]->outputIndices.push_back(ordered_output_names[output_name]); + } + } + } return RET_OK; } int TrainExport::ExportNet(const std::vector &kernels, const std::vector &tensors, + const std::vector const_folded_output, const std::vector &output_names, const Model *model, QuantizationType quant_type, const Model *bb_model) { std::vector> map_index; @@ -498,7 +515,7 @@ int TrainExport::ExportNet(const std::vector &k } } - auto status = ExportTensor(model, tensors, offset, map_index, output_names, out_set); + auto status = ExportTensor(model, tensors, offset, const_folded_output, map_index, output_names, out_set); if (status != RET_OK) { MS_LOG(ERROR) << "ExportTensor failed."; return RET_ERROR; diff --git a/mindspore/lite/src/train/train_export.h b/mindspore/lite/src/train/train_export.h index b44f6526..8428c9b9 100644 --- a/mindspore/lite/src/train/train_export.h +++ b/mindspore/lite/src/train/train_export.h @@ -47,8 +47,10 @@ class TrainExport { explicit TrainExport(Buffer *model_buffer) : model_buffer_(model_buffer) {} virtual ~TrainExport(); int ExportNet(const std::vector &kernels, - const std::vector &tensors, const std::vector &output_names, - const Model *model, QuantizationType quant_type, const Model *bb_model = nullptr); + const std::vector &tensors, + const std::vector const_folded_output, + const std::vector &output_names, const Model *model, QuantizationType quant_type, + const Model *bb_model = nullptr); int ExportInit(const std::string model_name, std::string version); int SaveToFile(); int SaveToBuffer(); @@ -75,7 +77,9 @@ class TrainExport { int TopologicalSort(); void PrepareRemap(int offset); LiteGraph::Node *FindNode(const mindspore::kernel::KernelExec *kernel, const Model *model); - std::unique_ptr CreateTensor(const Tensor *tensor, schema::Tensor *scTensor, int preferred_dim, + std::unique_ptr CreateTensor(const Tensor *tensor, + const std::vector const_folded_output, + schema::Tensor *scTensor, int preferred_dim, const int tensor_quant_type); std::unique_ptr CreateCNode(const mindspore::kernel::KernelExec *kernel, std::vector inputIndex, std::vector outputIndex, @@ -93,6 +97,7 @@ class TrainExport { size_t *target_index); int KeepGraphInputsInOrder(const Model *model); int ExportTensor(const Model *model, const std::vector &tensors, int offset, + const std::vector const_folded_output, const std::vector> &map_index, const std::vector &output_names, const std::set &out_set); virtual int QuantTensorData(schema::TensorT *dest_tensor, const mindspore::lite::Tensor *src_tensor, diff --git 
a/mindspore/lite/src/train/train_session.cc b/mindspore/lite/src/train/train_session.cc index b581b389..c123cba8 100644 --- a/mindspore/lite/src/train/train_session.cc +++ b/mindspore/lite/src/train/train_session.cc @@ -399,6 +399,8 @@ int TrainSession::CompileTrainGraph(std::shared_ptr model) { MS_LOG(ERROR) << "failed to allocate space"; return RET_ERROR; } + // Prepare a list of kernels which are const folded + MS_CHECK_TRUE_MSG(CompileConstFoldedKernels() == RET_OK, RET_ERROR, "CompileConstFoldedKernels failed."); return RET_OK; } @@ -697,20 +699,30 @@ void TrainSession::CompileEvalOutputs() { } if (is_loss) continue; // insert if not already in - if (eval_output_node_map_.find(in_kernel->name()) == eval_output_node_map_.end()) { - auto *ms_tensor = in_kernel->out_tensors().at(0); - if (ms_tensor != nullptr) { - ms_tensor->set_init_ref_count(ms_tensor->init_ref_count() + 1); - eval_output_node_map_[in_kernel->name()].emplace_back(ms_tensor); - auto index = TSFindTensor(tensors_, ms_tensor); - if (index != tensors_.size()) { - if (!ms_tensor->tensor_name().empty()) { - eval_output_tensor_map_.insert(std::make_pair(ms_tensor->tensor_name(), ms_tensor)); - eval_output_tensor_names_.emplace_back(ms_tensor->tensor_name()); - } else { - eval_output_tensor_map_.insert(std::make_pair(std::to_string(index), ms_tensor)); - eval_output_tensor_names_.emplace_back(std::to_string(index)); - } + auto out_tensors = TSFindTensors(in_kernel, kernel); + if (eval_output_node_map_.find(in_kernel->name()) != eval_output_node_map_.end()) { + auto exist_out_tensors = eval_output_node_map_[in_kernel->name()]; + std::vector all_out_tensors; + auto kernel_all_out_tensors = in_kernel->out_tensors(); + eval_output_node_map_[in_kernel->name()] = {}; + for (auto tensor : kernel_all_out_tensors) { + if (std::find(out_tensors.begin(), out_tensors.end(), tensor) != out_tensors.end() || + std::find(exist_out_tensors.begin(), exist_out_tensors.end(), tensor) != exist_out_tensors.end()) { + eval_output_node_map_[in_kernel->name()].emplace_back(tensor); + } + } + } else { + eval_output_node_map_[in_kernel->name()] = out_tensors; + } + for (auto out_tensor : out_tensors) { + auto index = TSFindTensor(tensors_, out_tensor); + if (index != tensors_.size()) { + if (!out_tensor->tensor_name().empty()) { + eval_output_tensor_map_.insert(std::make_pair(out_tensor->tensor_name(), out_tensor)); + eval_output_tensor_names_.emplace_back(out_tensor->tensor_name()); + } else { + eval_output_tensor_map_.insert(std::make_pair(std::to_string(index), out_tensor)); + eval_output_tensor_names_.emplace_back(std::to_string(index)); } } } @@ -863,6 +875,35 @@ void TrainSession::CompileOptimizedKernels() { } } +int TrainSession::CompileConstFoldedKernels() { + const_output_tensors_.clear(); + for (auto kernel : this->inference_kernels_) { + bool is_input_const = true; + for (auto input : kernel->in_tensors()) { + if ((!input->IsConst() || input->IsGraphInput()) && + std::find(const_output_tensors_.begin(), const_output_tensors_.end(), input) == const_output_tensors_.end()) { + is_input_const = false; + } + if (!is_input_const) { + const_fold_kernels_.emplace_back(kernel); + break; + } + } + if (is_input_const) { + auto ret = kernel->Execute(); + if (RET_OK != ret) { + MS_LOG(ERROR) << "run kernel failed, name: " << kernel->name(); + return ret; + } + for (auto output : kernel->out_tensors()) { + const_output_tensors_.emplace_back(output); + output->set_category(Category::CONST_TENSOR); + } + } + } + return RET_OK; +} + void 
TrainSession::CompileTrainableParams() { for (auto kernel : this->train_kernels_) { if (!IsOptimizer(kernel)) { @@ -1214,9 +1255,10 @@ int TrainSession::ExportByDifferentType(DestType destination, ModelType model_ty TRAIN_SESSION_CHECK_FALSE_MSG(status != RET_OK, status, "Fail to init export"); if (!output_tensor_name.empty() && model_type == MT_INFERENCE) { std::vector export_kernels = {}; - status = FindExportKernels(&export_kernels, output_tensor_name, inference_kernels_); + status = FindExportKernels(&export_kernels, output_tensor_name, const_fold_kernels_); TRAIN_SESSION_CHECK_FALSE_MSG(status != RET_OK, status, "FindExportKernels failed."); - status = texport.ExportNet(export_kernels, tensors_, output_tensor_name, model_.get(), quant_type); + status = + texport.ExportNet(export_kernels, tensors_, const_output_tensors_, output_tensor_name, model_.get(), quant_type); } else { if (!output_tensor_name.empty() && model_type == MT_TRAIN) { MS_LOG(WARNING) << "Train model does not support to export selected output tensor, and all of the train kernels " @@ -1234,9 +1276,15 @@ int TrainSession::ExportByDifferentType(DestType destination, ModelType model_ty } return status; } else { - status = texport.ExportNet((model_type == MT_TRAIN) ? train_kernels_ : inference_kernels_, tensors_, - (model_type == MT_TRAIN) ? train_output_tensor_names_ : eval_output_tensor_names_, - model_.get(), quant_type); + if (quant_type == QT_NONE) { + status = texport.ExportNet( + (model_type == MT_TRAIN) ? train_kernels_ : const_fold_kernels_, tensors_, const_output_tensors_, + (model_type == MT_TRAIN) ? train_output_tensor_names_ : eval_output_tensor_names_, model_.get(), quant_type); + } else { + status = texport.ExportNet((model_type == MT_TRAIN) ? train_kernels_ : inference_kernels_, tensors_, {}, + (model_type == MT_TRAIN) ? 
train_output_tensor_names_ : eval_output_tensor_names_, + model_.get(), quant_type); + } } } TRAIN_SESSION_CHECK_FALSE_MSG(status != RET_OK, status, "Fail to export Network."); @@ -1322,14 +1370,13 @@ int TrainSession::ExportWeightsCollaborateWithMicro(const std::string &file_name MS_CHECK_FALSE_MSG(format != FT_FLATBUFFERS, RET_ERROR, "File name cannot be empty"); MS_CHECK_FALSE_MSG(model_type != mindspore::lite::MT_INFERENCE, RET_ERROR, "Currently, can only export inference-model's weights."); - int status = Eval(); - TRAIN_SESSION_CHECK_FALSE_MSG(status != RET_OK, status, "Eval failed"); TrainExport texport(file_name); - status = texport.ExportInit(model_.get()->graph_.name_, model_.get()->graph_.version_); + auto status = texport.ExportInit(model_.get()->graph_.name_, model_.get()->graph_.version_); TRAIN_SESSION_CHECK_FALSE_MSG(status != RET_OK, status, "Fail to init export"); - status = texport.ExportNet(inference_kernels_, tensors_, eval_output_tensor_names_, model_.get(), QT_DEFAULT); + status = texport.ExportNet(const_fold_kernels_, tensors_, const_output_tensors_, eval_output_tensor_names_, + model_.get(), QT_NONE); TRAIN_SESSION_CHECK_FALSE_MSG(status != RET_OK, status, "Fail to export Network."); status = texport.TrainModelDrop(); TRAIN_SESSION_CHECK_FALSE_MSG(status != RET_OK, status, "TrainModelDrop failed."); diff --git a/mindspore/lite/src/train/train_session.h b/mindspore/lite/src/train/train_session.h index 24f10065..0bd14b21 100644 --- a/mindspore/lite/src/train/train_session.h +++ b/mindspore/lite/src/train/train_session.h @@ -128,6 +128,7 @@ class TrainSession : virtual public lite::LiteSession { virtual int CompileInferenceKernels(); virtual void CompileOptimizedKernels(); virtual void CompileTrainableParams(); + virtual int CompileConstFoldedKernels(); virtual void CompileTrainOutputs(); virtual void CompileEvalOutputs(); virtual int InitCallBack(); @@ -146,6 +147,8 @@ class TrainSession : virtual public lite::LiteSession { std::vector inference_kernels_; std::vector train_kernels_; + std::vector const_fold_kernels_; + std::vector const_output_tensors_; TrainCfg cfg_; private: diff --git a/mindspore/lite/src/train/train_utils.cc b/mindspore/lite/src/train/train_utils.cc index 32c4a502..cb7b669a 100644 --- a/mindspore/lite/src/train/train_utils.cc +++ b/mindspore/lite/src/train/train_utils.cc @@ -204,5 +204,20 @@ int ScaleTensor(Tensor *tensor, float scale) { MS_LOG(DEBUG) << "Scale tensor: " << tensor->tensor_name() << " " << scale; return tensor->Scale(scale); } + +std::vector TSFindTensors(const kernel::KernelExec *pre_kernel, const kernel::KernelExec *post_kernel) { + MS_ASSERT(pre_kernel != nullptr); + MS_ASSERT(post_kernel != nullptr); + auto out_tensors = pre_kernel->out_tensors(); + auto in_tensors = post_kernel->in_tensors(); + std::vector res; + for (auto tensor : out_tensors) { + if (std::find(in_tensors.begin(), in_tensors.end(), tensor) == in_tensors.end()) { + continue; + } + res.push_back(tensor); + } + return res; +} } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/src/train/train_utils.h b/mindspore/lite/src/train/train_utils.h index 5c85738f..9b2d62dc 100644 --- a/mindspore/lite/src/train/train_utils.h +++ b/mindspore/lite/src/train/train_utils.h @@ -36,6 +36,7 @@ float CalculateSparseClassification(lite::Tensor *input, lite::Tensor *output); float CalculateOneHotClassification(lite::Tensor *input, lite::Tensor *output); Tensor *CastTensor(Tensor *tensor, TypeId dst_data_type, bool support_fp16); int ScaleTensor(Tensor 
*tensor, float scale); +std::vector TSFindTensors(const kernel::KernelExec *pre_kernel, const kernel::KernelExec *post_kernel); } // namespace lite } // namespace mindspore #endif // MINDSPORE_LITE_SRC_TRAIN_TRAIN_UTILS_H_ diff --git a/mindspore/lite/src/train/transfer_session.cc b/mindspore/lite/src/train/transfer_session.cc index 48191b4f..b1cb7b3e 100644 --- a/mindspore/lite/src/train/transfer_session.cc +++ b/mindspore/lite/src/train/transfer_session.cc @@ -230,10 +230,10 @@ int TransferSession::ExportInner(DestType destination, ModelType model_type, Qua MS_LOG(ERROR) << "FindExportKernels failed."; return RET_ERROR; } - status = texport.ExportNet(export_kernels, tensors_, out_put_tensor_name, model_.get(), quant_type, + status = texport.ExportNet(export_kernels, tensors_, {}, out_put_tensor_name, model_.get(), quant_type, backbone_session_->model_); } else { - status = texport.ExportNet(inference_kernels_, tensors_, GetOutputTensorNames(), model_.get(), quant_type, + status = texport.ExportNet(inference_kernels_, tensors_, {}, GetOutputTensorNames(), model_.get(), quant_type, backbone_session_->model_); } if (status != RET_OK) { diff --git a/mindspore/lite/tools/common/string_util.cc b/mindspore/lite/tools/common/string_util.cc index 8d7076e5..13cddb3a 100644 --- a/mindspore/lite/tools/common/string_util.cc +++ b/mindspore/lite/tools/common/string_util.cc @@ -199,5 +199,9 @@ size_t Hex2ByteArray(const std::string &hex_str, unsigned char *byte_array, size } return byte_len; } + +bool IsNumber(const std::string &item) { + return std::all_of(item.begin(), item.end(), [](char ch) { return ch >= '0' && ch <= '9'; }); +} } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/tools/common/string_util.h b/mindspore/lite/tools/common/string_util.h index 0fb9c0b2..95bdd742 100644 --- a/mindspore/lite/tools/common/string_util.h +++ b/mindspore/lite/tools/common/string_util.h @@ -45,6 +45,8 @@ bool ConvertBool(std::string str, bool *value); bool ConvertDoubleVector(const std::string &str, std::vector *value); size_t Hex2ByteArray(const std::string &hex_str, unsigned char *byte_array, size_t max_len); + +bool IsNumber(const std::string &item); } // namespace lite } // namespace mindspore #endif // MINDSPORE_LITE_TOOLS_COMMON_STRING_UTIL_H_ diff --git a/mindspore/lite/tools/converter/anf_transform.cc b/mindspore/lite/tools/converter/anf_transform.cc index c4f84163..b63912fa 100644 --- a/mindspore/lite/tools/converter/anf_transform.cc +++ b/mindspore/lite/tools/converter/anf_transform.cc @@ -135,6 +135,7 @@ #include "tools/common/string_util.h" #include "src/common/common.h" #include "tools/optimizer/graph/miniaturization_pass.h" +#include "tools/optimizer/fusion/tile_matmul_fusion.h" using std::string; namespace mindspore::lite { @@ -317,7 +318,8 @@ std::vector InitFusions(const std::shared_ptr ¶ std::make_shared(), std::make_shared(), std::make_shared(), - std::make_shared()}; + std::make_shared(), + std::make_shared()}; if (param->optimize_transformer) { fusions.push_back(std::make_shared()); fusions.push_back(std::make_shared()); diff --git a/mindspore/lite/tools/converter/config_parser/config_file_parser.cc b/mindspore/lite/tools/converter/config_parser/config_file_parser.cc index 2e7ca749..7b47fb8c 100644 --- a/mindspore/lite/tools/converter/config_parser/config_file_parser.cc +++ b/mindspore/lite/tools/converter/config_parser/config_file_parser.cc @@ -19,10 +19,10 @@ #include "include/errorcode.h" #include "src/common/log_adapter.h" #include 
"tools/converter/converter_context.h" - #include "tools/common/string_util.h" #include "src/common/config_infos.h" #include "src/common/common.h" +#include "nnacl/op_base.h" namespace mindspore { namespace lite { @@ -208,6 +208,75 @@ void SetDynParams(const std::shared_ptr ¶m, } } +int ParseInputShapeTemplate(const std::string &shape_template, std::set *dynamic_symbols) { + // the inputs_shape config is like: input1:[d0,d1,3];input2:[4,d0] + auto graph_inputs_shape_vec = SplitStringToVector(shape_template, ';'); + for (const auto &graph_input_shape : graph_inputs_shape_vec) { + auto graph_input_shape_info = SplitStringToVector(graph_input_shape, ':'); + MS_CHECK_TRUE_MSG(graph_input_shape_info.size() == kIndex2, RET_INPUT_PARAM_INVALID, "the inputs_shape is invalid"); + auto input_shape = graph_input_shape_info[1]; + if (input_shape[0] != '[' || input_shape[input_shape.size() - 1] != ']') { + MS_LOG(ERROR) << "the inputs_shape is invalid"; + return RET_INPUT_PARAM_INVALID; + } + input_shape = input_shape.substr(1, input_shape.size() - kIndex2); + auto input_shape_vec = SplitStringToVector(input_shape, ','); + for (const auto &shape : input_shape_vec) { + if (!IsNumber(shape)) { + dynamic_symbols->insert(shape); + } + } + } + return RET_OK; +} + +int ParseDynmiacDimTemplate(const std::string &dims_template, std::set *dynamic_symbols, + MicroParamString *micro_param_string) { + // the dynamic_dim_params config is like: d0:[1,3~6];d1:[1~8] + auto dim_info_vec = SplitStringToVector(dims_template, ';'); + MS_CHECK_TRUE_MSG(dim_info_vec.size() <= kIndex2, RET_NOT_SUPPORT, "currently, only support to set two dynamic dims"); + for (const auto &dim_info : dim_info_vec) { + auto dim_vec = SplitStringToVector(dim_info, ':'); + MS_CHECK_TRUE_MSG(dim_vec.size() == kIndex2, RET_INPUT_PARAM_INVALID, "the dynamic_dim_params is invalid"); + std::string symbol = dim_vec[0]; + if (dynamic_symbols->find(symbol) == dynamic_symbols->end()) { + MS_LOG(ERROR) << symbol << "is invalid, because it's not set in the inputs_shape."; + return RET_INPUT_PARAM_INVALID; + } + std::string dim_range = dim_vec[1]; + if (dim_range[0] != '[' || dim_range[dim_range.size() - 1] != ']') { + MS_LOG(ERROR) << "the dynamic_dim_params is invalid"; + return RET_INPUT_PARAM_INVALID; + } + dim_range = dim_range.substr(1, dim_range.size() - kIndex2); + auto discrete_vec = SplitStringToVector(dim_range, ','); + for (const auto &dim : discrete_vec) { + auto continuous_dim = SplitStringToVector(dim, '~'); + MS_CHECK_TRUE_MSG(continuous_dim.size() == C1NUM || continuous_dim.size() == kIndex2, RET_INPUT_PARAM_INVALID, + "the dynamic_dim_params is invalid"); + if (continuous_dim.size() == C1NUM) { + if (!IsNumber(continuous_dim[0]) || std::stoi(continuous_dim[0]) <= 0) { + MS_LOG(ERROR) << "the dynamic_dim_params range value must be greater than 0"; + return RET_INPUT_PARAM_INVALID; + } + micro_param_string->dynamic_symbols_map[symbol] += continuous_dim[0] + ","; + continue; + } + if (!IsNumber(continuous_dim[0]) || std::stoi(continuous_dim[0]) <= 0 || !IsNumber(continuous_dim[1]) || + std::stoi(continuous_dim[1]) <= 0) { + MS_LOG(ERROR) << "the dynamic_dim_params range value must be greater than 0"; + return RET_INPUT_PARAM_INVALID; + } + auto start = std::stoi(continuous_dim[0]); + auto end = std::stoi(continuous_dim[1]); + for (auto i = start; i <= end; ++i) { + micro_param_string->dynamic_symbols_map[symbol] += std::to_string(i) + ","; + } + } + } + return RET_OK; +} + void ConfigFileParser::SetParamByConfigfile(const std::shared_ptr ¶m, 
const std::map &ascend_map) { std::string ascend_string = ""; @@ -377,8 +446,12 @@ int ConfigFileParser::ParseConfigParam(std::map &input_map, - const std::map &parse_map, const std::string §ion) { + const std::map &parse_map, const std::string §ion, + const std::set &dynamic_key) { for (const auto &map : input_map) { + if (dynamic_key.find(map.first) != dynamic_key.end()) { + continue; + } if (parse_map.find(map.first) == parse_map.end()) { MS_LOG(ERROR) << "INPUT ILLEGAL: `" << map.first << "` is not supported in " << "[" << section << "]"; @@ -511,21 +584,34 @@ int ConfigFileParser::ParseAclOptionCfgString(const std::map> &maps) { - if (maps.find(kMicroParam) != maps.end()) { - const auto &map = maps.at(kMicroParam); - std::map parse_map{ - {"target", micro_param_string_.target}, - {"codegen_mode", micro_param_string_.codegen_mode}, - {"debug_mode", micro_param_string_.debug_mode}, - {"support_parallel", micro_param_string_.support_parallel}, - {"enable_micro", micro_param_string_.enable_micro}, - {"save_path", micro_param_string_.save_path}, - {"project_name", micro_param_string_.project_name}, - {"keep_original_weight", micro_param_string_.keep_original_weight}, - {"changeable_weights_name", micro_param_string_.changeable_weights_name}}; - return SetMapData(map, parse_map, kMicroParam); + if (maps.find(kMicroParam) == maps.end()) { + return RET_OK; } - return RET_OK; + const auto &map = maps.at(kMicroParam); + const std::string graph_inputs_shape_template = "inputs_shape"; + std::set dynamic_symbols; + if (map.find(graph_inputs_shape_template) != map.end()) { + const auto &shape_template = map.at(graph_inputs_shape_template); + ParseInputShapeTemplate(shape_template, &dynamic_symbols); + } + const std::string dynamic_dims = "dynamic_dim_params"; + if (!dynamic_symbols.empty() && map.find(dynamic_dims) != map.end()) { + const auto &dims_template = map.at(dynamic_dims); + ParseDynmiacDimTemplate(dims_template, &dynamic_symbols, µ_param_string_); + } + std::map parse_map{ + {"target", micro_param_string_.target}, + {"codegen_mode", micro_param_string_.codegen_mode}, + {"debug_mode", micro_param_string_.debug_mode}, + {"support_parallel", micro_param_string_.support_parallel}, + {"enable_micro", micro_param_string_.enable_micro}, + {"save_path", micro_param_string_.save_path}, + {"project_name", micro_param_string_.project_name}, + {"keep_original_weight", micro_param_string_.keep_original_weight}, + {"changeable_weights_name", micro_param_string_.changeable_weights_name}, + {"inputs_shape", micro_param_string_.inputs_shape}, + {"dynamic_dim_params", micro_param_string_.dynamic_dim_params}}; + return SetMapData(map, parse_map, kMicroParam); } int ConfigFileParser::ParseWeightQuantString(const std::map> &maps) { diff --git a/mindspore/lite/tools/converter/config_parser/config_file_parser.h b/mindspore/lite/tools/converter/config_parser/config_file_parser.h index 6997bac8..163782b7 100644 --- a/mindspore/lite/tools/converter/config_parser/config_file_parser.h +++ b/mindspore/lite/tools/converter/config_parser/config_file_parser.h @@ -108,17 +108,20 @@ struct MicroParamString { std::string project_name; std::string keep_original_weight; std::string changeable_weights_name; + std::string inputs_shape; + std::string dynamic_dim_params; + std::map dynamic_symbols_map; }; struct ThirdPartyModelString { std::string input_dtypes; std::string input_shapes; - std::string input_names; // optional, default: "" + std::string input_names; // optional, default: "" std::string input_formats; // optional, 
default: NHWC std::string output_dtypes; std::string output_shapes; - std::string output_names; // optional, default: "" - std::string output_formats; // optional, default: NHWC + std::string output_names; // optional, default: "" + std::string output_formats; // optional, default: NHWC std::string extended_parameters; // format: {key1:value1;ker2:value2} }; @@ -172,7 +175,8 @@ class ConfigFileParser { int ParseRegistryInfoString(const std::map> &maps); int ParseAclOptionCfgString(const std::map> &maps); int SetMapData(const std::map &input_map, - const std::map &parse_map, const std::string §ion); + const std::map &parse_map, const std::string §ion, + const std::set &dynamic_key = {}); int ParseMicroParamString(const std::map> &maps); int ParseThirdPartyParamString(const std::map> §ions); int ParseCpuOptionCfgString(const std::map> &maps); diff --git a/mindspore/lite/tools/converter/config_parser/micro_param_parser.cc b/mindspore/lite/tools/converter/config_parser/micro_param_parser.cc index c9998cc8..903f2863 100644 --- a/mindspore/lite/tools/converter/config_parser/micro_param_parser.cc +++ b/mindspore/lite/tools/converter/config_parser/micro_param_parser.cc @@ -19,6 +19,7 @@ #include "tools/common/string_util.h" #include "src/common/log_adapter.h" #include "src/common/log_util.h" +#include "nnacl/op_base.h" namespace mindspore { namespace lite { @@ -115,6 +116,80 @@ STATUS MicroParamParser::ParseChangeableWeightsName(const std::string &changeabl return RET_OK; } +STATUS MicroParamParser::ParseGraphInputsShapeTemplate(const std::string &graph_inputs_shape_template, + const std::map &dynamic_symbols_map, + micro::MicroParam *micro_param) { + MS_LOG(DEBUG) << "Micro record inputs shape: " << graph_inputs_shape_template; + if (!graph_inputs_shape_template.empty()) { + auto graph_inputs_shape_vec = SplitStringToVector(graph_inputs_shape_template, ';'); + std::map> graph_inputs_info; + std::vector> graph_inputs_shape; + std::vector inputs_name; + for (const auto &graph_input_shape : graph_inputs_shape_vec) { + auto input_shape_info = SplitStringToVector(graph_input_shape, ':'); + std::string input_name = input_shape_info[0]; + std::string input_shape = input_shape_info[1].substr(1, input_shape_info[1].size() - C2NUM); + auto input_shape_vec = SplitStringToVector(input_shape, ','); + graph_inputs_info[input_name] = input_shape_vec; + graph_inputs_shape.push_back(input_shape_vec); + inputs_name.push_back(input_name); + } + micro_param->graph_inputs_origin_info = graph_inputs_info; + micro_param->inputs_shape_by_scenes.clear(); + std::map> symbols_to_num; + std::map symbols_index; + std::vector symbols; + std::vector scene_num_by_symbol; + int index = 0; + size_t scene_num = 1; + for (const auto &item : dynamic_symbols_map) { + symbols_index[item.first] = index++; + symbols.push_back(item.first); + auto num_str_list = SplitStringToVector(item.second, ','); + for (const auto &num_str : num_str_list) { + symbols_to_num[item.first].push_back(std::stoi(num_str)); + } + if (symbols_to_num[item.first].empty()) { + MS_LOG(ERROR) << "Micro param invalid, dynamic symbol must have value."; + return RET_INPUT_PARAM_INVALID; + } + scene_num_by_symbol.push_back(symbols_to_num[item.first].size()); + scene_num *= symbols_to_num[item.first].size(); + } + micro_param->dynamic_symbols = symbols; + micro_param->dynamic_symbols_num = scene_num_by_symbol; + std::vector post_multi(symbols.size(), 1); + for (int i = static_cast(post_multi.size()) - 2; i >= 0; --i) { + post_multi[i] = post_multi[i + 1] * 
scene_num_by_symbol[i + 1]; + } + std::vector real_num(symbols.size()); + for (size_t i = 0; i < scene_num; ++i) { + size_t remain = i; + for (size_t j = 0; j < symbols.size(); ++j) { + real_num[j] = remain / post_multi[j]; + remain %= post_multi[j]; + } + for (size_t j = 0; j < graph_inputs_shape.size(); ++j) { + const auto &input_template = graph_inputs_shape[j]; + std::vector input_shape; + for (const auto &dim : input_template) { + if (IsNumber(dim)) { + input_shape.push_back(std::stoi(dim)); + continue; + } + if (symbols_index.find(dim) == symbols_index.end()) { + MS_LOG(ERROR) << "Dynamic symbol cannot find real num."; + return RET_INPUT_PARAM_INVALID; + } + input_shape.push_back(symbols_to_num[dim][real_num[symbols_index[dim]]]); + } + micro_param->inputs_shape_by_scenes[inputs_name[j]].push_back(input_shape); + } + } + } + return RET_OK; +} + STATUS MicroParamParser::ParseMicroParam(const MicroParamString µ_param_string, micro::MicroParam *micro_param) { CHECK_NULL_RETURN(micro_param); if (ParseTarget(micro_param_string.target, micro_param) != RET_OK) { @@ -145,9 +220,11 @@ STATUS MicroParamParser::ParseMicroParam(const MicroParamString µ_param_str MS_LOG(ERROR) << "Parse project name val failed: " << micro_param_string.project_name; return RET_INPUT_PARAM_INVALID; } - if (ParseKeepOriginalWeight(micro_param_string.keep_original_weight, micro_param) != RET_OK) { - MS_LOG(ERROR) << "Parse keep_original_weight failed, the val: " << micro_param_string.keep_original_weight; - return RET_INPUT_PARAM_INVALID; + if (!micro_param_string.keep_original_weight.empty()) { + if (ParseKeepOriginalWeight(micro_param_string.keep_original_weight, micro_param) != RET_OK) { + MS_LOG(ERROR) << "Parse keep_original_weight val; " << micro_param_string.keep_original_weight; + return RET_INPUT_PARAM_INVALID; + } } if (!micro_param_string.changeable_weights_name.empty() && !micro_param->keep_original_weight) { MS_LOG(ERROR) << "When changeable_weights_name is set, the keep_original_weight must be true."; @@ -157,6 +234,12 @@ STATUS MicroParamParser::ParseMicroParam(const MicroParamString µ_param_str MS_LOG(ERROR) << "Parse changeable_weights_name failed, the val: " << micro_param_string.changeable_weights_name; return RET_INPUT_PARAM_INVALID; } + if (ParseGraphInputsShapeTemplate(micro_param_string.inputs_shape, micro_param_string.dynamic_symbols_map, + micro_param) != RET_OK) { + MS_LOG(ERROR) << "Parse inputs_shape & dynamic_dim_params failed, the inputs_shape val: " + << micro_param_string.inputs_shape; + return RET_INPUT_PARAM_INVALID; + } return RET_OK; } } // namespace lite diff --git a/mindspore/lite/tools/converter/config_parser/micro_param_parser.h b/mindspore/lite/tools/converter/config_parser/micro_param_parser.h index b6efb4c7..eb95c571 100644 --- a/mindspore/lite/tools/converter/config_parser/micro_param_parser.h +++ b/mindspore/lite/tools/converter/config_parser/micro_param_parser.h @@ -37,6 +37,9 @@ class MicroParamParser { STATUS ParseProjName(const std::string &debug_mode, micro::MicroParam *micro_param); STATUS ParseKeepOriginalWeight(const std::string &keep_weight, micro::MicroParam *micro_param); STATUS ParseChangeableWeightsName(const std::string &changeable_weights_name, micro::MicroParam *micro_param); + STATUS ParseGraphInputsShapeTemplate(const std::string &graph_inputs_shape_template, + const std::map &dynamic_symbols_map, + micro::MicroParam *micro_param); }; } // namespace lite } // namespace mindspore diff --git a/mindspore/lite/tools/converter/converter.cc 
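The parsers above turn the micro config templates into concrete shapes: inputs_shape declares symbolic dimensions per input (e.g. input1:[d0,d1,3];input2:[4,d0]) and dynamic_dim_params lists the values each symbol may take (e.g. d0:[1,3~6];d1:[1~8], where ~ denotes an inclusive range). Every combination of symbol values becomes one shape "scene", decoded from a single scene index via the mixed-radix weights post_multi. A self-contained sketch of that enumeration, with hypothetical names and the example values taken from the comments above:

#include <cstddef>
#include <map>
#include <string>
#include <vector>

// Expand a shape template such as {"d0", "d1", "3"} into one concrete shape per scene.
std::vector<std::vector<int>> EnumerateShapeScenes(const std::vector<std::string> &shape_template,
                                                   const std::map<std::string, std::vector<int>> &symbol_values) {
  std::vector<std::string> symbols;
  std::vector<size_t> counts;
  for (const auto &kv : symbol_values) {
    symbols.push_back(kv.first);
    counts.push_back(kv.second.size());
  }
  // Mixed-radix weights: post_multi[j] is how many scenes one step of symbol j spans.
  std::vector<size_t> post_multi(symbols.size(), 1);
  for (int i = static_cast<int>(symbols.size()) - 2; i >= 0; --i) {
    post_multi[i] = post_multi[i + 1] * counts[i + 1];
  }
  size_t scene_num = 1;
  for (size_t c : counts) {
    scene_num *= c;
  }
  std::vector<std::vector<int>> shapes;
  for (size_t scene = 0; scene < scene_num; ++scene) {
    size_t remain = scene;
    std::map<std::string, int> chosen;
    for (size_t j = 0; j < symbols.size(); ++j) {  // decode the scene index into one value per symbol
      chosen[symbols[j]] = symbol_values.at(symbols[j])[remain / post_multi[j]];
      remain %= post_multi[j];
    }
    std::vector<int> shape;
    for (const auto &dim : shape_template) {
      shape.push_back(chosen.count(dim) != 0 ? chosen[dim] : std::stoi(dim));
    }
    shapes.push_back(shape);
  }
  return shapes;
}

// With d0 in {1,3,4,5,6} and d1 in {1..8}, the template {"d0","d1","3"} expands to 5 * 8 = 40 shapes.
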
b/mindspore/lite/tools/converter/converter.cc index a61bd51c..4703e889 100644 --- a/mindspore/lite/tools/converter/converter.cc +++ b/mindspore/lite/tools/converter/converter.cc @@ -56,6 +56,7 @@ #include "src/common/file_utils.h" #include "ops/dynamic_shape.h" #include "tools/common/parse_config_utils.h" +#include "src/common/file_utils.h" #include "tools/converter/converter_packed_node.h" #include "tools/converter/config_parser/cpu_option_param_parser.h" #include "tools/converter/export_model.h" @@ -432,54 +433,34 @@ int ConverterImpl::InitConfigParam(const std::shared_ptr ¶m, MS_LOG(ERROR) << "Parse config param failed."; return ret; } - ret = ParseParam(&config_parser, param, model_param_infos, maps); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Parse param failed."; - return ret; - } - return RET_OK; -} - -int ConverterImpl::ParseParam(lite::ConfigFileParser *config_parser, const std::shared_ptr ¶m, - const std::map> *model_param_infos, - const std::map> maps) { - param->config_infos = maps; - auto ret = RET_OK; if (model_param_infos->empty()) { - ret = - lite::PreprocessParser::ParsePreprocess(config_parser->GetDataPreProcessString(), ¶m->dataPreProcessParam); + ret = lite::PreprocessParser::ParsePreprocess(config_parser.GetDataPreProcessString(), ¶m->dataPreProcessParam); if (ret != RET_OK) { MS_LOG(ERROR) << "Parse preprocess failed."; return ret; } - ret = lite::QuantParamParser::ParseCommonQuant(config_parser->GetCommonQuantString(), ¶m->commonQuantParam); + ret = lite::QuantParamParser::ParseCommonQuant(config_parser.GetCommonQuantString(), ¶m->commonQuantParam); if (ret != RET_OK) { MS_LOG(ERROR) << "Parse common quant param failed."; return ret; } - ret = lite::QuantParamParser::ParseFullQuant(config_parser->GetFullQuantString(), ¶m->fullQuantParam); + ret = lite::QuantParamParser::ParseFullQuant(config_parser.GetFullQuantString(), ¶m->fullQuantParam); if (ret != RET_OK) { MS_LOG(ERROR) << "Parse full quant param failed."; return ret; } - ret = lite::QuantParamParser::ParseWeightQuant(config_parser->GetWeightQuantString(), ¶m->weightQuantParam); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Parse full quant param failed."; - return ret; - } - ret = lite::QuantParamParser::ParseMixedBitWeightQuant(config_parser->GetMixedBitWeightQuantString(), + ret = lite::QuantParamParser::ParseMixedBitWeightQuant(config_parser.GetMixedBitWeightQuantString(), ¶m->mixedBitWeightQuantParam); if (ret != RET_OK) { MS_LOG(ERROR) << "Parse mixed bit weight quant param failed."; return ret; } - ret = lite::ThirdPartyParamParser::Parse(config_parser->GetThirdPartyModelString(), - ¶m->thirdPartyModelParam); + ret = lite::ThirdPartyParamParser::Parse(config_parser.GetThirdPartyModelString(), ¶m->thirdPartyModelParam); if (ret != RET_OK) { MS_LOG(ERROR) << "Parse third party param failed."; return ret; } - ret = InitExtendedIntegrationInfo(param, *config_parser); + ret = InitExtendedIntegrationInfo(param, config_parser); if (ret != RET_OK) { MS_LOG(ERROR) << "Parse extended integration info failed."; return ret; @@ -490,7 +471,7 @@ int ConverterImpl::ParseParam(lite::ConfigFileParser *config_parser, const std:: param->aclModelOptionCfgParam.dump_model_name = dir_pos != std::string::npos ? 
output_file.substr(dir_pos + 1) : output_file; lite::AclOptionParamParser acl_param_parser; - ret = acl_param_parser.ParseAclOptionCfg(config_parser->GetAclOptionCfgString(), ¶m->aclModelOptionCfgParam); + ret = acl_param_parser.ParseAclOptionCfg(config_parser.GetAclOptionCfgString(), ¶m->aclModelOptionCfgParam); if (ret != RET_OK) { MS_LOG(ERROR) << "Parse acl option param failed."; return ret; @@ -498,14 +479,14 @@ int ConverterImpl::ParseParam(lite::ConfigFileParser *config_parser, const std:: // parse ascend_context in config file, the priority is higher if (maps.find("ascend_context") != maps.end()) { auto map = maps.at("ascend_context"); - config_parser->SetParamByConfigfile(param, map); + config_parser.SetParamByConfigfile(param, map); } if (!param->config_file.empty()) { (void)CheckOfflineParallelConfig(param->config_file, ¶m->parallel_split_config); } lite::CpuOptionParamParser cpu_param_parser; - ret = cpu_param_parser.ParseCpuOptionCfg(config_parser->GetCpuOptionCfgString(), ¶m->cpuOptionCfgParam); + ret = cpu_param_parser.ParseCpuOptionCfg(config_parser.GetCpuOptionCfgString(), ¶m->cpuOptionCfgParam); if (ret != RET_OK) { MS_LOG(ERROR) << "Parse cpu option param failed."; return ret; @@ -515,29 +496,29 @@ int ConverterImpl::ParseParam(lite::ConfigFileParser *config_parser, const std:: << "If there are multi models, only support micro_param and model_param, other configure can not take effect"; lite::MicroParamParser micro_param_parser; - ret = micro_param_parser.ParseMicroParam(config_parser->GetMicroParamString(), ¶m->microParam); + ret = micro_param_parser.ParseMicroParam(config_parser.GetMicroParamString(), ¶m->microParam); if (ret != RET_OK) { MS_LOG(ERROR) << "Parse micro param failed."; return ret; } ret = - lite::QuantParamParser::ParseTransformQuant(config_parser->GetTransformQuantString(), ¶m->transformQuantParam); + lite::QuantParamParser::ParseTransformQuant(config_parser.GetTransformQuantString(), ¶m->transformQuantParam); if (ret != RET_OK) { MS_LOG(ERROR) << "Parse transform quant param failed."; return ret; } - ret = lite::QuantParamParser::ParseAscendQuant(config_parser->GetAscendQuantString(), ¶m->ascendQuantParam); + ret = lite::QuantParamParser::ParseAscendQuant(config_parser.GetAscendQuantString(), ¶m->ascendQuantParam); if (ret != RET_OK) { MS_LOG(ERROR) << "Parse ascend quant param failed."; return ret; } - ret = lite::QuantParamParser::ParseDynamicQuant(config_parser->GetDynamicQuantString(), ¶m->dynamicQuantParam); + ret = lite::QuantParamParser::ParseDynamicQuant(config_parser.GetDynamicQuantString(), ¶m->dynamicQuantParam); if (ret != RET_OK) { MS_LOG(ERROR) << "Parse dynamic quant param failed."; return ret; } lite::GraphKernelParamParser graph_kernel_parser; - ret = graph_kernel_parser.ParseGraphKernelCfg(config_parser->GetGraphKernelString(), ¶m->graphKernelParam); + ret = graph_kernel_parser.ParseGraphKernelCfg(config_parser.GetGraphKernelString(), ¶m->graphKernelParam); if (ret != RET_OK) { MS_LOG(ERROR) << "Parse graph kernel param failed."; return ret; @@ -708,9 +689,9 @@ int CheckFmkType(const std::shared_ptr ¶m) { if (param != nullptr) { return RET_OK; } - std::set kValidFmkTypes = {FmkType::kFmkTypeTf, FmkType::kFmkTypeCaffe, FmkType::kFmkTypeOnnx, - FmkType::kFmkTypeMs, FmkType::kFmkTypeTflite, FmkType::kFmkTypePytorch, - FmkType::kFmkTypeMsLite, FmkType::kFmkTypeThirdParty}; + std::set kValidFmkTypes = {FmkType::kFmkTypeTf, FmkType::kFmkTypeCaffe, FmkType::kFmkTypeOnnx, + FmkType::kFmkTypeMs, FmkType::kFmkTypeTflite, 
FmkType::kFmkTypePytorch, + FmkType::kFmkTypeMsLite, FmkType::kFmkTypeThirdParty}; if (kValidFmkTypes.find(param->fmk_type) == kValidFmkTypes.end()) { MS_LOG(ERROR) << "INPUT ILLEGAL: fmk_type must be " "TF|CAFFE|ONNX|MS|TFLITE|PYTORCH|MSLITE|THIRDPARTY" @@ -1010,7 +991,6 @@ int ConverterImpl::Convert(const std::shared_ptr ¶m, void **m model_index++; } } - return RET_OK; } @@ -1045,7 +1025,6 @@ int ConverterImpl::HandleGraphCommon(const std::shared_ptr ¶m MS_LOG(ERROR) << "Save graph failed: " << ret << " " << GetErrorInfo(ret); return ret; } - return RET_OK; } @@ -1067,8 +1046,8 @@ int ConverterImpl::ExecuteMicro(const schema::MetaGraphT *meta_graph, const std: } auto status = meta_graph != nullptr - ? micro::Coder::MicroSourceCodeGeneration(*meta_graph, output_path, param->microParam, param->weight_fp16) - : micro::Coder::MicroSourceCodeGeneration(param->model_file, output_path, param->microParam, param->weight_fp16); + ? micro::Coder::MicroSourceCodeGeneration(*meta_graph, output_path, ¶m->microParam, param->weight_fp16) + : micro::Coder::MicroSourceCodeGeneration(param->model_file, output_path, ¶m->microParam, param->weight_fp16); if (status != RET_OK) { MS_LOG(ERROR) << "Execute Micro failed."; } @@ -1123,7 +1102,6 @@ int ConverterImpl::SaveGraph(FuncGraphPtr graph, const std::shared_ptrUpdateReturnCode(RET_ERROR); return RET_ERROR; } + auto to_custom_op_pass = std::make_shared(); + MS_CHECK_TRUE_MSG(to_custom_op_pass != nullptr, RET_NULL_PTR, "to_custom_op_pass is nullptr."); + if (!to_custom_op_pass->Run(func_graph)) { + MS_LOG(ERROR) << "To custom op pass run failed!"; + ReturnCode::GetSingleReturnCode()->UpdateReturnCode(RET_ERROR); + return RET_ERROR; + } return RET_OK; } diff --git a/mindspore/lite/tools/converter/import/to_custom_op_pass.cc b/mindspore/lite/tools/converter/import/to_custom_op_pass.cc new file mode 100644 index 00000000..55e524e6 --- /dev/null +++ b/mindspore/lite/tools/converter/import/to_custom_op_pass.cc @@ -0,0 +1,86 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "tools/converter/import/to_custom_op_pass.h" +#include "ops/grad/gather_d_grad_v2.h" +#include "ops/masked_fill.h" +#include "ops/custom.h" +#include "ops/op_utils.h" +#include "mindspore/ccsrc/include/common/utils/utils.h" +#include "nnacl/custom_gather_d_grad_v2_parameter.h" + +using mindspore::ops::kNameGatherDGradV2; +using mindspore::ops::kNameMaskedFill; + +namespace mindspore { +namespace opt { +bool ToCustomOpPass::Run(const FuncGraphPtr &graph) { + MS_ASSERT(graph != nullptr); + auto manager = graph->manager(); + MS_ASSERT(manager != nullptr); + auto node_list = TopoSort(graph->get_return()); + + for (auto &node : node_list) { + if (!utils::isa(node)) { + continue; + } + auto cnode = node->cast(); + MS_ASSERT(cnode != nullptr); + auto value_node = cnode->input(0); + auto prim = GetValueNode(value_node); + if (prim == nullptr) { + MS_LOG(DEBUG) << "this is a call cnode, which input[0] is fg."; + continue; + } + + auto func = ToCustomOpRegistry::GetInstance()->GetToCustomOpFunc(prim->name()); + if (func == nullptr) { + continue; + } + + auto ret = func(cnode); + if (ret != RET_OK) { + MS_LOG(ERROR) << "failed to convert normal cnode node to custom cnode"; + return false; + } + } + return true; +} + +int GatherDGradV2ToCustomOp(const CNodePtr &cnode) { + auto ori_prim = ops::GetOperator(cnode->input(kAnfPrimitiveIndex)); + auto dim = ori_prim->get_dim(); + auto dim_str = std::to_string(dim); + std::map> attrs; + attrs["dim"] = std::vector(dim_str.begin(), dim_str.end()); + auto custom_prim = std::make_shared(); + custom_prim->set_type(kNameGatherDGradV2); + cnode->set_input(kAnfPrimitiveIndex, NewValueNode(custom_prim->GetPrim())); + custom_prim->set_attr(attrs); + return RET_OK; +} + +int MaskedFillToCustomOp(const CNodePtr &cnode) { + auto custom_prim = std::make_shared(); + custom_prim->set_type(kNameMaskedFill); + cnode->set_input(kAnfPrimitiveIndex, NewValueNode(custom_prim->GetPrim())); + return RET_OK; +} + +REGISTER_TO_CUSTOM_OP(kNameGatherDGradV2, GatherDGradV2ToCustomOp); +REGISTER_TO_CUSTOM_OP(kNameMaskedFill, MaskedFillToCustomOp); +} // namespace opt +} // namespace mindspore diff --git a/mindspore/lite/tools/converter/import/to_custom_op_pass.h b/mindspore/lite/tools/converter/import/to_custom_op_pass.h new file mode 100644 index 00000000..7108e48b --- /dev/null +++ b/mindspore/lite/tools/converter/import/to_custom_op_pass.h @@ -0,0 +1,68 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_IMPORT_TO_CUSTOM_OP_PASS_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_IMPORT_TO_CUSTOM_OP_PASS_H_ +#include +#include "backend/common/optimizer/pass.h" +#include "tools/optimizer/common/gllo_utils.h" + +namespace mindspore { +namespace opt { + +typedef int (*ToCustomOpFunc)(const CNodePtr &cnode); +class ToCustomOpRegistry { + public: + static ToCustomOpRegistry *GetInstance() { + static ToCustomOpRegistry registry; + return ®istry; + } + + void InsertToCustomOpMap(const std::string &key, ToCustomOpFunc creator) { to_custom_op_funcs_[key] = creator; } + + ToCustomOpFunc GetToCustomOpFunc(const std::string &key) { + if (to_custom_op_funcs_.find(key) != to_custom_op_funcs_.end()) { + return to_custom_op_funcs_[key]; + } else { + MS_LOG(DEBUG) << "Unsupported primitive type : " << key; + return nullptr; + } + } + + protected: + std::map to_custom_op_funcs_; +}; + +class RegistryToCustomOp { + public: + RegistryToCustomOp(const std::string &key, ToCustomOpFunc creator) { + ToCustomOpRegistry::GetInstance()->InsertToCustomOpMap(key, creator); + } + virtual ~RegistryToCustomOp() = default; +}; + +#define REGISTER_TO_CUSTOM_OP(type, to_custom_op_func) \ + RegistryToCustomOp g_##type##_to_custom_op(type, to_custom_op_func); + +class ToCustomOpPass : public Pass { + public: + ToCustomOpPass() : Pass("ToCustomOpPass") {} + ~ToCustomOpPass() = default; + bool Run(const FuncGraphPtr &graph) override; +}; +} // namespace opt +} // namespace mindspore +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_IMPORT_TO_CUSTOM_OP_PASS_H_ diff --git a/mindspore/lite/tools/converter/legacy_optimizer/fusion/fusion_pass.cc b/mindspore/lite/tools/converter/legacy_optimizer/fusion/fusion_pass.cc index 8ea838cf..a551196d 100644 --- a/mindspore/lite/tools/converter/legacy_optimizer/fusion/fusion_pass.cc +++ b/mindspore/lite/tools/converter/legacy_optimizer/fusion/fusion_pass.cc @@ -287,7 +287,6 @@ bool FusionPass::MatchTree(const schema::MetaGraphT &graph, size_t nodeIdx, cons bool FusionPass::CheckMatchParams(const schema::MetaGraphT &graph, size_t nodeIdx, const std::shared_ptr &target, const std::vector &sinkIdes, const std::vector &pathSinkIdes) { - MS_ASSERT(target != nullptr); MS_ASSERT(nodeIdx < graph.nodes.size()); auto &scope = graph.nodes.at(nodeIdx); MS_CHECK_TRUE_MSG(scope != nullptr, false, "Node in graph is nullptr"); diff --git a/mindspore/lite/tools/converter/legacy_optimizer/graph/infershape_pass.cc b/mindspore/lite/tools/converter/legacy_optimizer/graph/infershape_pass.cc index 371e93fb..ff99f1f4 100644 --- a/mindspore/lite/tools/converter/legacy_optimizer/graph/infershape_pass.cc +++ b/mindspore/lite/tools/converter/legacy_optimizer/graph/infershape_pass.cc @@ -660,7 +660,9 @@ int InferShapePass::InitSearchTensor(const int64_t &subgraph_index, MetaGraphT * } auto &subgraph = graph->subGraph.at(subgraph_index); for (uint32_t i = 0; i < tensors_.size(); i++) { - if (IsContain(subgraph->inputIndices, i) || !graph->allTensors.at(i)->data.empty()) { + if (IsContain(subgraph->inputIndices, i) || !graph->allTensors.at(i)->data.empty() || + (graph->allTensors.at(i)->nodeType == NodeType_ValueNode && graph->allTensors.at(i)->dims.size() == 1 && + graph->allTensors.at(i)->dims[0] == 0)) { tensors_[i].is_inferred_ = true; } } diff --git a/mindspore/lite/tools/converter/micro/cmake/file_list.cmake b/mindspore/lite/tools/converter/micro/cmake/file_list.cmake index c132460e..5dcf0bb7 100644 --- a/mindspore/lite/tools/converter/micro/cmake/file_list.cmake +++ 
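The new to_custom_op_pass above routes each supported primitive (GatherDGradV2, MaskedFill) through a converter looked up in a static registry; REGISTER_TO_CUSTOM_OP creates a file-scope object whose constructor performs the registration, and the GatherDGradV2 converter additionally serializes its dim attribute into the Custom op's byte-vector attrs. A stand-alone sketch of that self-registration pattern, with hypothetical names in place of CNodePtr/ToCustomOpFunc:

#include <functional>
#include <iostream>
#include <map>
#include <string>
#include <utility>

using ConvertFunc = std::function<int(const std::string & /*node*/)>;

class ConvertRegistry {
 public:
  static ConvertRegistry *GetInstance() {
    static ConvertRegistry registry;
    return &registry;
  }
  void Insert(const std::string &key, ConvertFunc func) { funcs_[key] = std::move(func); }
  ConvertFunc Get(const std::string &key) const {
    auto it = funcs_.find(key);
    if (it == funcs_.end()) {
      return nullptr;  // empty std::function signals "no converter registered"
    }
    return it->second;
  }

 private:
  std::map<std::string, ConvertFunc> funcs_;
};

struct Registrar {
  Registrar(const std::string &key, ConvertFunc func) {
    ConvertRegistry::GetInstance()->Insert(key, std::move(func));
  }
};

// A file-scope Registrar runs before main(), so each op file can register itself,
// mirroring how the GatherDGradV2 and MaskedFill converters register above.
static Registrar g_demo_reg("DemoOp", [](const std::string &node) {
  std::cout << "converting " << node << " to a custom op\n";
  return 0;
});

int main() {
  auto func = ConvertRegistry::GetInstance()->Get("DemoOp");
  return func ? func("node_1") : 1;
}
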
b/mindspore/lite/tools/converter/micro/cmake/file_list.cmake @@ -4,6 +4,8 @@ set(CODER_SRC ${MICRO_DIR}/coder/context.cc ${MICRO_DIR}/coder/graph.cc ${MICRO_DIR}/coder/session.cc + ${MICRO_DIR}/coder/shape_info_container.cc + ${MICRO_DIR}/coder/dynamic_mem_manager.cc ${MICRO_DIR}/coder/utils/coder_utils.cc ${MICRO_DIR}/coder/utils/dir_utils.cc ${MICRO_DIR}/coder/utils/train_utils.cc @@ -23,6 +25,7 @@ set(CODER_ALLOCATOR_SRC set(CODER_GENERATOR_SRC ${MICRO_DIR}/coder/generator/generator.cc ${MICRO_DIR}/coder/generator/inference/inference_generator.cc + ${MICRO_DIR}/coder/generator/component/allocator_component.cc ${MICRO_DIR}/coder/generator/component/common_component.cc ${MICRO_DIR}/coder/generator/component/weight_component.cc ${MICRO_DIR}/coder/generator/component/allocator_component.cc @@ -66,6 +69,8 @@ set(CODER_OPCODERS_SRC ${MICRO_DIR}/coder/opcoders/base/stack_base_coder.cc ${MICRO_DIR}/coder/opcoders/base/unstack_base_coder.cc ${MICRO_DIR}/coder/opcoders/base/strided_slice_base_coder.cc + ${MICRO_DIR}/coder/opcoders/base/reshape_dynamic_base_coder.cc + ${MICRO_DIR}/coder/opcoders/base/strided_slice_dynamic_base_coder.cc #### cmsis int8 coder ${MICRO_DIR}/coder/opcoders/cmsis-nn/int8/add_int8_coder.cc ${MICRO_DIR}/coder/opcoders/cmsis-nn/int8/conv2d_base_coder.cc @@ -81,23 +86,37 @@ set(CODER_OPCODERS_SRC ${MICRO_DIR}/coder/opcoders/nnacl/fp16/arithmetic_fp16_coder.cc ${MICRO_DIR}/coder/opcoders/nnacl/fp16/avg_pooling_fp16_coder.cc ${MICRO_DIR}/coder/opcoders/nnacl/fp16/concat_fp16_coder.cc + ${MICRO_DIR}/coder/opcoders/nnacl/fp16/conv2d_delegate_fp16_coder.cc + ${MICRO_DIR}/coder/opcoders/nnacl/fp16/conv_depthwise_3x3_fp16_coder.cc + ${MICRO_DIR}/coder/opcoders/nnacl/fp16/conv_depthwise_fp16_coder.cc + ${MICRO_DIR}/coder/opcoders/nnacl/fp16/conv_depthwise_sw_fp16_coder.cc + ${MICRO_DIR}/coder/opcoders/nnacl/fp16/convolution_1x1_fp16_coder.cc + ${MICRO_DIR}/coder/opcoders/nnacl/fp16/convolution_fp16_coder.cc + ${MICRO_DIR}/coder/opcoders/nnacl/fp16/convolution_winograd_fp16_coder.cc ${MICRO_DIR}/coder/opcoders/nnacl/fp16/custom_gru_fp16_coder.cc ${MICRO_DIR}/coder/opcoders/nnacl/fp16/deconv2d_fp16_coder.cc + ${MICRO_DIR}/coder/opcoders/nnacl/fp16/lstm_fp16_coder.cc + ${MICRO_DIR}/coder/opcoders/nnacl/fp16/matmul_fp16_base_coder.cc + ${MICRO_DIR}/coder/opcoders/nnacl/fp16/matmul_fp16_coder.cc ${MICRO_DIR}/coder/opcoders/nnacl/fp16/resize_fp16_coder.cc ${MICRO_DIR}/coder/opcoders/nnacl/fp16/transpose_fp16_coder.cc ${MICRO_DIR}/coder/opcoders/nnacl/fp16/layernorm_fp16_coder.cc ${MICRO_DIR}/coder/opcoders/nnacl/fp16/reduce_fp16_coder.cc ${MICRO_DIR}/coder/opcoders/nnacl/fp16/resize_fp16_coder.cc - ${MICRO_DIR}/coder/opcoders/nnacl/fp16/matmul_fp16_base_coder.cc - ${MICRO_DIR}/coder/opcoders/nnacl/fp16/matmul_fp16_coder.cc - ${MICRO_DIR}/coder/opcoders/nnacl/fp16/conv2d_delegate_fp16_coder.cc - ${MICRO_DIR}/coder/opcoders/nnacl/fp16/conv_depthwise_fp16_coder.cc - ${MICRO_DIR}/coder/opcoders/nnacl/fp16/convolution_fp16_coder.cc - ${MICRO_DIR}/coder/opcoders/nnacl/fp16/convolution_winograd_fp16_coder.cc - ${MICRO_DIR}/coder/opcoders/nnacl/fp16/convolution_1x1_fp16_coder.cc - ${MICRO_DIR}/coder/opcoders/nnacl/fp16/conv_depthwise_3x3_fp16_coder.cc - ${MICRO_DIR}/coder/opcoders/nnacl/fp16/conv_depthwise_sw_fp16_coder.cc - ${MICRO_DIR}/coder/opcoders/nnacl/fp16/lstm_fp16_coder.cc + ${MICRO_DIR}/coder/opcoders/nnacl/fp16/activation_dynamic_fp16_coder.cc + ${MICRO_DIR}/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_base_coder.cc + 
${MICRO_DIR}/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_coder.cc + ${MICRO_DIR}/coder/opcoders/nnacl/fp16/lstm_mindir_dynamic_fp16_coder.cc + ${MICRO_DIR}/coder/opcoders/nnacl/fp16/transpose_dynamic_fp16_coder.cc + ${MICRO_DIR}/coder/opcoders/nnacl/fp16/concat_dynamic_fp16_coder.cc + ${MICRO_DIR}/coder/opcoders/nnacl/fp16/slice_dynamic_fp16_coder.cc + ${MICRO_DIR}/coder/opcoders/nnacl/fp16/softmax_dynamic_fp16_coder.cc + ${MICRO_DIR}/coder/opcoders/nnacl/fp16/arithmetic_dynamic_fp16_coder.cc + ${MICRO_DIR}/coder/opcoders/nnacl/fp16/scale_dynamic_fp16_coder.cc + ${MICRO_DIR}/coder/opcoders/nnacl/fp16/conv2d_delegate_dynamic_fp16_coder.cc + ${MICRO_DIR}/coder/opcoders/nnacl/fp16/convolution_dynamic_fp16_coder.cc + ${MICRO_DIR}/coder/opcoders/nnacl/fp16/convolution_1x1_dynamic_fp16_coder.cc + ${MICRO_DIR}/coder/opcoders/nnacl/fp16/pooling_dynamic_fp16_coder.cc #### nnacl fp32 coder ${MICRO_DIR}/coder/opcoders/nnacl/fp32/activation_fp32_coder.cc ${MICRO_DIR}/coder/opcoders/nnacl/fp32/addn_fp32_coder.cc @@ -122,6 +141,7 @@ set(CODER_OPCODERS_SRC ${MICRO_DIR}/coder/opcoders/nnacl/fp32/lstm_fp32_coder.cc ${MICRO_DIR}/coder/opcoders/nnacl/fp32/matmul_fp32_base_coder.cc ${MICRO_DIR}/coder/opcoders/nnacl/fp32/matmul_fp32_coder.cc + ${MICRO_DIR}/coder/opcoders/nnacl/fp32/ones_like_fp32_coder.cc ${MICRO_DIR}/coder/opcoders/nnacl/fp32/pad_fp32_coder.cc ${MICRO_DIR}/coder/opcoders/nnacl/fp32/pooling_fp32_coder.cc ${MICRO_DIR}/coder/opcoders/nnacl/fp32/power_fp32_coder.cc @@ -133,17 +153,14 @@ set(CODER_OPCODERS_SRC ${MICRO_DIR}/coder/opcoders/nnacl/fp32/transpose_fp32_coder.cc ${MICRO_DIR}/coder/opcoders/nnacl/fp32/splice_fp32_coder.cc ${MICRO_DIR}/coder/opcoders/nnacl/fp32/exp_fp32_coder.cc + ${MICRO_DIR}/coder/opcoders/nnacl/fp32/fill_fp32_coder.cc ${MICRO_DIR}/coder/opcoders/nnacl/fp32/deconv2d_fp32_coder.cc ${MICRO_DIR}/coder/opcoders/nnacl/fp32/prelu_fp32_coder.cc ${MICRO_DIR}/coder/opcoders/nnacl/fp32/layernorm_fp32_coder.cc - ${MICRO_DIR}/coder/opcoders/nnacl/fp32/ones_like_fp32_coder.cc - ${MICRO_DIR}/coder/opcoders/nnacl/fp32/fill_fp32_coder.cc - #### nnacl fp32_grad coder - ${MICRO_DIR}/coder/opcoders/nnacl/fp32_grad/activation_grad_coder.cc - ${MICRO_DIR}/coder/opcoders/nnacl/fp32_grad/adam_coder.cc - ${MICRO_DIR}/coder/opcoders/nnacl/fp32_grad/assign_coder.cc - ${MICRO_DIR}/coder/opcoders/nnacl/fp32_grad/biasadd_grad_coder.cc - ${MICRO_DIR}/coder/opcoders/nnacl/fp32_grad/softmax_cross_entropy_with_logits_coder.cc + ${MICRO_DIR}/coder/opcoders/nnacl/fp32/gather_dynamic_fp32_coder.cc + ${MICRO_DIR}/coder/opcoders/nnacl/fp32/activation_dynamic_fp32_coder.cc + ${MICRO_DIR}/coder/opcoders/nnacl/fp32/transpose_dynamic_fp32_coder.cc + ${MICRO_DIR}/coder/opcoders/nnacl/fp32/split_dynamic_fp32_coder.cc #### nnacl int8 coder ${MICRO_DIR}/coder/opcoders/nnacl/int8/activation_int8_coder.cc ${MICRO_DIR}/coder/opcoders/nnacl/int8/affine_int8_coder.cc diff --git a/mindspore/lite/tools/converter/micro/coder/coder.cc b/mindspore/lite/tools/converter/micro/coder/coder.cc index cc224ae5..a502500d 100644 --- a/mindspore/lite/tools/converter/micro/coder/coder.cc +++ b/mindspore/lite/tools/converter/micro/coder/coder.cc @@ -42,6 +42,34 @@ std::shared_ptr CreateCoderSession() { } return session; } + +int ParseMicroDynamicShape(const schema::MetaGraphT &graph, micro::MicroParam *micro_param) { + for (auto index : graph.inputIndex) { + auto input_name = graph.allTensors.at(index)->name; + if (micro_param->graph_inputs_origin_info.find(input_name) == micro_param->graph_inputs_origin_info.end() || + 
micro_param->inputs_shape_by_scenes.find(input_name) == micro_param->inputs_shape_by_scenes.end()) { + MS_LOG(ERROR) << "Micro param: dynamic inputs name is invalid"; + return RET_INPUT_PARAM_INVALID; + } + micro_param->graph_inputs_template.emplace_back(micro_param->graph_inputs_origin_info[input_name]); + micro_param->graph_inputs_shape_infos.emplace_back(micro_param->inputs_shape_by_scenes[input_name]); + } + return RET_OK; +} + +int ParseMicroDynamicShape(const Model &model, micro::MicroParam *micro_param) { + for (auto index : model.graph_.input_indices_) { + auto input_name = model.graph_.all_tensors_.at(index)->name()->str(); + if (micro_param->graph_inputs_origin_info.find(input_name) == micro_param->graph_inputs_origin_info.end() || + micro_param->inputs_shape_by_scenes.find(input_name) == micro_param->inputs_shape_by_scenes.end()) { + MS_LOG(ERROR) << "Micro param: dynamic inputs name is invalid"; + return RET_INPUT_PARAM_INVALID; + } + micro_param->graph_inputs_template.emplace_back(micro_param->graph_inputs_origin_info[input_name]); + micro_param->graph_inputs_shape_infos.emplace_back(micro_param->inputs_shape_by_scenes[input_name]); + } + return RET_OK; +} } // namespace int Coder::Run(const void *model_buff, size_t size, const std::string &model_name, bool end_flag, bool enable_fp16) { session_ = CreateCoderSession(); @@ -109,29 +137,37 @@ bool Coder::InitPath(const std::string &output_path) { return true; } -int Coder::MicroSourceCodeGeneration(const schema::MetaGraphT &graph, const std::string &output_path, - const MicroParam ¶m, bool enable_fp16) { +int Coder::MicroSourceCodeGeneration(const schema::MetaGraphT &graph, const std::string &output_path, MicroParam *param, + bool enable_fp16) { flatbuffers::FlatBufferBuilder builder(kFlatbuffersBuilderInitSize); auto offset = schema::MetaGraph::Pack(builder, &graph); builder.Finish(offset); schema::FinishMetaGraphBuffer(builder, offset); size_t size = builder.GetSize(); - if (ExecuteMicroGeneration(builder.GetBufferPointer(), size, output_path, param, enable_fp16) != RET_OK) { + if (!param->dynamic_symbols.empty()) { + MS_CHECK_TRUE_MSG(ParseMicroDynamicShape(graph, param) == RET_OK, RET_ERROR, "ParseMicroDynamicShape failed."); + } + if (ExecuteMicroGeneration(builder.GetBufferPointer(), size, output_path, *param, enable_fp16) != RET_OK) { MS_LOG(ERROR) << "Execute Micro failed."; return RET_ERROR; } return RET_OK; } -int Coder::MicroSourceCodeGeneration(const std::string &model_file, const std::string &output_path, - const MicroParam ¶m, bool enable_fp16) { +int Coder::MicroSourceCodeGeneration(const std::string &model_file, const std::string &output_path, MicroParam *param, + bool enable_fp16) { size_t buffer_size; auto model_buf = lite::ReadFile(model_file.c_str(), &buffer_size); if (model_buf == nullptr) { MS_LOG(ERROR) << "Read model-file failed."; return RET_NULL_PTR; } - auto ret = ExecuteMicroGeneration(model_buf, buffer_size, output_path, param, enable_fp16); + Model *model = lite::Model::Import(model_buf, buffer_size); + MS_CHECK_PTR(model); + if (!param->dynamic_symbols.empty()) { + MS_CHECK_TRUE_MSG(ParseMicroDynamicShape(*model, param) == RET_OK, RET_ERROR, "ParseMicroDynamicShape failed."); + } + auto ret = ExecuteMicroGeneration(model_buf, buffer_size, output_path, *param, enable_fp16); if (ret != RET_OK) { MS_LOG(ERROR) << "Execute Micro failed."; } @@ -199,6 +235,10 @@ int Coder::Init(const MicroParam ¶m) const { DirectoryGenerator::GetInstance()->project_name()); 
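ParseMicroDynamicShape above rejects dynamic-shape code generation unless every model input name appears in both graph_inputs_origin_info (the symbolic template) and inputs_shape_by_scenes (the concrete per-scene shapes). A simplified sketch of that per-input check, with plain STL types and illustrative names:

#include <map>
#include <string>
#include <vector>

// Returns false if any graph input is missing from the dynamic-shape config.
bool CollectDynamicInputs(const std::vector<std::string> &graph_input_names,
                          const std::map<std::string, std::vector<std::string>> &origin_info,
                          const std::map<std::string, std::vector<std::vector<int>>> &shapes_by_scene,
                          std::vector<std::vector<std::string>> *templates,
                          std::vector<std::vector<std::vector<int>>> *scene_shapes) {
  for (const auto &name : graph_input_names) {
    auto t = origin_info.find(name);
    auto s = shapes_by_scene.find(name);
    if (t == origin_info.end() || s == shapes_by_scene.end()) {
      return false;  // an input named in the model is absent from the config templates
    }
    templates->push_back(t->second);
    scene_shapes->push_back(s->second);
  }
  return true;
}
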
config->set_keep_original_weight(param.keep_original_weight); config->set_changeable_weights_name(param.changeable_weights_name); + config->set_graph_inputs_shape_infos(param.graph_inputs_shape_infos); + config->set_dynamic_symbols(param.dynamic_symbols); + config->set_dynamic_symbols_num(param.dynamic_symbols_num); + config->set_user_graph_inputs_template(param.graph_inputs_template); auto print_parameter = [](auto name, auto value) { MS_LOG(INFO) << std::setw(20) << std::left << name << "= " << value; @@ -209,6 +249,7 @@ int Coder::Init(const MicroParam ¶m) const { print_parameter("codePath", config->code_path()); print_parameter("codeMode", config->code_mode()); print_parameter("debugMode", config->debug_mode()); + print_parameter("keepOriginalWeight", config->keep_original_weight()); return RET_OK; } } // namespace mindspore::lite::micro diff --git a/mindspore/lite/tools/converter/micro/coder/coder.h b/mindspore/lite/tools/converter/micro/coder/coder.h index c360f4c1..fad479aa 100644 --- a/mindspore/lite/tools/converter/micro/coder/coder.h +++ b/mindspore/lite/tools/converter/micro/coder/coder.h @@ -31,9 +31,9 @@ class Coder final { ~Coder() = default; static int MicroSourceCodeGeneration(const schema::MetaGraphT &graph, const std::string &output_path, - const MicroParam ¶m, bool enable_fp16); - static int MicroSourceCodeGeneration(const std::string &model_file, const std::string &output_path, - const MicroParam ¶m, bool enable_fp16); + MicroParam *param, bool enable_fp16); + static int MicroSourceCodeGeneration(const std::string &model_file, const std::string &output_path, MicroParam *param, + bool enable_fp16); private: static int ExecuteMicroGeneration(const void *model_buf, size_t size, const std::string &output_path, diff --git a/mindspore/lite/tools/converter/micro/coder/config.h b/mindspore/lite/tools/converter/micro/coder/config.h index 9be56178..fb90a2fc 100644 --- a/mindspore/lite/tools/converter/micro/coder/config.h +++ b/mindspore/lite/tools/converter/micro/coder/config.h @@ -34,6 +34,12 @@ struct MicroParam { std::string project_name; bool is_last_model{false}; bool keep_original_weight{false}; + std::vector> graph_inputs_template; + std::map> graph_inputs_origin_info; + std::vector dynamic_symbols; + std::vector dynamic_symbols_num; + std::vector>> graph_inputs_shape_infos; + std::map>> inputs_shape_by_scenes; }; class Configurator { @@ -67,6 +73,29 @@ class Configurator { void set_changeable_weights_name(const std::string &weights_name) { changeable_weights_name_ = weights_name; } const std::string &changeable_weights_name() const { return changeable_weights_name_; } + void set_dynamic_shape(bool dynamic_shape) { dynamic_shape_ = dynamic_shape; } + bool dynamic_shape() const { return dynamic_shape_; } + + void set_dynamic_symbols(const std::vector &dynamic_symbols) { dynamic_symbols_ = dynamic_symbols; } + const std::vector &dynamic_symbols() const { return dynamic_symbols_; } + + void set_dynamic_symbols_num(const std::vector &dynamic_symbols_num) { + dynamic_symbols_num_ = dynamic_symbols_num; + } + const std::vector &dynamic_symbols_num() const { return dynamic_symbols_num_; } + + void set_user_graph_inputs_template(const std::vector> &graph_inputs_template) { + user_graph_inputs_template_ = graph_inputs_template; + } + const std::vector> &user_graph_inputs_template() const { + return user_graph_inputs_template_; + } + + void set_graph_inputs_shape_infos(const std::vector>> &graph_inputs_shape_infos) { + graph_inputs_shape_infos_ = graph_inputs_shape_infos; + } + 
const std::vector>> &graph_inputs_shape_infos() { return graph_inputs_shape_infos_; } + private: Configurator() = default; ~Configurator() = default; @@ -76,8 +105,13 @@ class Configurator { bool support_parallel_{false}; bool debug_mode_{false}; bool keep_original_weight_{false}; + bool dynamic_shape_{false}; std::string proj_dir_; std::string changeable_weights_name_; + std::vector dynamic_symbols_; + std::vector dynamic_symbols_num_; + std::vector>> graph_inputs_shape_infos_; + std::vector> user_graph_inputs_template_; }; } // namespace mindspore::lite::micro #endif // MICRO_CODER_CONFIG_H diff --git a/mindspore/lite/tools/converter/micro/coder/context.cc b/mindspore/lite/tools/converter/micro/coder/context.cc index 251b282f..7e7f640e 100644 --- a/mindspore/lite/tools/converter/micro/coder/context.cc +++ b/mindspore/lite/tools/converter/micro/coder/context.cc @@ -50,4 +50,17 @@ std::vector CoderContext::GetInitWeightSizeCode() const { } void CoderContext::AppendInitWeightSizeCode(size_t w_buf_size) { weight_buffer_size_ += w_buf_size; } + +const std::map> &CoderContext::shape_all_scenes() const { + return shape_info_container_->GetShapesWholeScenes(); +} +const std::map> &CoderContext::shape_templates() { + return shape_info_container_->GetWholeTemplateShape(); +} +const std::map> &CoderContext::offset_all_scenes() { + return dynamic_mem_manager_->GetOffsetAllScenes(); +} +const std::vector &CoderContext::buffer_sizes() const { return dynamic_mem_manager_->GetBufferSizes(); } +const std::vector &CoderContext::workspaces() const { return dynamic_mem_manager_->GetWorkSpaces(); } +std::string CoderContext::tensor_addr(const Tensor *tensor) { return dynamic_mem_manager_->GetVarTensorAddr(tensor); } } // namespace mindspore::lite::micro diff --git a/mindspore/lite/tools/converter/micro/coder/context.h b/mindspore/lite/tools/converter/micro/coder/context.h index bad4ab40..b511eac1 100644 --- a/mindspore/lite/tools/converter/micro/coder/context.h +++ b/mindspore/lite/tools/converter/micro/coder/context.h @@ -25,6 +25,8 @@ #include #include #include "src/tensor.h" +#include "tools/converter/micro/coder/shape_info_container.h" +#include "tools/converter/micro/coder/dynamic_mem_manager.h" namespace mindspore::lite::micro { class CoderContext { @@ -146,6 +148,17 @@ class CoderContext { bool end_flag() { return end_flag_; } + void set_shape_info_container(ShapeInfoContainer *shape_info_container) { + shape_info_container_ = shape_info_container; + } + void set_dynamic_mem_manager(DynamicMemManager *dynamic_mem_manager) { dynamic_mem_manager_ = dynamic_mem_manager; } + const std::map> &shape_all_scenes() const; + const std::map> &shape_templates(); + const std::map> &offset_all_scenes(); + const std::vector &buffer_sizes() const; + const std::vector &workspaces() const; + std::string tensor_addr(const Tensor *tensor); + private: std::string model_name_; std::vector graph_inputs_; @@ -195,6 +208,8 @@ class CoderContext { // operator C Lang files list, depended by the net.c. 
it will be add to CMakeLists.txt static std::set c_files_; static size_t max_buffer_size_; + ShapeInfoContainer *shape_info_container_; + DynamicMemManager *dynamic_mem_manager_; }; } // namespace mindspore::lite::micro #endif // MINDSPORE_LITE_MICRO_CODER_CONTEXT_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/dynamic_mem_manager.cc b/mindspore/lite/tools/converter/micro/coder/dynamic_mem_manager.cc new file mode 100644 index 00000000..976bd852 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/dynamic_mem_manager.cc @@ -0,0 +1,116 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "coder/dynamic_mem_manager.h" +#include +#include "coder/allocator/memory_manager.h" +#include "coder/generator/component/component.h" + +namespace mindspore::lite::micro { +int DynamicMemManager::AllocDynamicMem(const std::vector> &nodes, + const std::vector &graph_inputs, + const std::vector &graph_outputs, + const ShapeInfoContainer *shape_info_container) { + MS_CHECK_TRUE_MSG(shape_info_container, RET_NULL_PTR, "ShapeInfoContainer is a nullptr."); + for (size_t i = 0; i < graph_inputs.size(); ++i) { + graph_inputs_.insert(std::make_pair(graph_inputs.at(i), kInputPrefixName + std::to_string(i))); + } + auto var_tensor_shapes = shape_info_container->GetVarTensorInfos(); + MS_CHECK_TRUE_MSG(!var_tensor_shapes.empty(), RET_ERROR, "Cannot get var-tensor's shape-info"); + auto scene_num = var_tensor_shapes.begin()->second.size(); + for (const auto &item : var_tensor_shapes) { + MS_CHECK_TRUE_MSG(item.first, RET_NULL_PTR, "Find a nullptr in shape-infos"); + MS_CHECK_TRUE_MSG(item.second.size() == scene_num, RET_ERROR, "Shape-info is invalid."); + } + for (size_t i = 0; i < scene_num; ++i) { + for (const auto &item : var_tensor_shapes) { + item.first->ResetRefCount(); + item.first->set_shape(item.second[i]); + } + auto ret = AllocDynamicMemCore(nodes, graph_outputs, i); + MS_CHECK_TRUE_MSG(ret == RET_OK, ret, "Alloc dynamic memory failed."); + } + return RET_OK; +} + +int DynamicMemManager::AllocDynamicMemCore(const std::vector> &nodes, + const std::vector &graph_outputs, int scene_index) { + if (offsets_all_scenes_.find(scene_index) != offsets_all_scenes_.end()) { + MS_LOG(ERROR) << "Current scene has been processed."; + return RET_ERROR; + } + auto manager = std::make_unique(); + int ret = manager->AssignMemory(nodes, graph_outputs); + if (ret != RET_OK) { + MS_LOG(ERROR) << "assign memory failed"; + return RET_ERROR; + } + std::map offsets = manager->variables_offset(); + if (offset_index_.empty()) { + int index = 0; + for (auto &item : offsets) { + offset_index_[item.first] = index++; + offsets_all_scenes_[scene_index].push_back(item.second); + } + } else { + MS_CHECK_TRUE_MSG(offsets.size() == offset_index_.size(), RET_ERROR, "Tensors num is not same."); + for (auto &item : offsets) { + MS_CHECK_TRUE_MSG(offset_index_.find(item.first) != offset_index_.end(), RET_ERROR, "Tensor cannot be found."); + 
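The offset bookkeeping above gives each variable tensor a stable slot that is reused across scenes, so switching the active shape scene only swaps the offset table; GetVarTensorAddr (further on) then returns a symbolic expression of the form buffer + offset[slot] rather than a numeric address. A minimal sketch of that address emission, with stand-in names for the generated buffer/offset symbols:

#include <map>
#include <string>

// Sketch only: build the C expression used by the generated code to address a variable tensor.
std::string VarTensorAddr(const std::map<const void *, int> &slot_of_tensor, const void *tensor,
                          const std::string &buffer_name, const std::string &offset_name) {
  auto it = slot_of_tensor.find(tensor);
  if (it == slot_of_tensor.end()) {
    return "";  // not a managed variable tensor (e.g. a graph input bound elsewhere)
  }
  return buffer_name + " + " + offset_name + "[" + std::to_string(it->second) + "]";
}
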
offsets_all_scenes_[scene_index].push_back(item.second); + } + } + buffer_sizes_.push_back(manager->GetAllocatedSize()); + offsets_all_scenes_[scene_index].push_back(manager->GetAllocatedSize()); + return RET_OK; +} + +std::string DynamicMemManager::GetVarTensorAddr(const Tensor *tensor) const { + if (graph_inputs_.find(tensor) != graph_inputs_.end()) { + return graph_inputs_.at(tensor); + } + if (offset_index_.find(tensor) == offset_index_.end()) { + return ""; + } + if (kBufferPrefixName == nullptr || kOffsetPrefixName == nullptr) { + MS_LOG(ERROR) << "Buffer or Offset is a nullptr."; + return ""; + } + return std::string(kBufferPrefixName) + " + " + kOffsetPrefixName + "[" + std::to_string(offset_index_.at(tensor)) + + "]"; +} + +std::string DynamicMemManager::AllocWorkSpace(size_t size, int index) { + if (index < 0 || static_cast(index) >= buffer_sizes_.size()) { + return ""; + } + if (static_cast(index) + 1 >= workspaces_.size()) { + workspaces_.insert(workspaces_.end(), index + 1 - workspaces_.size(), 0); + } + if (workspaces_[index] < size) { + workspaces_[index] = size; + } + if (kBufferPrefixName == nullptr) { + MS_LOG(ERROR) << "Buffer is a nullptr."; + return ""; + } + if (kOffsetPrefixName == nullptr) { + MS_LOG(ERROR) << "Offset is a nullptr."; + return ""; + } + return "(" + std::string(kBufferPrefixName) + " + " + kOffsetPrefixName + "[" + + std::to_string(offsets_all_scenes_.begin()->second.size() - 1) + "])"; +} +} // namespace mindspore::lite::micro diff --git a/mindspore/lite/tools/converter/micro/coder/dynamic_mem_manager.h b/mindspore/lite/tools/converter/micro/coder/dynamic_mem_manager.h new file mode 100644 index 00000000..6db7cff5 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/dynamic_mem_manager.h @@ -0,0 +1,53 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_DYNAMIC_MEM_MANAGER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_DYNAMIC_MEM_MANAGER_H_ + +#include +#include +#include "src/tensor.h" +#include "tools/converter/micro/coder/shape_info_container.h" + +namespace mindspore::lite::micro { +class OperatorCoder; +class DynamicMemManager { + public: + DynamicMemManager() = default; + virtual ~DynamicMemManager() = default; + int AllocDynamicMem(const std::vector> &nodes, + const std::vector &graph_inputs, const std::vector &graph_outputs, + const ShapeInfoContainer *shape_info_container); + + std::string GetVarTensorAddr(const Tensor *tensor) const; + std::string AllocWorkSpace(size_t size, int index); + + const std::vector &GetBufferSizes() const { return buffer_sizes_; } + const std::vector &GetWorkSpaces() const { return workspaces_; } + const std::map> &GetOffsetAllScenes() { return offsets_all_scenes_; } + + private: + int AllocDynamicMemCore(const std::vector> &nodes, + const std::vector &graph_outputs, int scene_index); + std::map> offsets_all_scenes_; + std::map offset_index_; + std::map graph_inputs_; + std::vector buffer_sizes_; + std::vector workspaces_; + int model_id_; +}; +} // namespace mindspore::lite::micro +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_DYNAMIC_MEM_MANAGER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/generator/component/cmake_component.cc b/mindspore/lite/tools/converter/micro/coder/generator/component/cmake_component.cc index 643cf50b..831d4259 100644 --- a/mindspore/lite/tools/converter/micro/coder/generator/component/cmake_component.cc +++ b/mindspore/lite/tools/converter/micro/coder/generator/component/cmake_component.cc @@ -5,7 +5,7 @@ * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -29,32 +29,32 @@ void CodeCMakeNetLibrary(std::ofstream &ofs, const std::unique_ptr } ofs << "set(OP_SRC\n"; for (const std::string &c_file : ctx->c_files()) { - ofs << " " << c_file << ".o\n"; + ofs << " " << c_file << ".obj\n"; } for (int i = 0; i <= ctx->GetCurModelIndex(); ++i) { - ofs << " weight" << i << ".c.o\n" - << " net" << i << ".c.o\n" - << " model" << i << ".c.o\n"; + ofs << " weight" << i << ".c.obj\n" + << " net" << i << ".c.obj\n" + << " model" << i << ".c.obj\n"; } - ofs << " model.c.o\n" - << " context.c.o\n" - << " tensor.c.o\n"; - if (config->target() != kCortex_M) { - ofs << " allocator.c.o\n"; + ofs << " model.c.obj\n" + << " context.c.obj\n" + << " tensor.c.obj\n"; + if (config->target() != kCortex_M && !config->dynamic_shape()) { + ofs << " allocator.c.obj\n"; } if (config->debug_mode()) { - ofs << " debug_utils.c.o\n"; + ofs << " debug_utils.c.obj\n"; } if (config->support_parallel()) { - ofs << " micro_core_affinity.c.o\n" - " micro_thread_pool.c.o\n"; + ofs << " micro_core_affinity.c.obj\n" + " micro_thread_pool.c.obj\n"; } ofs << ")\n"; std::set kernel_cmake_asm_set_files = ctx->asm_files(); if (!kernel_cmake_asm_set_files.empty() && (config->target() == kARM32 || config->target() == kARM64)) { ofs << "set(ASSEMBLY_SRC\n"; for (const std::string &asm_file : kernel_cmake_asm_set_files) { - ofs << " " << asm_file << ".o\n"; + ofs << " " << asm_file << ".obj\n"; } ofs << ")\n" << "set_property(SOURCE ${ASSEMBLY_SRC} PROPERTY LANGUAGE C)\n" diff --git a/mindspore/lite/tools/converter/micro/coder/generator/component/common_component.cc b/mindspore/lite/tools/converter/micro/coder/generator/component/common_component.cc index 774e8353..62c2f668 100644 --- a/mindspore/lite/tools/converter/micro/coder/generator/component/common_component.cc +++ b/mindspore/lite/tools/converter/micro/coder/generator/component/common_component.cc @@ -16,6 +16,7 @@ #include "coder/generator/component/common_component.h" #include +#include "coder/generator/component/const_blocks/license.h" #include "coder/generator/component/component.h" #include "coder/utils/type_cast.h" #include "coder/utils/coder_utils.h" @@ -23,36 +24,59 @@ #include "include/errorcode.h" #include "nnacl/op_base.h" #include "include/c_api/model_c.h" +#include "tools/common/string_util.h" namespace mindspore::lite::micro { -const char handle_array_destroy_state[] = R"RAW( -void MSTensorHandleArrayDestroy(MSTensorHandleArray inputs); +const char model_runtime_init_source[] = R"RAW( +typedef struct { + void *runtime_buffer; + OH_AI_TensorHandleArray inputs; + OH_AI_TensorHandleArray outputs; +} MicroModel; +OH_AI_ModelHandle OH_AI_ModelCreate() { + MicroModel *micro_model = (MicroModel *)malloc(sizeof(MicroModel)); + if (micro_model == NULL) { + return NULL; + } +)RAW"; const char model_runtime_malloc_source[] = R"RAW( + int buffer_size = GetBufferSize(); + void *runtime_buffer = malloc(buffer_size); + if (runtime_buffer == NULL) { + return NULL; + } + micro_model->runtime_buffer = runtime_buffer; + int ret = SetBuffer(runtime_buffer); + if (ret != OH_AI_STATUS_SUCCESS) { + return NULL; + } + )RAW"; const char handle_array_destroy[] = R"RAW( -void MSTensorHandleArrayDestroy(MSTensorHandleArray inputs) { - if (inputs.handle_list == NULL) { -
return; - } - for (size_t i = 0; i < inputs.handle_num; i++) { - MicroTensor *micro_tensor = inputs.handle_list[i]; - if (micro_tensor == NULL) { - continue; - } - if (micro_tensor->data != NULL && micro_tensor->owned) { - free(micro_tensor->data); - micro_tensor->data = NULL; - micro_tensor->owned = false; - } - if (micro_tensor->shape != NULL) { - free(micro_tensor->shape); - micro_tensor->shape = NULL; - } - free(micro_tensor); - micro_tensor = NULL; - } - free(inputs.handle_list); - inputs.handle_list = NULL; +void OH_AI_TensorHandleArrayDestroy(OH_AI_TensorHandleArray inputs) { + if (inputs.handle_list == NULL) { + return; + } + for (size_t i = 0; i < inputs.handle_num; i++) { + MicroTensor *micro_tensor = inputs.handle_list[i]; + if (micro_tensor == NULL) { + continue; + } + if (micro_tensor->data != NULL && micro_tensor->owned) { + free(micro_tensor->data); + micro_tensor->data = NULL; + micro_tensor->owned = false; + } + if (micro_tensor->shape) { + free(micro_tensor->shape); + micro_tensor->shape = NULL; + } + free(micro_tensor); + micro_tensor = NULL; + } + free(inputs.handle_list); + inputs.handle_list = NULL; } )RAW"; @@ -62,7 +86,7 @@ const char cortex_set_workspace[] = R"RAW( if (micro_model == NULL) { return; } - if (workspace_size < MSModelCalcWorkspaceSize(model)) { + if (workspace_size < OH_AI_ModelCalcWorkspaceSize(model)) { return; } if (micro_model->inputs.handle_num != GRAPH_INPUTS_SIZE) { @@ -75,29 +99,29 @@ const char cortex_set_workspace[] = R"RAW( )RAW"; const char micro_model_build_state[] = R"RAW( -typedef MSStatus (*ModelBuild)(MSModelHandle model, const void *model_data, +typedef OH_AI_Status (*ModelBuild)(OH_AI_ModelHandle model, const void *model_data, size_t data_size, - const MSContextHandle model_context); + const OH_AI_ContextHandle model_context); )RAW"; const char micro_model_build_implement[] = R"RAW( -MSStatus MSModelBuild(MSModelHandle model, const void *model_data, - size_t data_size, MSModelType model_type, - const MSContextHandle model_context) { - if (model_type != kMSModelTypeMindIR) { - return kMSStatusLiteNotSupport; +OH_AI_Status OH_AI_ModelBuild(OH_AI_ModelHandle model, const void *model_data, + size_t data_size, OH_AI_ModelType model_type, + const OH_AI_ContextHandle model_context) { + if (model_type != OH_AI_MODELTYPE_MINDIR) { + return OH_AI_STATUS_LITE_NOT_SUPPORT; } if (model == NULL) { - return kMSStatusLiteParamInvalid; + return OH_AI_STATUS_LITE_PARAM_INVALID; } )RAW"; const char micro_model_predict_state[] = R"RAW( -typedef MSStatus (*ModelPredict)(MSModelHandle model, - const MSTensorHandleArray inputs, - MSTensorHandleArray *outputs, - const MSKernelCallBackC before, - const MSKernelCallBackC after); +typedef OH_AI_Status (*ModelPredict)(OH_AI_ModelHandle model, + const OH_AI_TensorHandleArray inputs, + OH_AI_TensorHandleArray *outputs, + const OH_AI_KernelCallBack before, + const OH_AI_KernelCallBack after); )RAW"; const char free_resource_state[] = R"RAW( @@ -107,7 +131,7 @@ typedef void (*FreeResource)(); void CodeMSModelCalcWorkspaceSize(std::ofstream &ofs, const std::unique_ptr &ctx, const Configurator &config) { if (config.target() == kCortex_M) { - ofs << "size_t MSModelCalcWorkspaceSize(MSModelHandle model) {\n" + ofs << "size_t OH_AI_ModelCalcWorkspaceSize(OH_AI_ModelHandle model) {\n" << " MicroModel *micro_model = (MicroModel *)model;\n" << " if (micro_model == NULL) {\n" << " return 0;\n" @@ -118,13 +142,13 @@ void CodeMSModelCalcWorkspaceSize(std::ofstream &ofs, const std::unique_ptrcalc_work_space(model);\n" << 
"}\n"; } else { - ofs << "size_t MSModelCalcWorkspaceSize(MSModelHandle model) {\n return 0;\n}\n"; + ofs << "size_t OH_AI_ModelCalcWorkspaceSize(OH_AI_ModelHandle model) {\n return 0;\n}\n"; } ofs << "\n"; } void CodeCortexCalcWorkspaceSize(std::ofstream &ofs, const std::unique_ptr &ctx) { - ofs << "size_t MSModelCalcWorkspaceSize" << ctx->GetCurModelIndex() << "(MSModelHandle model) {\n" + ofs << "size_t OH_AI_ModelCalcWorkspaceSize" << ctx->GetCurModelIndex() << "(OH_AI_ModelHandle model) {\n" << "size_t shape_size = 0;\n"; std::vector inputs = ctx->graph_inputs(); for (size_t i = 0; i < inputs.size(); ++i) { @@ -141,7 +165,7 @@ void CodeCortexCalcWorkspaceSize(std::ofstream &ofs, const std::unique_ptr &ctx, const Configurator &config) { - ofs << "void MSModelSetWorkspace(MSModelHandle model, void *workspace, size_t workspace_size) {"; + ofs << "void OH_AI_ModelSetWorkspace(OH_AI_ModelHandle model, void *workspace, size_t workspace_size) {"; if (config.target() == kCortex_M) { ofs << " MicroModel *micro_model = (MicroModel *)model;\n" << " if (micro_model == NULL) {\n" @@ -156,8 +180,8 @@ void CodeMSModelSetWorkspace(std::ofstream &ofs, const std::unique_ptr &ctx) { - ofs << "void MSModelSetWorkspace" << ctx->GetCurModelIndex() - << "(MSModelHandle model, void *workspace, size_t workspace_size) {\n"; + ofs << "void OH_AI_ModelSetWorkspace" << ctx->GetCurModelIndex() + << "(OH_AI_ModelHandle model, void *workspace, size_t workspace_size) {\n"; ofs << cortex_set_workspace; ofs << " micro_model->runtime_buffer = workspace;\n" " int buffer_size = GetBufferSize" @@ -173,12 +197,12 @@ void CodeCortexSetWorkspace(std::ofstream &ofs, const std::unique_ptrinputs.handle_list = (MSTensorHandle *)&buf[buffer_size]; + micro_model->inputs.handle_list = (OH_AI_TensorHandle *)&buf[buffer_size]; buffer_size += GRAPH_INPUTS_SIZE * sizeof(MicroTensor *); buffer_size = UP_ROUND(buffer_size,4); MicroTensor **input_tensors = (MicroTensor **)micro_model->inputs.handle_list; - micro_model->outputs.handle_list = (MSTensorHandle *)&buf[buffer_size]; + micro_model->outputs.handle_list = (OH_AI_TensorHandle *)&buf[buffer_size]; buffer_size += GRAPH_OUTPUTS_SIZE * sizeof(MicroTensor *); buffer_size = UP_ROUND(buffer_size,4); MicroTensor **output_tensors = (MicroTensor **)micro_model->outputs.handle_list; @@ -215,7 +239,7 @@ void CodeCortexSetWorkspace(std::ofstream &ofs, const std::unique_ptrtype = " << EnumNameMSDataType(tensor->data_type()) << ";\n"; - ofs << kAlignedString << prefix << "_tensors[" << index << "]->format = kMSFormatNHWC;\n"; + ofs << kAlignedString << prefix << "_tensors[" << index << "]->format = OH_AI_FORMAT_NHWC;\n"; ofs << kAlignedString << prefix << "_tensors[" << index << "]->ndim = " << tensor->shape().size() << ";\n"; size_t shape_size = tensor->shape().size(); for (size_t i = 0; i < shape_size; i++) { @@ -234,32 +258,31 @@ void CodeCortexSetWorkspace(std::ofstream &ofs, const std::unique_ptr &ctx, const Configurator &config) { if (config.target() != kCortex_M) { - ofs << "MSStatus MSModelCreate" << ctx->GetCurModelIndex() << "(MicroModel *micro_model) {"; + ofs << "OH_AI_Status OH_AI_ModelCreate" << ctx->GetCurModelIndex() << "(MicroModel *micro_model) {"; ofs << R"RAW( if (micro_model == NULL) { - return kMSStatusLiteNullptr; - } - - void *runtime_buffer = GlobalMemory(); - if (runtime_buffer == NULL) { - return kMSStatusLiteNullptr; + return OH_AI_STATUS_LITE_NULLPTR; } - micro_model->runtime_buffer = runtime_buffer; )RAW"; - ofs << " int ret = SetBuffer" << ctx->GetCurModelIndex() << 
"(((MemBlock *)runtime_buffer)->addr);\n" - << " if (ret != kMSStatusSuccess) {\n" - << " return kMSStatusLiteMemoryFailed;\n" - << " }\n\n"; + if (!config.dynamic_shape()) { + ofs << "void *runtime_buffer = GlobalMemory();\n" + << "if (runtime_buffer == NULL) {\n" + << " return OH_AI_STATUS_LITE_NULLPTR;\n" + << " }\n" + << " micro_model->runtime_buffer = runtime_buffer;\n"; + ofs << " int ret = SetBuffer" << ctx->GetCurModelIndex() << "(((MemBlock *)runtime_buffer)->addr);\n" + << " if (ret != OH_AI_STATUS_SUCCESS) {\n" + << " return OH_AI_STATUS_LITE_MEMORY_FAILED;\n" + << " }\n\n"; + } else { + ofs << " micro_model->runtime_buffer = NULL;\n"; + } if (config.code_mode() == CodeMode::Inference) { ofs << " micro_model->train_mode = false;\n"; } else if (config.code_mode() == CodeMode::Train) { @@ -269,7 +292,7 @@ void CodeMSModelCreate(std::ofstream &ofs, const std::unique_ptr & ofs << kAlignedString << prefix << "_tensors[" << index << "] = malloc(sizeof(MicroTensor));\n"; ofs << kAlignedString << prefix << "_tensors[" << index << "]->type = " << EnumNameMSDataType(tensor->data_type()) << ";\n"; - ofs << kAlignedString << prefix << "_tensors[" << index << "]->format = kMSFormatNHWC;\n"; + ofs << kAlignedString << prefix << "_tensors[" << index << "]->format = OH_AI_FORMAT_NHWC;\n"; ofs << kAlignedString << prefix << "_tensors[" << index << "]->ndim = " << tensor->shape().size() << ";\n"; size_t shape_size = tensor->shape().size(); ofs << kAlignedString << prefix << "_tensors[" << index << "]->shape = " @@ -289,30 +312,30 @@ void CodeMSModelCreate(std::ofstream &ofs, const std::unique_ptr & outputs = ctx->graph_train_outputs(); } size_t inputs_size = inputs.size(); - ofs << " MSTensorHandleArray model_inputs;\n"; + ofs << " OH_AI_TensorHandleArray model_inputs;\n"; ofs << " model_inputs.handle_num = " << inputs_size << ";\n"; ofs << " MicroTensor **input_tensors = malloc(" << inputs_size << " * sizeof(MicroTensor *));\n"; - ofs << " model_inputs.handle_list = (MSTensorHandle *)(input_tensors);\n"; + ofs << " model_inputs.handle_list = (OH_AI_TensorHandle *)(input_tensors);\n"; ofs << " micro_model->inputs = model_inputs;\n"; for (size_t i = 0; i < inputs_size; ++i) { Tensor *input = inputs[i]; array_tostring(input, "input", i); } size_t outputs_size = outputs.size(); - ofs << " MSTensorHandleArray model_outputs;\n"; + ofs << " OH_AI_TensorHandleArray model_outputs;\n"; ofs << " model_outputs.handle_num = " << outputs_size << ";\n"; ofs << " MicroTensor **output_tensors = malloc(" << outputs_size << " * sizeof(MicroTensor *));\n"; - ofs << " model_outputs.handle_list = (MSTensorHandle *)(output_tensors);\n"; + ofs << " model_outputs.handle_list = (OH_AI_TensorHandle *)(output_tensors);\n"; ofs << " micro_model->outputs = model_outputs;\n"; for (size_t i = 0; i < outputs_size; ++i) { Tensor *output = outputs[i]; array_tostring(output, "output", i); } - ofs << " return kMSStatusSuccess;\n"; + ofs << " return OH_AI_STATUS_SUCCESS;\n"; } else { - ofs << "MSStatus MSModelCreate" << ctx->GetCurModelIndex() << "(MicroModel *micro_model) {\n"; + ofs << "OH_AI_Status OH_AI_ModelCreate" << ctx->GetCurModelIndex() << "(MicroModel *micro_model) {\n"; ofs << " micro_model->train_mode = false;\n"; - ofs << " return kMSStatusSuccess;\n"; + ofs << " return OH_AI_STATUS_SUCCESS;\n"; } ofs << "}\n\n"; } @@ -324,20 +347,20 @@ void CodeMSModelBuildCommon(std::ofstream &ofs, const Configurator &config) { ofs << R"RAW( MicroModel *micro_model = (MicroModel *)model; if (micro_model == NULL) { - return 
kMSStatusLiteNullptr; + return OH_AI_STATUS_LITE_NULLPTR; } if (micro_model->build == NULL) { - return kMSStatusLiteNullptr; + return OH_AI_STATUS_LITE_NULLPTR; } )RAW"; - if (config.target() != kCortex_M) { + if (config.target() != kCortex_M && !config.dynamic_shape()) { ofs << " IncRefCount();\n"; } ofs << R"RAW( - MSStatus ret = + OH_AI_Status ret = micro_model->build(model, model_data, data_size, model_context); - if (ret != kMSStatusSuccess) { - MSModelDestroy(model); + if (ret != OH_AI_STATUS_SUCCESS) { + OH_AI_ModelDestroy(&model); } return ret; } @@ -345,23 +368,23 @@ void CodeMSModelBuildCommon(std::ofstream &ofs, const Configurator &config) { } void CodeMSModelBuild(std::ofstream &ofs, const int model_index, const size_t weight_size, const Configurator &config) { - ofs << "MSStatus MSModelBuild" << model_index - << "(MSModelHandle model, const void *model_data, size_t data_size,\n" - " const MSContextHandle model_context) {\n" + ofs << "OH_AI_Status OH_AI_ModelBuild" << model_index + << "(OH_AI_ModelHandle model, const void *model_data, size_t data_size,\n" + " const OH_AI_ContextHandle model_context) {\n" " if (model == NULL) {\n" - " return kMSStatusLiteParamInvalid;\n" + " return OH_AI_STATUS_LITE_PARAM_INVALID;\n" " }\n"; if (config.changeable_weights_name().empty()) { ofs << " if (data_size != " << weight_size << ") {\n" - " return kMSStatusLiteInputParamInvalid;\n" + " return OH_AI_STATUS_LITE_INPUT_PARAM_INVALID;\n" " }\n"; } ofs << " MicroModel *micro_model = (MicroModel *)model;\n" - " int ret = MSModelCreate" + " int ret = OH_AI_ModelCreate" << model_index << "(micro_model);\n" - " if (ret != kMSStatusSuccess) {\n" + " if (ret != OH_AI_STATUS_SUCCESS) {\n" " return ret;\n" " }\n"; if (config.target() != kCortex_M) { @@ -372,7 +395,7 @@ void CodeMSModelBuild(std::ofstream &ofs, const int model_index, const size_t we if (config.support_parallel()) { ofs << " MicroContext *micro_context = (MicroContext *)model_context;\n" " if (micro_context == NULL) {\n" - " return kMSStatusLiteNullptr;" + " return OH_AI_STATUS_LITE_NULLPTR;" " }\n" " ret = CreateThreadPool(micro_context->thread_num_);\n" " if(ret != RET_OK) {\n" @@ -384,35 +407,172 @@ void CodeMSModelBuild(std::ofstream &ofs, const int model_index, const size_t we ofs << "}\n"; } +void CodeMSModelResizeInit(std::ofstream &ofs, const std::unique_ptr &ctx, const Configurator &config) { + auto &dynamic_symbols_num = config.dynamic_symbols_num(); + std::string array_index; + for (auto num : dynamic_symbols_num) { + array_index += "[" + std::to_string(num) + "]"; + } + auto shapes = ctx->shape_all_scenes(); + if (!shapes.empty()) { + auto num_of_each_scene = shapes.begin()->second.size(); + ofs << " static int shapes" << array_index << "[" + std::to_string(num_of_each_scene) + "] = {"; + for (auto &item : shapes) { + auto &shape_val = item.second; + for (size_t j = 0; j < shape_val.size(); ++j) { + ofs << shape_val[j] << ", "; + } + } + ofs << "};\n"; + } + auto offsets = ctx->offset_all_scenes(); + if (!offsets.empty()) { + auto num_of_each_scene = offsets.begin()->second.size(); + ofs << " static int offsets" << array_index << "[" + std::to_string(num_of_each_scene) + "] = {"; + for (auto &item : offsets) { + auto &offset_val = item.second; + for (size_t j = 0; j < offset_val.size(); ++j) { + ofs << offset_val[j] << ", "; + } + } + ofs << "};\n"; + } + ofs << " size_t buffer_sizes" << array_index << " = {"; + auto buffer_size = ctx->buffer_sizes(); + auto workspace = ctx->workspaces(); + if (buffer_size.size() != 
workspace.size()) { + return; + } + for (size_t i = 0; i < buffer_size.size(); i++) { + ofs << buffer_size[i] + workspace[i] << ", "; + } + ofs << "};\n"; +} + +void CodeMSModelResize(std::ofstream &ofs, const std::unique_ptr &ctx, const Configurator &config) { + auto &shape_templates = ctx->shape_templates(); + ofs << "OH_AI_Status OH_AI_ModelResize" << ctx->GetCurModelIndex() + << "(OH_AI_ModelHandle model, const OH_AI_TensorHandleArray inputs, OH_AI_ShapeInfo *shape_infos, size_t " + "shape_info_num) {\n" + " if (model == NULL) {\n" + " return OH_AI_STATUS_LITE_PARAM_INVALID;\n" + " }\n"; + if (!config.dynamic_shape()) { + ofs << " return OH_AI_STATUS_LITE_NOT_SUPPORT;\n"; + } else { + ofs << " MicroModel *micro_model = (MicroModel *)model;\n" + << " if (micro_model == NULL) {\n" + " return OH_AI_STATUS_LITE_NULLPTR;\n" + " }\n"; + CodeMSModelResizeInit(ofs, ctx, config); + std::map> symbol_to_indexes; + std::map user_to_inner; + auto &user_graph_inputs_template = config.user_graph_inputs_template(); + for (size_t i = 0; i < ctx->graph_inputs().size(); ++i) { + auto cur_tensor = ctx->graph_inputs()[i]; + auto cur_shapes = shape_templates.at(cur_tensor); + for (size_t j = 0; j < cur_shapes.size(); ++j) { + if (IsNumber(cur_shapes.at(j))) { + continue; + } + ofs << " if (shape_infos[" << i << "].shape[" << j << "] <= 0) {\n" + << " return OH_AI_STATUS_LITE_PARAM_INVALID;\n" + << " }\n"; + ofs << " ((MicroTensor *)(inputs.handle_list[" << i << "]))->shape[" << j << "] = shape_infos[" << i + << "].shape[" << j << "];\n"; + if (symbol_to_indexes.find(cur_shapes.at(j)) != symbol_to_indexes.end()) { + continue; + } + symbol_to_indexes[cur_shapes.at(j)] = {static_cast(i), static_cast(j)}; + user_to_inner[user_graph_inputs_template[i][j]] = cur_shapes.at(j); + } + } + int index = 0; + std::map inner_to_outer; + for (auto &item : symbol_to_indexes) { + ofs << " int dim" << index << " = shape_infos[" << item.second[0] << "].shape[" << item.second[1] << "];\n"; + inner_to_outer[item.first] = "dim" + std::to_string(index); + ++index; + } + std::string condition; + index = 0; + for (; index < static_cast(symbol_to_indexes.size()) - 1; ++index) { + condition += "store" + std::to_string(ctx->GetCurModelIndex()) + "_" + std::to_string(index) + " == dim" + + std::to_string(index) + " && "; + } + condition += "store" + std::to_string(ctx->GetCurModelIndex()) + "_" + std::to_string(index) + " == dim" + + std::to_string(index); + ofs << " if (" << condition << ") {\n" + << " return OH_AI_STATUS_SUCCESS;\n" + << " }\n"; + for (size_t i = 0; i < symbol_to_indexes.size(); ++i) { + ofs << " store" + std::to_string(ctx->GetCurModelIndex()) + "_" << i << " = dim" << i << ";\n"; + } + ofs << " if (" << kBufferPrefixName << " != NULL) {\n"; + ofs << " free(" << kBufferPrefixName << ");\n"; + ofs << " }\n"; + std::string real_array_index; + auto &dynamic_symbols = config.dynamic_symbols(); + for (auto &symbol : dynamic_symbols) { + real_array_index += "[" + inner_to_outer[user_to_inner[symbol]] + " - 1]"; + } + ofs << " " << kBufferPrefixName << " = malloc(buffer_sizes" << real_array_index << ");\n"; + ofs << " micro_model->runtime_buffer = " << kBufferPrefixName << ";\n"; + ofs << " " << kShapePrefixName << " = &shapes" << real_array_index << "[0];\n"; + ofs << " " << kOffsetPrefixName << " = &offsets" << real_array_index << "[0];\n"; + ofs << " OH_AI_TensorHandleArray outputs = OH_AI_ModelGetOutputs(model);\n"; + for (size_t i = 0; i < ctx->graph_outputs().size(); ++i) { + ofs << " 
OH_AI_TensorSetData(outputs.handle_list[" << i << "], NULL);\n"; + auto cur_tensor = ctx->graph_outputs()[i]; + auto cur_shapes = shape_templates.at(cur_tensor); + for (size_t j = 0; j < cur_shapes.size(); ++j) { + if (IsNumber(cur_shapes.at(j))) { + continue; + } + ofs << " ((MicroTensor *)(outputs.handle_list[" << i << "]))->shape[" << j << "] = " << cur_shapes.at(j) + << ";\n"; + } + } + ofs << " return OH_AI_STATUS_SUCCESS;\n"; + } + ofs << "}\n"; +} + void CodeMSModelDestory(std::ofstream &ofs, const Configurator *config) { - if (config->target() != kCortex_M) { + if (config->code_mode() == CodeMode::Inference && config->target() != kCortex_M) { ofs << handle_array_destroy; } - ofs << "void MSModelDestroy(MSModelHandle *model) {\n"; + ofs << "void OH_AI_ModelDestroy(OH_AI_ModelHandle *model) {\n"; + ofs << " if (*model) {\n" + " MicroModel *micro_model = (MicroModel *)*model;\n"; if (config->target() != kCortex_M) { - ofs << " if (*model) {\n" - " MicroModel *micro_model = (MicroModel *)*model;\n"; - ofs << " if (micro_model->runtime_buffer) {\n" - " micro_model->runtime_buffer = NULL;\n" - " }\n"; - ofs << " MSTensorHandleArrayDestroy(micro_model->inputs);\n" - " MSTensorHandleArrayDestroy(micro_model->outputs);\n" - " micro_model->inputs.handle_list = NULL;\n" + ofs << " if (micro_model->runtime_buffer) {\n"; + if (config->dynamic_shape()) { + ofs << " free(micro_model->runtime_buffer);\n"; + } else { + ofs << " micro_model->runtime_buffer = NULL;\n"; + } + ofs << " }\n"; + } + ofs << " OH_AI_TensorHandleArrayDestroy(micro_model->inputs);\n" + " OH_AI_TensorHandleArrayDestroy(micro_model->outputs);\n"; + if (config->code_mode() == CodeMode::Inference) { + ofs << " micro_model->inputs.handle_list = NULL;\n" " micro_model->outputs.handle_list = NULL;\n" - " micro_model->free_resource();\n" - " DecRefCount();\n" - " }\n"; - - if (config->support_parallel()) { - ofs << " ClearThreadPool();\n"; + " micro_model->free_resource();\n"; + if (!config->dynamic_shape()) { + ofs << " DecRefCount();\n"; } + ofs << " }\n"; } else { - ofs << " if (*model) {\n" - " MicroModel *micro_model = (MicroModel *)*model;\n"; - ofs << " micro_model->runtime_buffer = NULL;\n" + ofs << " free(*model);\n" " *model = NULL;\n" " }\n"; } + + if (config->support_parallel()) { + ofs << " ClearThreadPool();\n"; + } ofs << "}\n"; } @@ -420,14 +580,14 @@ void CodeMSModelPredictState(std::ofstream &ofs) { ofs << micro_model_predict_st void CodeMSModelPredictCommon(std::ofstream &ofs) { ofs << R"RAW( -MSStatus MSModelPredict(MSModelHandle model, const MSTensorHandleArray inputs, MSTensorHandleArray *outputs, - const MSKernelCallBackC before, const MSKernelCallBackC after) { +OH_AI_Status OH_AI_ModelPredict(OH_AI_ModelHandle model, const OH_AI_TensorHandleArray inputs, OH_AI_TensorHandleArray *outputs, + const OH_AI_KernelCallBack before, const OH_AI_KernelCallBack after) { MicroModel *micro_model = (MicroModel *)model; if (micro_model == NULL) { - return kMSStatusLiteNullptr; + return OH_AI_STATUS_LITE_NULLPTR; } if (micro_model->predict == NULL) { - return kMSStatusLiteNullptr; + return OH_AI_STATUS_LITE_NULLPTR; } return micro_model->predict(model, inputs, outputs, before, after); } @@ -438,35 +598,35 @@ MSStatus MSModelPredict(MSModelHandle model, const MSTensorHandleArray inputs, M void CodeMSModelPredict(std::ofstream &ofs, const std::unique_ptr &ctx, const Configurator &config) { auto inputs_num = ctx->graph_inputs().size(); auto outputs_num = ctx->graph_outputs().size(); - ofs << "MSStatus MSModelPredict" << 
ctx->GetCurModelIndex() - << "(MSModelHandle model, const MSTensorHandleArray inputs, MSTensorHandleArray *outputs,\n" - << " const MSKernelCallBackC before, const MSKernelCallBackC after) {\n"; + ofs << "OH_AI_Status OH_AI_ModelPredict" << ctx->GetCurModelIndex() + << "(OH_AI_ModelHandle model, const OH_AI_TensorHandleArray inputs, OH_AI_TensorHandleArray *outputs,\n" + << " const OH_AI_KernelCallBack before, const OH_AI_KernelCallBack after) {\n"; ofs << R"RAW( MicroModel *micro_model = (MicroModel *)model; if (micro_model == NULL) { - return kMSStatusLiteNullptr; + return OH_AI_STATUS_LITE_NULLPTR; } if (micro_model->runtime_buffer == NULL) { - return kMSStatusLiteMemoryFailed; + return OH_AI_STATUS_LITE_MEMORY_FAILED; } )RAW"; ofs << " if (inputs.handle_num != " << inputs_num << ") {\n"; - ofs << " return kMSStatusLiteParamInvalid;\n"; + ofs << " return OH_AI_STATUS_LITE_PARAM_INVALID;\n"; ofs << " }\n"; ofs << " if (outputs->handle_num != " << outputs_num << ") {\n"; - ofs << " return kMSStatusLiteParamInvalid;\n"; + ofs << " return OH_AI_STATUS_LITE_PARAM_INVALID;\n"; ofs << " }\n"; - if (config.target() != kCortex_M) { + if (config.target() != kCortex_M && !config.dynamic_shape()) { ofs << " if (!LockBuffer(micro_model->runtime_buffer)) {\n" << " void *buffer = Malloc(GetBufferSize" << ctx->GetCurModelIndex() << "());\n" << " if (buffer == NULL) {\n" - << " return kMSStatusLiteNullptr;\n" + << " return OH_AI_STATUS_LITE_NULLPTR;\n" << " }\n" << " if (micro_model->runtime_buffer != buffer) {\n" << " micro_model->runtime_buffer = buffer;\n" << " int ret = SetBuffer" << ctx->GetCurModelIndex() << "(((MemBlock *)buffer)->addr);\n" - << " if (ret != kMSStatusSuccess) {\n" - << " return kMSStatusLiteMemoryFailed;\n" + << " if (ret != OH_AI_STATUS_SUCCESS) {\n" + << " return OH_AI_STATUS_LITE_MEMORY_FAILED;\n" << " }\n" << " }\n" << " }\n"; @@ -495,8 +655,7 @@ void CodeMSModelPredict(std::ofstream &ofs, const std::unique_ptr ofs << " }\n"; ofs << " }\n"; ofs << "\n"; - ofs << " void *outputs_data_array[" << outputs_num << "];\n"; - ofs << " int expect_out_types[" << outputs_num << "] = {"; + ofs << " int cur_out_types[" << outputs_num << "] = {"; for (size_t i = 0; i < outputs_num; ++i) { ofs << ctx->graph_outputs().at(i)->data_type() << ", "; } @@ -506,21 +665,18 @@ void CodeMSModelPredict(std::ofstream &ofs, const std::unique_ptr ofs << "false, "; } ofs << "};\n"; - ofs << " for (int i = 0; i < " << outputs_num << "; i++) {\n"; - ofs << " outputs_data_array[i] = MSTensorGetMutableData(outputs->handle_list[i]);\n"; - ofs << " }\n"; - ofs << " CopyOutputsData" << ctx->GetCurModelIndex() - << "(outputs, outputs_data_array, expect_out_types, out_type_changed);\n"; - if (config.target() != kCortex_M) { + ofs << " OH_AI_Status ret = CopyOutputsData" << ctx->GetCurModelIndex() + << "(outputs, cur_out_types, out_type_changed);\n"; + if (config.target() != kCortex_M && !config.dynamic_shape()) { ofs << " UnLockBuffer(micro_model->runtime_buffer);\n"; } - ofs << " return kMSStatusSuccess;\n"; + ofs << " return ret;\n"; ofs << "}\n"; } void CodeCopyOutputsState(std::ofstream &ofs, const int model_index) { - ofs << "int CopyOutputsData" << model_index - << "(MSTensorHandleArray *outputs_ori, void **outputs, int *expect_types, bool *type_changed);\n\n"; + ofs << "OH_AI_Status CopyOutputsData" << model_index + << "(OH_AI_TensorHandleArray *outputs_ori, void **outputs, int *cur_out_types, bool *type_changed);\n\n"; } void CodeCopyOutputsImplement(std::ofstream &ofs, const std::unique_ptr &ctx) { @@ 
-528,56 +684,60 @@ void CodeCopyOutputsImplement(std::ofstream &ofs, const std::unique_ptr outputs = ctx->graph_outputs(); size_t outputs_size = outputs.size(); - ofs << "int CopyOutputsData" << ctx->GetCurModelIndex() - << "(MSTensorHandleArray *outputs_ori, void **outputs, int *expect_types, bool *type_changed) {\n" - " if (outputs_ori == NULL || outputs == NULL) {\n" - " return RET_ERROR;\n" + ofs << "OH_AI_Status CopyOutputsData" << ctx->GetCurModelIndex() + << "(OH_AI_TensorHandleArray *outputs_ori, int *cur_out_types, bool *type_changed) {\n" + " if (outputs_ori == NULL || cur_out_types == NULL || type_changed == NULL) {\n" + " return OH_AI_STATUS_LITE_NULLPTR;\n" " }\n"; ofs << " unsigned char *buffer[" << outputs_size << "] = {"; for (size_t i = 0; i < outputs_size; ++i) { - ofs << tensor_map[outputs[i]] << ", "; - } - ofs << "};\n"; - ofs << " size_t buffer_size[" << outputs_size << "] = {"; - for (size_t i = 0; i < outputs_size; ++i) { - Tensor *output = outputs[i]; - MS_CHECK_PTR_IF_NULL(output); - ofs << output->Size() << ", "; + auto out_str = ctx->tensor_addr(outputs[i]); + if (out_str.empty()) { + ofs << tensor_map[outputs[i]] << ", "; + } else { + ofs << out_str << ", "; + } } ofs << "};\n"; ofs << " for (int i = 0; i < " << outputs_size << "; i++) {\n" << " MicroTensor *micro_tensor = (MicroTensor *)outputs_ori->handle_list[i];\n" - << " int cur_type = micro_tensor->type;\n" - << " int expect_type = expect_types[i];\n"; - ofs << " if (cur_type == expect_type) {\n" - << " memcpy(outputs[i], buffer[i], buffer_size[i]);\n" + << " int expect_type = micro_tensor->type;\n" + << " int cur_type = cur_out_types[i];\n"; + ofs << " if (expect_type == cur_type) {\n" + << " micro_tensor->data = buffer[i];\n" + << " micro_tensor->owned = false;\n" << " continue;\n" << " }\n" + << "#ifdef ENABLE_FP16\n" << " int shape_size = micro_tensor->ndim;\n" << " int num = 1;\n" - << " for (int i = 0; i < shape_size; ++i) {\n" - << " num *= micro_tensor->shape[i];\n" + << " for (int j = 0; j < shape_size; ++j) {\n" + << " num *= micro_tensor->shape[j];\n" << " }\n"; - ofs << " int type_trans_mode = TypeTransMode_MAX;\n" - " if (expect_type == kMSDataTypeNumberTypeFloat16 && cur_type == kMSDataTypeNumberTypeFloat32) {\n" - " type_trans_mode = TypeTransMode_FP32_TO_FP16;\n" - " } else if (expect_type == kMSDataTypeNumberTypeFloat32 && cur_type == kMSDataTypeNumberTypeFloat16) {\n" - " type_trans_mode = TypeTransMode_FP16_TO_FP32;\n" - " }\n"; + ofs + << " int type_trans_mode = TypeTransMode_MAX;\n" + " if (expect_type == OH_AI_DATATYPE_NUMBERTYPE_FLOAT16 && cur_type == OH_AI_DATATYPE_NUMBERTYPE_FLOAT32) {\n" + " type_trans_mode = TypeTransMode_FP32_TO_FP16;\n" + " } else if (expect_type == OH_AI_DATATYPE_NUMBERTYPE_FLOAT32 && cur_type == " + "OH_AI_DATATYPE_NUMBERTYPE_FLOAT16) {\n" + " type_trans_mode = TypeTransMode_FP16_TO_FP32;\n" + " }\n"; ofs << " if (type_trans_mode == TypeTransMode_UNSUPPORT) {\n" - << " return kMSStatusLiteNotSupport;\n" + << " return OH_AI_STATUS_LITE_NOT_SUPPORT;\n" << " }\n"; - ofs << "#ifdef ENABLE_FP16\n" - << " if (type_trans_mode == TypeTransMode_FP32_TO_FP16) {\n" - << " Fp32CastToFp16((float *)(buffer[i]), (float16_t *)&outputs, num);\n" + ofs << " void *out_data = OH_AI_TensorGetMutableData(micro_tensor);\n"; + ofs << " if (type_trans_mode == TypeTransMode_FP32_TO_FP16) {\n" + << " Fp32CastToFp16((float *)(buffer[i]), (float16_t *)out_data, num);\n" << " type_changed[i] = true;\n" << " } else if (type_trans_mode == TypeTransMode_FP16_TO_FP32) {\n" - << " 
Fp16CastToFp32((float16_t *)&outputs, (float *)(buffer[i]), num);\n" + << " Fp16CastToFp32((float16_t *)(buffer[i]), (float *)out_data, num);\n" << " type_changed[i] = true;\n" << " }\n" + << "#else\n" + << " return OH_AI_STATUS_LITE_NOT_SUPPORT;\n" << "#endif\n" << " }\n"; - ofs << " return RET_OK;\n" + ofs << " return OH_AI_STATUS_SUCCESS;\n" "}\n\n"; } @@ -688,6 +848,16 @@ void CodeInitResourceImplement(std::ofstream &ofs, const std::unique_ptr &ctx, const Configurator &config) { + ofs << "void Reset" << ctx->GetCurModelIndex() << "() {\n"; + auto &dynamic_symbols = config.dynamic_symbols(); + for (size_t i = 0; i < dynamic_symbols.size(); ++i) { + ofs << " store" << ctx->GetCurModelIndex() << "_" << i << " = -1;\n"; + } + ofs << " FreeResource" << ctx->GetCurModelIndex() << "();\n"; + ofs << "}\n"; +} + void CodeFreeResourceState(std::ofstream &ofs) { ofs << free_resource_state; } void CodeFreeResourceImplement(std::ofstream &ofs, const std::unique_ptr &ctx, diff --git a/mindspore/lite/tools/converter/micro/coder/generator/component/common_component.h b/mindspore/lite/tools/converter/micro/coder/generator/component/common_component.h index 56209f05..6f0c7736 100644 --- a/mindspore/lite/tools/converter/micro/coder/generator/component/common_component.h +++ b/mindspore/lite/tools/converter/micro/coder/generator/component/common_component.h @@ -32,12 +32,13 @@ void CodeMSModelCalcWorkspaceSize(std::ofstream &ofs, const std::unique_ptr &ctx); void CodeMSModelSetWorkspace(std::ofstream &ofs, const std::unique_ptr &ctx, const Configurator &config); void CodeCortexSetWorkspace(std::ofstream &ofs, const std::unique_ptr &ctx); -void CodeMSTensorHandleArrayDestroyState(std::ofstream &ofs, const Configurator &config); void CodeMSModelCreateDefault(std::ofstream &ofs); void CodeMSModelCreate(std::ofstream &ofs, const std::unique_ptr &ctx, const Configurator &config); void CodeMSModelBuildState(std::ofstream &ofs); void CodeMSModelBuildCommon(std::ofstream &ofs, const Configurator &config); void CodeMSModelBuild(std::ofstream &ofs, const int model_index, const size_t weight_size, const Configurator &config); +void CodeMSModelResizeInit(std::ofstream &ofs, const std::unique_ptr &ctx, const Configurator &config); +void CodeMSModelResize(std::ofstream &ofs, const std::unique_ptr &ctx, const Configurator &config); void CodeMSModelDestory(std::ofstream &ofs, const Configurator *config); void CodeMSModelPredictState(std::ofstream &ofs); void CodeMSModelPredictCommon(std::ofstream &ofs); @@ -57,6 +58,7 @@ void CodeGraphQuantArgsImplement(std::ofstream &ofs, const std::unique_ptr &ctx); +void CodeResetImplement(std::ofstream &ofs, const std::unique_ptr &ctx, const Configurator &config); void CodeFreeResourceState(std::ofstream &ofs); void CodeFreeResourceImplement(std::ofstream &ofs, const std::unique_ptr &ctx, const Configurator &config); diff --git a/mindspore/lite/tools/converter/micro/coder/generator/component/component.cc b/mindspore/lite/tools/converter/micro/coder/generator/component/component.cc index b2ed21be..0ee02e0c 100644 --- a/mindspore/lite/tools/converter/micro/coder/generator/component/component.cc +++ b/mindspore/lite/tools/converter/micro/coder/generator/component/component.cc @@ -24,6 +24,8 @@ const char *kOutputPrefixName = nullptr; const char *kWeightPrefixName = nullptr; const char *kBufferPrefixName = nullptr; const char *kBufferPrefixNameAdd = nullptr; +const char *kOffsetPrefixName = nullptr; +const char *kShapePrefixName = nullptr; char *ModifyPrefixName(char *name, int 
model_index, const std::string &prefix) { if (name != nullptr) { @@ -57,6 +59,8 @@ void FreeGlobalVariable() { Free(kWeightPrefixName); Free(kBufferPrefixName); Free(kBufferPrefixNameAdd); + Free(kOffsetPrefixName); + Free(kShapePrefixName); } void InitGlobalVariable(int model_index) { @@ -65,5 +69,7 @@ kWeightPrefixName = ModifyPrefixName(const_cast(kWeightPrefixName), model_index, "_weight"); kBufferPrefixName = ModifyPrefixName(const_cast(kBufferPrefixName), model_index, "_buffer"); kBufferPrefixNameAdd = ModifyPrefixName(const_cast(kBufferPrefixNameAdd), model_index, "_buffer + "); + kOffsetPrefixName = ModifyPrefixName(const_cast(kOffsetPrefixName), model_index, "_offset"); + kShapePrefixName = ModifyPrefixName(const_cast(kShapePrefixName), model_index, "_shape"); } } // namespace mindspore::lite::micro diff --git a/mindspore/lite/tools/converter/micro/coder/generator/component/component.h b/mindspore/lite/tools/converter/micro/coder/generator/component/component.h index 0e943317..e084d692 100644 --- a/mindspore/lite/tools/converter/micro/coder/generator/component/component.h +++ b/mindspore/lite/tools/converter/micro/coder/generator/component/component.h @@ -16,7 +16,6 @@ #ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_GENERATOR_COMPONENT_COMPONENT_H_ #define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_GENERATOR_COMPONENT_COMPONENT_H_ -#include namespace mindspore::lite::micro { extern const char *kInputPrefixName; @@ -26,6 +25,8 @@ constexpr auto kPackWeightOffsetName = "w_offset"; constexpr auto kPackWeightSizeName = "w_size"; extern const char *kBufferPrefixName; extern const char *kBufferPrefixNameAdd; +extern const char *kOffsetPrefixName; +extern const char *kShapePrefixName; void FreeGlobalVariable(); void InitGlobalVariable(int model_index); diff --git a/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/benchmark.cc b/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/benchmark.cc index 91f2ca89..ad638276 100644 --- a/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/benchmark.cc +++ b/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/benchmark.cc @@ -53,7 +53,7 @@ const char benchmark_source[] = R"RAW(/** void usage() { printf( "-- mindspore benchmark params usage:\n" "args[0]: executable file\n" "args[1]: inputs binary file\n" "args[2]: model weight binary file\n" @@ -67,38 +67,38 @@ void usage() { uint64_t GetTimeUs() { const int USEC = 1000000; const int MSEC = 1000; struct timespec ts = {0, 0}; if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) { return 0; } uint64_t retval = (uint64_t)((ts.tv_sec * USEC) + (ts.tv_nsec / MSEC)); return retval; } -void PrintTensorHandle(MSTensorHandle tensor) { - printf("name: %s, ", MSTensorGetName(tensor)); - MSDataType data_type = MSTensorGetDataType(tensor); +void PrintTensorHandle(OH_AI_TensorHandle tensor) { + printf("name: %s, ", OH_AI_TensorGetName(tensor)); + OH_AI_DataType data_type = OH_AI_TensorGetDataType(tensor); printf("DataType: %d, ", data_type); - size_t element_num = (size_t)(MSTensorGetElementNum(tensor)); + size_t element_num = (size_t)(OH_AI_TensorGetElementNum(tensor)); printf("Elements: %zu, ", element_num); printf("Shape: ["); size_t shape_num = 0; - const int64_t *dims = MSTensorGetShape(tensor, &shape_num); + const
int64_t *dims = OH_AI_TensorGetShape(tensor, &shape_num); for (size_t i = 0; i < shape_num; i++) { printf("%d ", (int)dims[i]); } printf("], Data: \n"); - void *data = MSTensorGetMutableData(tensor); + void *data = OH_AI_TensorGetMutableData(tensor); element_num = element_num > 10 ? 10 : element_num; switch (data_type) { - case kMSDataTypeNumberTypeFloat32: { + case OH_AI_DATATYPE_NUMBERTYPE_FLOAT32: { for (size_t i = 0; i < element_num; i++) { printf("%.6f, ", ((float *)data)[i]); } printf("\n"); } break; - case kMSDataTypeNumberTypeFloat16: + case OH_AI_DATATYPE_NUMBERTYPE_FLOAT16: #ifdef ENABLE_FP16 { for (size_t i = 0; i < element_num; i++) { @@ -107,25 +107,25 @@ void PrintTensorHandle(MSTensorHandle tensor) { printf("\n"); } break; #endif - case kMSDataTypeNumberTypeInt16: { + case OH_AI_DATATYPE_NUMBERTYPE_INT16: { for (size_t i = 0; i < element_num; i++) { printf("%" PRId16, ((int16_t *)data)[i]); } printf("\n"); } break; - case kMSDataTypeNumberTypeInt32: { + case OH_AI_DATATYPE_NUMBERTYPE_INT32: { for (size_t i = 0; i < element_num; i++) { printf("%" PRId32, ((int32_t *)data)[i]); } printf("\n"); } break; - case kMSDataTypeNumberTypeInt8: { + case OH_AI_DATATYPE_NUMBERTYPE_INT8: { for (size_t i = 0; i < element_num; i++) { printf("%" PRIi8, ((int8_t *)data)[i]); } printf("\n"); } break; - case kMSDataTypeNumberTypeUInt8: { + case OH_AI_DATATYPE_NUMBERTYPE_UINT8: { for (size_t i = 0; i < element_num; i++) { printf("%u", ((uint8_t *)data)[i]); } @@ -141,31 +141,31 @@ int main(int argc, const char **argv) { if (argc < 2) { printf("input command is invalid\n"); usage(); - return kMSStatusLiteError; + return OH_AI_STATUS_LITE_ERROR; } printf("=======run benchmark======\n"); - MSContextHandle ms_context_handle = MSContextCreate(); + OH_AI_ContextHandle ms_context_handle = OH_AI_ContextCreate(); if (argc >= 6) { int thread_num = atoi(argv[5]); if (thread_num < 1 || thread_num > kMaxThreadNum) { printf("Thread number error! It should be greater than 0 and less than 5\n"); - return kMSStatusLiteParamInvalid; + return OH_AI_STATUS_LITE_PARAM_INVALID; } - MSContextSetThreadNum(ms_context_handle, thread_num); + OH_AI_ContextSetThreadNum(ms_context_handle, thread_num); } - printf("ThreadNum: %d.\n", MSContextGetThreadNum(ms_context_handle)); + printf("ThreadNum: %d.\n", OH_AI_ContextGetThreadNum(ms_context_handle)); int bind_mode = kBindDefault; if (argc >= 7) { bind_mode = atoi(argv[6]); if (bind_mode < 0 || bind_mode > 2) { printf("Thread bind mode error! 
0: No bind, 1: Bind hign cpu, 2: Bind mid cpu.\n"); - return kMSStatusLiteParamInvalid; + return OH_AI_STATUS_LITE_PARAM_INVALID; } } - MSContextSetThreadAffinityMode(ms_context_handle, bind_mode); - printf("BindMode: %d.\n", MSContextGetThreadAffinityMode(ms_context_handle)); + OH_AI_ContextSetThreadAffinityMode(ms_context_handle, bind_mode); + printf("BindMode: %d.\n", OH_AI_ContextGetThreadAffinityMode(ms_context_handle)); void *model_buffer = NULL; int model_size = 0; @@ -174,14 +174,14 @@ int main(int argc, const char **argv) { model_buffer = ReadInputData(argv[2], &model_size); if (model_buffer == NULL) { printf("Read model file failed."); - return kMSStatusLiteParamInvalid; + return OH_AI_STATUS_LITE_PARAM_INVALID; } } - MSModelHandle model_handle = MSModelCreate(); - int ret = MSModelBuild(model_handle, model_buffer, model_size, kMSModelTypeMindIR, ms_context_handle); - MSContextDestroy(&ms_context_handle); - if (ret != kMSStatusSuccess) { - printf("MSModelBuildFromFile failed, ret: %d\n", ret); + OH_AI_ModelHandle model_handle = OH_AI_ModelCreate(); + int ret = OH_AI_ModelBuild(model_handle, model_buffer, model_size, OH_AI_MODELTYPE_MINDIR, ms_context_handle); + OH_AI_ContextDestroy(&ms_context_handle); + if (ret != OH_AI_STATUS_SUCCESS) { + printf("OH_AI_ModelBuild failed, ret: %d\n", ret); free(model_buffer); model_buffer = NULL; return ret; @@ -191,33 +191,33 @@ int main(int argc, const char **argv) { model_buffer = NULL; } // set model inputs tensor data - MSTensorHandleArray inputs_handle = MSModelGetInputs(model_handle); + OH_AI_TensorHandleArray inputs_handle = OH_AI_ModelGetInputs(model_handle); if (inputs_handle.handle_list == NULL) { - printf("MSModelGetInputs failed, ret: %d", ret); + printf("OH_AI_ModelGetInputs failed, ret: %d", ret); return ret; } size_t inputs_num = inputs_handle.handle_num; void *inputs_binbuf[inputs_num]; int inputs_size[inputs_num]; for (size_t i = 0; i < inputs_num; ++i) { - MSTensorHandle tensor = inputs_handle.handle_list[i]; - inputs_size[i] = (int)MSTensorGetDataSize(tensor); + OH_AI_TensorHandle tensor = inputs_handle.handle_list[i]; + inputs_size[i] = (int)OH_AI_TensorGetDataSize(tensor); } ret = ReadInputsFile((char *)(argv[1]), inputs_binbuf, inputs_size, (int)inputs_num); if (ret != 0) { - MSModelDestroy(&model_handle); + OH_AI_ModelDestroy(&model_handle); return ret; } for (size_t i = 0; i < inputs_num; ++i) { - void *input_data = MSTensorGetMutableData(inputs_handle.handle_list[i]); + void *input_data = OH_AI_TensorGetMutableData(inputs_handle.handle_list[i]); memcpy(input_data, inputs_binbuf[i], inputs_size[i]); free(inputs_binbuf[i]); inputs_binbuf[i] = NULL; } - MSTensorHandleArray outputs_handle = MSModelGetOutputs(model_handle); + OH_AI_TensorHandleArray outputs_handle = OH_AI_ModelGetOutputs(model_handle); if (!outputs_handle.handle_list) { - printf("MSModelGetOutputs failed, ret: %d", ret); + printf("OH_AI_ModelGetOutputs failed, ret: %d", ret); return ret; } @@ -226,15 +226,15 @@ int main(int argc, const char **argv) { warm_up_loop_count = atoi(argv[7]); if (warm_up_loop_count < 0) { printf("The warm up loop count error! 
Cannot be less than 0.\n"); - return kMSStatusLiteParamInvalid; + return OH_AI_STATUS_LITE_PARAM_INVALID; } } printf("Running warm up loops..."); for (int i = 0; i < warm_up_loop_count; ++i) { - ret = MSModelPredict(model_handle, inputs_handle, &outputs_handle, NULL, NULL); - if (ret != kMSStatusSuccess) { - MSModelDestroy(&model_handle); - printf("MSModelPredict failed, ret: %d", ret); + ret = OH_AI_ModelPredict(model_handle, inputs_handle, &outputs_handle, NULL, NULL); + if (ret != OH_AI_STATUS_SUCCESS) { + OH_AI_ModelDestroy(&model_handle); + printf("OH_AI_ModelPredict failed, ret: %d", ret); return ret; } } @@ -244,10 +244,10 @@ int main(int argc, const char **argv) { printf("\nloop count: %d\n", loop_count); uint64_t start_time = GetTimeUs(); for (int i = 0; i < loop_count; ++i) { - ret = MSModelPredict(model_handle, inputs_handle, &outputs_handle, NULL, NULL); - if (ret != kMSStatusSuccess) { - MSModelDestroy(&model_handle); - printf("MSModelPredict failed, ret: %d", ret); + ret = OH_AI_ModelPredict(model_handle, inputs_handle, &outputs_handle, NULL, NULL); + if (ret != OH_AI_STATUS_SUCCESS) { + OH_AI_ModelDestroy(&model_handle); + printf("OH_AI_ModelPredict failed, ret: %d", ret); return ret; } } @@ -255,23 +255,23 @@ int main(int argc, const char **argv) { float total_time = (float)(end_time - start_time) / 1000.0f; printf("total time: %.5fms, per time: %.5fms\n", total_time, total_time / loop_count); } - ret = MSModelPredict(model_handle, inputs_handle, &outputs_handle, NULL, NULL); - if (ret != kMSStatusSuccess) { - MSModelDestroy(&model_handle); + ret = OH_AI_ModelPredict(model_handle, inputs_handle, &outputs_handle, NULL, NULL); + if (ret != OH_AI_STATUS_SUCCESS) { + OH_AI_ModelDestroy(&model_handle); return ret; } printf("========run success=======\n"); printf("\noutputs: \n"); for (size_t i = 0; i < outputs_handle.handle_num; i++) { - MSTensorHandle output = outputs_handle.handle_list[i]; + OH_AI_TensorHandle output = outputs_handle.handle_list[i]; PrintTensorHandle(output); } if (argc >= 5) { CalibTensor *calib_tensors; int calib_num = 0; ret = ReadCalibData(argv[4], &calib_tensors, &calib_num); - if (ret != kMSStatusSuccess) { - MSModelDestroy(&model_handle); + if (ret != OH_AI_STATUS_SUCCESS) { + OH_AI_ModelDestroy(&model_handle); return ret; } float cosine_distance_threshold = 0.9999; @@ -279,15 +279,15 @@ int main(int argc, const char **argv) { cosine_distance_threshold = atof(argv[8]); } ret = CompareOutputs(outputs_handle, &calib_tensors, calib_num, cosine_distance_threshold); - if (ret != kMSStatusSuccess) { - MSModelDestroy(&model_handle); + if (ret != OH_AI_STATUS_SUCCESS) { + OH_AI_ModelDestroy(&model_handle); return ret; } FreeCalibTensors(&calib_tensors, calib_num); } printf("========run success=======\n"); - MSModelDestroy(&model_handle); - return kMSStatusSuccess; + OH_AI_ModelDestroy(&model_handle); + return OH_AI_STATUS_SUCCESS; } )RAW"; @@ -385,7 +385,7 @@ int benchmark() { return kMSStatusLiteError; } MSModelSetWorkspace(model_handle, g_WorkSpace, WORK_SPACE_SIZE); - ret = MSModelBuild(model_handle, NULL, 0, kMSModelTypeMindIR, NULL); + ret = OH_AI_ModelBuild(model_handle, NULL, 0, kMSModelTypeMindIR, NULL); if (ret != kMSStatusSuccess) { printf("MSModelBuildFromFile failed, ret : %d.\n", ret); MSModelDestroy(&model_handle); @@ -424,7 +424,7 @@ int benchmark() { } printf("========Infer start=======\n"); - ret = MSModelPredict(model_handle, inputs_handle, &outputs_handle, NULL, NULL); + ret = OH_AI_ModelPredict(model_handle, inputs_handle, &outputs_handle, 
NULL, NULL); if (ret != kMSStatusSuccess) { MSModelDestroy(&model_handle); return ret; diff --git a/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/calib_output.cc b/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/calib_output.cc index 71ca2287..66af9069 100644 --- a/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/calib_output.cc +++ b/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/calib_output.cc @@ -48,7 +48,7 @@ typedef struct CalibTensor { float *data_; } CalibTensor; int ReadCalibData(const char *calib_data_path, CalibTensor **calib_tensots, int *calib_num); -int CompareOutputs(MSTensorHandleArray outputs, CalibTensor **calib_tensors, int calib_num, +int CompareOutputs(OH_AI_TensorHandleArray outputs, CalibTensor **calib_tensors, int calib_num, float cosine_distance_threshold); void FreeCalibTensors(CalibTensor **calib_tensors, int calib_num); @@ -89,12 +89,12 @@ int ReadCalibData(const char *calib_data_path, CalibTensor **calib_tensor_pointe FILE *file = fopen(calib_data_path, "r"); if (!file) { printf("Unable open %s", calib_data_path); - return kMSStatusLiteError; + return OH_AI_STATUS_LITE_ERROR; } CalibTensor *calib_tensors = (CalibTensor *)malloc(kMaxOutput * sizeof(CalibTensor)); if(calib_tensors == NULL) { printf("Malloc calib tensors failed."); - return kMSStatusLiteError; + return OH_AI_STATUS_LITE_ERROR; } // read line by line char line[kMaxTensorSize]; @@ -111,7 +111,7 @@ int ReadCalibData(const char *calib_data_path, CalibTensor **calib_tensor_pointe char* tensor_name = (char *)malloc(strlen(p)+1); if(tensor_name == NULL) { printf("Malloc tensor name failed."); - return kMSStatusLiteError; + return OH_AI_STATUS_LITE_ERROR; } (void)strcpy(tensor_name, p); calib_tensors[*calib_num].tensor_name = tensor_name; @@ -134,7 +134,7 @@ int ReadCalibData(const char *calib_data_path, CalibTensor **calib_tensor_pointe float *data = (float *)malloc(elements * sizeof(float)); if(data == NULL) { printf("Malloc tensor data failed."); - return kMSStatusLiteError; + return OH_AI_STATUS_LITE_ERROR; } p = strtok(line, " "); int k = 0; @@ -152,43 +152,43 @@ int ReadCalibData(const char *calib_data_path, CalibTensor **calib_tensor_pointe } *calib_tensor_pointers = calib_tensors; fclose(file); - return kMSStatusSuccess; + return OH_AI_STATUS_SUCCESS; } -int CompareOutputs(MSTensorHandleArray outputs, CalibTensor **calib_tensors, int calib_num, +int CompareOutputs(OH_AI_TensorHandleArray outputs, CalibTensor **calib_tensors, int calib_num, float cosine_distance_threshold) { if (outputs.handle_num != (size_t)calib_num) { printf("error, outputs and calibs size is mismatch\n"); - return kMSStatusLiteError; + return OH_AI_STATUS_LITE_ERROR; } size_t outputs_num = outputs.handle_num; bool is_success = true; for (size_t i = 0; i < outputs_num; ++i) { MicroTensor *output = (MicroTensor *)outputs.handle_list[i]; if (!output || !output->data) { - return kMSStatusLiteError; + return OH_AI_STATUS_LITE_ERROR; } CalibTensor *calib = calib_tensors[0]; if (!calib || !calib[i].data_) { - return kMSStatusLiteError; + return OH_AI_STATUS_LITE_ERROR; } if (strcmp(output->name, calib[i].tensor_name) != 0) { printf("warning, output tensor name is not equal to calib\n"); } - size_t elements = (size_t)MSTensorGetElementNum(output); + size_t elements = (size_t)OH_AI_TensorGetElementNum(output); if (elements != (size_t)calib[i].elemets_num_) { printf("error, output elements num is not equal to calib\n"); - 
return kMSStatusLiteError; + return OH_AI_STATUS_LITE_ERROR; } float cosin = 0.f, dot = 0.f, normx = 0.f, normy = 0.f; switch (output->type) { - case kMSDataTypeNumberTypeFloat32: { + case OH_AI_DATATYPE_NUMBERTYPE_FLOAT32: { float *float_output = (float *)output->data; for (size_t j = 0; j < elements; ++j) { if (isnan(float_output[j]) || isinf(float_output[j]) || isnan(calib[i].data_[j]) || isinf(calib[i].data_[j])) { printf("error, output data is nan or inf\n"); - return kMSStatusLiteError; + return OH_AI_STATUS_LITE_ERROR; } dot += float_output[j] * calib[i].data_[j]; normx += float_output[j] * float_output[j]; @@ -196,7 +196,7 @@ int CompareOutputs(MSTensorHandleArray outputs, CalibTensor **calib_tensors, int } break; } - case kMSDataTypeNumberTypeInt8: { + case OH_AI_DATATYPE_NUMBERTYPE_INT8: { int8_t *int_output = (int8_t *)output->data; for (size_t j = 0; j < elements; ++j) { dot += (float) (int_output[j] * calib[i].data_[j]); @@ -205,7 +205,7 @@ int CompareOutputs(MSTensorHandleArray outputs, CalibTensor **calib_tensors, int } break; } - case kMSDataTypeNumberTypeUInt8: { + case OH_AI_DATATYPE_NUMBERTYPE_UINT8: { uint8_t *int_output = (uint8_t *)output->data; for (size_t j = 0; j < elements; ++j) { dot += (float) (int_output[j] * calib[i].data_[j]); @@ -214,8 +214,8 @@ int CompareOutputs(MSTensorHandleArray outputs, CalibTensor **calib_tensors, int } break; } - case kMSDataTypeNumberTypeInt32: - case kMSDataTypeNumberTypeUInt32: { + case OH_AI_DATATYPE_NUMBERTYPE_INT32: + case OH_AI_DATATYPE_NUMBERTYPE_UINT32: { int32_t *int_output = (int32_t *)output->data; for (size_t j = 0; j < elements; ++j) { dot += (float) (int_output[j] * calib[i].data_[j]); @@ -238,10 +238,10 @@ int CompareOutputs(MSTensorHandleArray outputs, CalibTensor **calib_tensors, int } if (!is_success) { printf("compare outputs failed.\n"); - return kMSStatusLiteError; + return OH_AI_STATUS_LITE_ERROR; } printf("compare outputs success.\n"); - return kMSStatusSuccess; + return OH_AI_STATUS_SUCCESS; } void FreeCalibTensors(CalibTensor **calib_tensors_pointers, int calib_num) { @@ -328,7 +328,7 @@ const char *calib_source_cortex = R"RAW(/** int LoadCalibInputs(MSTensorHandleArray *inputs, TensorArray *tensor_array) { if (inputs->handle_num != tensor_array->tensors_size_) { printf("error, inputs and calibs size is mismatch.\n"); - return kMSStatusLiteError; + return OH_AI_STATUS_LITE_ERROR; } Tensor *calib_tensors = tensor_array->tensors_; if (calib_tensors == NULL) { diff --git a/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/cmake_lists.cc b/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/cmake_lists.cc index 79bfc485..f63e6f9e 100644 --- a/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/cmake_lists.cc +++ b/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/cmake_lists.cc @@ -127,9 +127,9 @@ if("${CMAKE_BUILD_TYPE}" STREQUAL "Debug") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=default") else() message(STATUS "build benchmark release version") - set(CMAKE_C_FLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O3 -Wall -Werror -fstack-protector-strong -Wno-attributes \ + set(CMAKE_C_FLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O3 -Wall -fstack-protector-strong -Wno-attributes \ -Wno-deprecated-declarations -Wno-missing-braces ${CMAKE_C_FLAGS}") - set(CMAKE_CXX_FLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O3 -Wall -Werror -fstack-protector-strong -Wno-attributes \ + set(CMAKE_CXX_FLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O3 
-Wall -fstack-protector-strong -Wno-attributes \ -Wno-deprecated-declarations -Wno-missing-braces -Wno-overloaded-virtual ${CMAKE_CXX_FLAGS}") string(REPLACE "-g" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") string(REPLACE "-g" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") @@ -211,9 +211,9 @@ if("${CMAKE_BUILD_TYPE}" STREQUAL "Debug") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=default") else() message(STATUS "build net library release version") - set(CMAKE_C_FLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O3 -Wall -Werror -fstack-protector-strong -Wno-attributes \ + set(CMAKE_C_FLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O3 -Wall -fstack-protector-strong -Wno-attributes \ -Wno-deprecated-declarations -Wno-missing-braces ${CMAKE_C_FLAGS}") - set(CMAKE_CXX_FLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O3 -Wall -Werror -fstack-protector-strong -Wno-attributes \ + set(CMAKE_CXX_FLAGS "-fPIC -fPIE -D_FORTIFY_SOURCE=2 -O3 -Wall -fstack-protector-strong -Wno-attributes \ -Wno-deprecated-declarations -Wno-missing-braces -Wno-overloaded-virtual ${CMAKE_CXX_FLAGS}") string(REPLACE "-g" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") string(REPLACE "-g" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") @@ -241,11 +241,11 @@ function(create_library) endforeach() add_custom_command(TARGET net POST_BUILD - COMMAND ar cr ${library_name} *.o + COMMAND ar cr ${library_name} *.obj COMMAND ranlib ${library_name} COMMAND echo "new static library ${library_name} size:" COMMAND ls -lh ${library_name} - COMMAND rm -rf tmp && rm -rf *.o + COMMAND rm -rf tmp && rm -rf *.obj COMMENT "generate specified static library ${library_name}" ) endfunction(create_library) diff --git a/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/load_input.cc b/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/load_input.cc index 9a2aeaa7..669cd8c1 100644 --- a/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/load_input.cc +++ b/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/load_input.cc @@ -131,7 +131,7 @@ int ReadInputsFile(char *path, void **buffers, const int *inputs_size, int input while ((token = strtok_r(path, delim, &path))) { if (i >= inputs_num) { printf("inputs num is error, need: %d\n", inputs_num); - return kMSStatusLiteParamInvalid; + return OH_AI_STATUS_LITE_PARAM_INVALID; } inputs_path[i] = token; printf("input %d: %s\n", i, inputs_path[i]); @@ -144,7 +144,7 @@ int ReadInputsFile(char *path, void **buffers, const int *inputs_size, int input if (size != inputs_size[i] || buffers[i] == NULL) { printf("size mismatch, %s, input: %d, needed: %d\n", inputs_path[i], size, inputs_size[i]); free(buffers[i]); - return kMSStatusLiteError; + return OH_AI_STATUS_LITE_ERROR; } } return 0; diff --git a/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/mcontext.cc b/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/mcontext.cc index 856de855..d662e3a8 100644 --- a/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/mcontext.cc +++ b/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/mcontext.cc @@ -73,24 +73,24 @@ const char context_source_cortex[] = R"RAW( #include #include -MSContextHandle MSContextCreate() { +OH_AI_ContextHandle OH_AI_ContextCreate() { return NULL; } -void MSContextDestroy(MSContextHandle *context) { +void OH_AI_ContextDestroy(OH_AI_ContextHandle *context) { } -void MSContextSetThreadNum(MSContextHandle context, int32_t thread_num) { +void 
OH_AI_ContextSetThreadNum(OH_AI_ContextHandle context, int32_t thread_num) { } -int32_t MSContextGetThreadNum(const MSContextHandle context) { +int32_t OH_AI_ContextGetThreadNum(const OH_AI_ContextHandle context) { return 1; } -void MSContextSetThreadAffinityMode(MSContextHandle context, int mode) { +void OH_AI_ContextSetThreadAffinityMode(OH_AI_ContextHandle context, int mode) { } -int MSContextGetThreadAffinityMode(const MSContextHandle context) { +int OH_AI_ContextGetThreadAffinityMode(const OH_AI_ContextHandle context) { return 0; } )RAW"; @@ -116,7 +116,7 @@ const char context_source_no_parallel[] = R"RAW( #include #include -MSContextHandle MSContextCreate() { +OH_AI_ContextHandle OH_AI_ContextCreate() { MicroContext *micro_context = (MicroContext *)malloc(sizeof(MicroContext)); if (micro_context == NULL) { return NULL; @@ -129,7 +129,7 @@ MSContextHandle MSContextCreate() { return micro_context; } -void MSContextDestroy(MSContextHandle *context) { +void OH_AI_ContextDestroy(OH_AI_ContextHandle *context) { MicroContext *micro_context = (MicroContext *)(*context); if (micro_context) { free(micro_context); @@ -137,17 +137,17 @@ void MSContextDestroy(MSContextHandle *context) { } } -void MSContextSetThreadNum(MSContextHandle context, int32_t thread_num) { +void OH_AI_ContextSetThreadNum(OH_AI_ContextHandle context, int32_t thread_num) { } -int32_t MSContextGetThreadNum(const MSContextHandle context) { +int32_t OH_AI_ContextGetThreadNum(const OH_AI_ContextHandle context) { return 1; } -void MSContextSetThreadAffinityMode(MSContextHandle context, int mode) { +void OH_AI_ContextSetThreadAffinityMode(OH_AI_ContextHandle context, int mode) { } -int MSContextGetThreadAffinityMode(const MSContextHandle context) { +int OH_AI_ContextGetThreadAffinityMode(const OH_AI_ContextHandle context) { return 0; } )RAW"; @@ -176,7 +176,7 @@ const char context_source[] = R"RAW( #define MAX_THREAD_NUM 4 -MSContextHandle MSContextCreate() { +OH_AI_ContextHandle OH_AI_ContextCreate() { MicroContext *micro_context = (MicroContext *)malloc(sizeof(MicroContext)); if (micro_context == NULL) { return NULL; @@ -189,7 +189,7 @@ MSContextHandle MSContextCreate() { return micro_context; } -void MSContextDestroy(MSContextHandle *context) { +void OH_AI_ContextDestroy(OH_AI_ContextHandle *context) { MicroContext *micro_context = (MicroContext *)(*context); if (micro_context) { if (micro_context->affinity_core_list_) { @@ -201,7 +201,7 @@ void MSContextDestroy(MSContextHandle *context) { } } -void MSContextSetThreadNum(MSContextHandle context, int32_t thread_num) { +void OH_AI_ContextSetThreadNum(OH_AI_ContextHandle context, int32_t thread_num) { MicroContext *micro_context = (MicroContext *)context; if (micro_context) { int core_num = GetCpuCoreNum(); @@ -214,7 +214,7 @@ void MSContextSetThreadNum(MSContextHandle context, int32_t thread_num) { } } -int32_t MSContextGetThreadNum(const MSContextHandle context) { +int32_t OH_AI_ContextGetThreadNum(const OH_AI_ContextHandle context) { MicroContext *micro_context = (MicroContext *)context; if (micro_context) { return micro_context->thread_num_; @@ -222,7 +222,7 @@ int32_t MSContextGetThreadNum(const MSContextHandle context) { return 0; } -void MSContextSetThreadAffinityMode(MSContextHandle context, int mode) { +void OH_AI_ContextSetThreadAffinityMode(OH_AI_ContextHandle context, int mode) { MicroContext *micro_context = (MicroContext *)context; if (micro_context) { if (mode >= 0 && mode <= 2) { @@ -233,7 +233,7 @@ void MSContextSetThreadAffinityMode(MSContextHandle context, int 
mode) { } } -int MSContextGetThreadAffinityMode(const MSContextHandle context) { +int OH_AI_ContextGetThreadAffinityMode(const OH_AI_ContextHandle context) { MicroContext *micro_context = (MicroContext *)context; if (micro_context) { return micro_context->affinity_mode; diff --git a/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/msession.cc b/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/msession.cc index 44273071..5cbe4507 100644 --- a/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/msession.cc +++ b/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/msession.cc @@ -18,25 +18,25 @@ namespace mindspore::lite::micro { const char model_runtime_other_source[] = R"RAW( -MSTensorHandleArray MSModelGetInputs(const MSModelHandle model) { +OH_AI_TensorHandleArray OH_AI_ModelGetInputs(const OH_AI_ModelHandle model) { MicroModel *micro_model = (MicroModel *)model; if (micro_model == NULL) { - MSTensorHandleArray tmp = {0, NULL}; + OH_AI_TensorHandleArray tmp = {0, NULL}; return tmp; } return micro_model->inputs; } -MSTensorHandleArray MSModelGetOutputs(const MSModelHandle model) { +OH_AI_TensorHandleArray OH_AI_ModelGetOutputs(const OH_AI_ModelHandle model) { MicroModel *micro_model = (MicroModel *)model; if (micro_model == NULL) { - MSTensorHandleArray tmp = {0, NULL}; + OH_AI_TensorHandleArray tmp = {0, NULL}; return tmp; } return micro_model->outputs; } -MSTensorHandle MSModelGetInputByTensorName(const MSModelHandle model, const char *tensor_name) { +OH_AI_TensorHandle OH_AI_ModelGetInputByTensorName(const OH_AI_ModelHandle model, const char *tensor_name) { MicroModel *micro_model = (MicroModel *)model; if (micro_model == NULL || micro_model->inputs.handle_list == NULL) { return NULL; @@ -53,7 +53,7 @@ MSTensorHandle MSModelGetInputByTensorName(const MSModelHandle model, const char return NULL; } -MSTensorHandle MSModelGetOutputByTensorName(const MSModelHandle model, const char *tensor_name) { +OH_AI_TensorHandle OH_AI_ModelGetOutputByTensorName(const OH_AI_ModelHandle model, const char *tensor_name) { MicroModel *micro_model = (MicroModel *)model; if (micro_model == NULL || micro_model->outputs.handle_list == NULL) { return NULL; @@ -70,9 +70,16 @@ MSTensorHandle MSModelGetOutputByTensorName(const MSModelHandle model, const cha return NULL; } -MSStatus MSModelResize(MSModelHandle model, const MSTensorHandleArray inputs, MSShapeInfo *shape_infos, +OH_AI_Status OH_AI_ModelResize(OH_AI_ModelHandle model, const OH_AI_TensorHandleArray inputs, OH_AI_ShapeInfo *shape_infos, size_t shape_info_num) { - return kMSStatusLiteNotSupport; + MicroModel *micro_model = (MicroModel *)model; + if (micro_model == NULL) { + return OH_AI_STATUS_LITE_NULLPTR; + } + if (micro_model->resize == NULL) { + return OH_AI_STATUS_LITE_NULLPTR; + } + return micro_model->resize(model, inputs, shape_infos, shape_info_num); } )RAW"; diff --git a/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/mtensor.cc b/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/mtensor.cc index b125b31d..e4581829 100644 --- a/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/mtensor.cc +++ b/mindspore/lite/tools/converter/micro/coder/generator/component/const_blocks/mtensor.cc @@ -46,8 +46,8 @@ const char tensor_header[] = R"RAW( #endif typedef struct { - enum MSDataType type; - enum MSFormat format; + enum OH_AI_DataType type; + enum OH_AI_Format format; char 
*name; int ndim; int64_t *shape; @@ -76,7 +76,7 @@ enum TypeTransMode { TypeTransMode_MAX = TypeTransMode_UNSUPPORT }; -void *TransformInput(MSTensorHandle tensor, int expect_type, bool *type_changed); +void *TransformInput(OH_AI_TensorHandle tensor, int expect_type, bool *type_changed); #ifdef ENABLE_FP16 void Fp32CastToFp16(const float *input, float16_t *output, int number); @@ -109,37 +109,37 @@ const char tensor_source[] = R"RAW( #include "string.h" #include "tensor.h" -size_t DataTypeSize(const MSDataType type) { +size_t DataTypeSize(const OH_AI_DataType type) { switch (type) { - case kMSDataTypeNumberTypeFloat64: + case OH_AI_DATATYPE_NUMBERTYPE_FLOAT64: return sizeof(double); - case kMSDataTypeNumberTypeFloat32: + case OH_AI_DATATYPE_NUMBERTYPE_FLOAT32: return sizeof(float); - case kMSDataTypeNumberTypeInt8: + case OH_AI_DATATYPE_NUMBERTYPE_INT8: return sizeof(int8_t); - case kMSDataTypeNumberTypeUInt8: + case OH_AI_DATATYPE_NUMBERTYPE_UINT8: return sizeof(uint8_t); - case kMSDataTypeNumberTypeFloat16: - case kMSDataTypeNumberTypeInt16: + case OH_AI_DATATYPE_NUMBERTYPE_FLOAT16: + case OH_AI_DATATYPE_NUMBERTYPE_INT16: return sizeof(int16_t); - case kMSDataTypeNumberTypeInt32: + case OH_AI_DATATYPE_NUMBERTYPE_INT32: return sizeof(int32_t); - case kMSDataTypeNumberTypeInt64: + case OH_AI_DATATYPE_NUMBERTYPE_INT64: return sizeof(int64_t); - case kMSDataTypeNumberTypeUInt16: + case OH_AI_DATATYPE_NUMBERTYPE_UINT16: return sizeof(uint16_t); - case kMSDataTypeNumberTypeUInt32: + case OH_AI_DATATYPE_NUMBERTYPE_UINT32: return sizeof(uint32_t); - case kMSDataTypeNumberTypeUInt64: + case OH_AI_DATATYPE_NUMBERTYPE_UINT64: return sizeof(uint64_t); - case kMSDataTypeObjectTypeString: + case OH_AI_DATATYPE_OBJECTTYPE_STRING: return sizeof(char); default: return 0; } } -MSTensorHandle MSTensorCreate(const char *name, MSDataType type, const int64_t *shape, size_t shape_num, +OH_AI_TensorHandle OH_AI_TensorCreate(const char *name, OH_AI_DataType type, const int64_t *shape, size_t shape_num, const void *data, size_t data_len) { size_t data_type_len = DataTypeSize(type); size_t acc_sum = 1; @@ -160,16 +160,16 @@ MSTensorHandle MSTensorCreate(const char *name, MSDataType type, const int64_t * memcpy(micro_tensor->data, data, data_len); micro_tensor->shape = malloc(shape_num * sizeof(int64_t)); memcpy(micro_tensor->shape, shape, shape_num * sizeof(int64_t)); - micro_tensor->format = kMSFormatNHWC; + micro_tensor->format = OH_AI_FORMAT_NHWC; return micro_tensor; } -void MSTensorDestroy(MSTensorHandle *tensor) { +void OH_AI_TensorDestroy(OH_AI_TensorHandle *tensor) { MicroTensor* micro_tensor = (MicroTensor*)(*tensor); free(micro_tensor); } -void MSTensorSetName(MSTensorHandle tensor, const char *name) { +void OH_AI_TensorSetName(OH_AI_TensorHandle tensor, const char *name) { MicroTensor* micro_tensor = (MicroTensor*)(tensor); if(micro_tensor->name != NULL) { free(micro_tensor->name); @@ -179,10 +179,10 @@ void MSTensorSetName(MSTensorHandle tensor, const char *name) { memcpy(micro_tensor->name, name, len + 1); } -MSTensorHandle MSTensorClone(MSTensorHandle tensor) { +OH_AI_TensorHandle OH_AI_TensorClone(OH_AI_TensorHandle tensor) { MicroTensor* micro_tensor = (MicroTensor*)(tensor); MicroTensor *clone_tensor = malloc( sizeof(MicroTensor)); - size_t tensor_data_size = MSTensorGetDataSize(micro_tensor); + size_t tensor_data_size = OH_AI_TensorGetDataSize(micro_tensor); clone_tensor->data = malloc(tensor_data_size); clone_tensor->owned = true; memcpy(clone_tensor->data,micro_tensor->data,tensor_data_size); 
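
The mtensor.cc hunks above and below rename the generated C tensor API from the MS* / kMSDataType* identifiers to the OpenHarmony-style OH_AI_* identifiers while keeping the signatures intact. A minimal caller-side sketch, assuming the renamed entry points keep the semantics of their former MS* counterparts; the header name, main() harness, tensor name, shape and data values here are illustrative only and not part of the patch:

#include <stdint.h>
#include <stdio.h>
#include "tensor.h"  /* assumed: generated micro header declaring the OH_AI_* tensor API */

int main(void) {
  /* Create a 1x4 float32 tensor through the renamed C API. */
  int64_t shape[] = {1, 4};
  float data[] = {0.1f, 0.2f, 0.3f, 0.4f};
  OH_AI_TensorHandle t = OH_AI_TensorCreate("in_0", OH_AI_DATATYPE_NUMBERTYPE_FLOAT32,
                                            shape, 2, data, sizeof(data));
  if (t == NULL) {
    return 1;
  }
  /* Element count and byte size are derived from the shape and data type. */
  printf("elements: %lld, bytes: %zu\n",
         (long long)OH_AI_TensorGetElementNum(t), OH_AI_TensorGetDataSize(t));
  OH_AI_TensorDestroy(&t);
  return 0;
}

Existing callers only need the mechanical MS* -> OH_AI_* rename; the argument lists and return types shown in these hunks are unchanged.
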
@@ -195,26 +195,26 @@ MSTensorHandle MSTensorClone(MSTensorHandle tensor) { clone_tensor->shape = clone_shape; char* clone_name = malloc(strlen(micro_tensor->name)); strcpy(clone_name,micro_tensor->name); - clone_tensor->format = kMSFormatNHWC; + clone_tensor->format = OH_AI_FORMAT_NHWC; return clone_tensor; } -const char *MSTensorGetName(const MSTensorHandle tensor) { +const char *OH_AI_TensorGetName(const OH_AI_TensorHandle tensor) { MicroTensor* micro_tensor = (MicroTensor*)(tensor); return micro_tensor->name; } -void MSTensorSetDataType(MSTensorHandle tensor, MSDataType type) { +void OH_AI_TensorSetDataType(OH_AI_TensorHandle tensor, OH_AI_DataType type) { MicroTensor* micro_tensor = (MicroTensor*)(tensor); micro_tensor->type = type; } -MSDataType MSTensorGetDataType(const MSTensorHandle tensor) { +OH_AI_DataType OH_AI_TensorGetDataType(const OH_AI_TensorHandle tensor) { MicroTensor* micro_tensor = (MicroTensor*)(tensor); return micro_tensor->type; } -void MSTensorSetShape(MSTensorHandle tensor, const int64_t *shape, size_t shape_num) { +void OH_AI_TensorSetShape(OH_AI_TensorHandle tensor, const int64_t *shape, size_t shape_num) { MicroTensor* micro_tensor = (MicroTensor*)(tensor); if(micro_tensor->shape != NULL) { free(micro_tensor->shape); @@ -224,23 +224,23 @@ void MSTensorSetShape(MSTensorHandle tensor, const int64_t *shape, size_t shape_ memcpy(micro_tensor->shape, shape, shape_num * sizeof(int64_t)); } -const int64_t *MSTensorGetShape(const MSTensorHandle tensor, size_t *shape_num) { +const int64_t *OH_AI_TensorGetShape(const OH_AI_TensorHandle tensor, size_t *shape_num) { MicroTensor* micro_tensor = (MicroTensor*)(tensor); *shape_num = micro_tensor->ndim; return micro_tensor->shape; } -void MSTensorSetFormat(MSTensorHandle tensor, MSFormat format) { +void OH_AI_TensorSetFormat(OH_AI_TensorHandle tensor, OH_AI_Format format) { MicroTensor* micro_tensor = (MicroTensor*)(tensor); micro_tensor->format = format; } -MSFormat MSTensorGetFormat(const MSTensorHandle tensor) { +OH_AI_Format OH_AI_TensorGetFormat(const OH_AI_TensorHandle tensor) { MicroTensor* micro_tensor = (MicroTensor*)(tensor); return micro_tensor->format; } -void MSTensorSetData(MSTensorHandle tensor, void *data) { +void OH_AI_TensorSetData(OH_AI_TensorHandle tensor, void *data) { MicroTensor* micro_tensor = (MicroTensor*)(tensor); if (micro_tensor->data == data) { return; @@ -254,23 +254,23 @@ void MSTensorSetData(MSTensorHandle tensor, void *data) { micro_tensor->data = data; } -const void *MSTensorGetData(const MSTensorHandle tensor) { +const void *OH_AI_TensorGetData(const OH_AI_TensorHandle tensor) { MicroTensor* micro_tensor = (MicroTensor*)(tensor); return micro_tensor->data; } -void *MSTensorGetMutableData(const MSTensorHandle tensor) { +void *OH_AI_TensorGetMutableData(const OH_AI_TensorHandle tensor) { MicroTensor* micro_tensor = (MicroTensor*)(tensor); if(micro_tensor->data) { return micro_tensor->data; } - void* data = malloc(MSTensorGetDataSize(tensor)); + void* data = malloc(OH_AI_TensorGetDataSize(tensor)); micro_tensor->owned = true; micro_tensor->data = data; return data; } -int64_t MSTensorGetElementNum(const MSTensorHandle tensor) { +int64_t OH_AI_TensorGetElementNum(const OH_AI_TensorHandle tensor) { MicroTensor* micro_tensor = (MicroTensor*)(tensor); int64_t acc_sum = 1; for(int i=0;i< micro_tensor->ndim;i++) { @@ -279,10 +279,10 @@ int64_t MSTensorGetElementNum(const MSTensorHandle tensor) { return acc_sum; } -size_t MSTensorGetDataSize(const MSTensorHandle tensor) { +size_t 
OH_AI_TensorGetDataSize(const OH_AI_TensorHandle tensor) { MicroTensor* micro_tensor = (MicroTensor*)(tensor); size_t data_type_size = DataTypeSize(micro_tensor->type); - int64_t elements = MSTensorGetElementNum(tensor); + int64_t elements = OH_AI_TensorGetElementNum(tensor); return data_type_size * elements; } @@ -300,16 +300,16 @@ void Fp16CastToFp32(const float16_t *input, float *output, int number) { } #endif -void *TransformInput(MSTensorHandle tensor, int expect_type, bool *type_changed) { +void *TransformInput(OH_AI_TensorHandle tensor, int expect_type, bool *type_changed) { MicroTensor* micro_tensor = (MicroTensor*)(tensor); int cur_type = micro_tensor->type; if (cur_type == expect_type) { return micro_tensor->data; } int type_trans_mode = TypeTransMode_MAX; - if (expect_type == kMSDataTypeNumberTypeFloat16 && cur_type == kMSDataTypeNumberTypeFloat32) { + if (expect_type == OH_AI_DATATYPE_NUMBERTYPE_FLOAT16 && cur_type == OH_AI_DATATYPE_NUMBERTYPE_FLOAT32) { type_trans_mode = TypeTransMode_FP32_TO_FP16; - } else if (expect_type == kMSDataTypeNumberTypeFloat32 && cur_type == kMSDataTypeNumberTypeFloat16) { + } else if (expect_type == OH_AI_DATATYPE_NUMBERTYPE_FLOAT32 && cur_type == OH_AI_DATATYPE_NUMBERTYPE_FLOAT16) { type_trans_mode = TypeTransMode_FP16_TO_FP32; } if (type_trans_mode == TypeTransMode_UNSUPPORT) { diff --git a/mindspore/lite/tools/converter/micro/coder/generator/component/weight_component.cc b/mindspore/lite/tools/converter/micro/coder/generator/component/weight_component.cc index ac958750..6a131b52 100644 --- a/mindspore/lite/tools/converter/micro/coder/generator/component/weight_component.cc +++ b/mindspore/lite/tools/converter/micro/coder/generator/component/weight_component.cc @@ -61,6 +61,8 @@ void CodeWeightFileHeader(std::ofstream &ofs, const std::unique_ptr\n" << "extern unsigned char *" << ctx->buffer_name() << ";\n" << "extern uint8_t *" << ctx->weight_name() << ";\n" + << "extern int *" << kShapePrefixName << ";\n" + << "extern int *" << kOffsetPrefixName << ";\n" << "enum STATUS {\n" " RET_OK = 0,\n" " RET_ERROR = 1,\n" diff --git a/mindspore/lite/tools/converter/micro/coder/generator/generator.cc b/mindspore/lite/tools/converter/micro/coder/generator/generator.cc index dd66c333..23009e17 100644 --- a/mindspore/lite/tools/converter/micro/coder/generator/generator.cc +++ b/mindspore/lite/tools/converter/micro/coder/generator/generator.cc @@ -43,20 +43,28 @@ const char micro_model_define_source[] = R"RAW( typedef struct { void *runtime_buffer; bool train_mode; // true: train mode, false: eval mode - MSTensorHandleArray inputs; - MSTensorHandleArray outputs; + OH_AI_TensorHandleArray inputs; + OH_AI_TensorHandleArray outputs; ModelBuild build; + ModelResize resize; ModelSetWorkspace set_work_space; ModelCalcWorkspaceSize calc_work_space; FreeResource free_resource; )RAW"; const char set_workspace_state[] = R"RAW( -typedef void (*ModelSetWorkspace)(MSModelHandle model, void *workspace, size_t workspace_size); +typedef void (*ModelSetWorkspace)(OH_AI_ModelHandle model, void *workspace, size_t workspace_size); )RAW"; const char calc_workspace_state[] = R"RAW( -typedef size_t (*ModelCalcWorkspaceSize)(MSModelHandle model); +typedef size_t (*ModelCalcWorkspaceSize)(OH_AI_ModelHandle model); +)RAW"; + +const char model_resize[] = R"RAW( +typedef OH_AI_Status (*ModelResize)(OH_AI_ModelHandle model, + const OH_AI_TensorHandleArray inputs, + OH_AI_ShapeInfo *shape_infos, + size_t shape_info_num); )RAW"; int WriteContentToFile(const std::string &file, const 
std::string &content) { @@ -311,6 +319,7 @@ int Generator::CodeCommonModelFile() { CodeFreeResourceState(hofs); hofs << set_workspace_state; hofs << calc_workspace_state; + hofs << model_resize; hofs << micro_model_define_source; if (config_->code_mode() == CodeMode::Inference) { hofs << " ModelPredict predict;\n"; @@ -321,7 +330,7 @@ int Generator::CodeCommonModelFile() { } hofs << "} MicroModel;\n"; - hofs << "void MSTensorHandleArrayDestroy(MSTensorHandleArray inputs);\n"; + hofs << "void MSTensorHandleArrayDestroy(OH_AI_TensorHandleArray inputs);\n"; hofs << "#endif // MINDSPORE_LITE_MICRO_LIBRARY_SOURCE_MODEL_H_\n\n"; // model source file @@ -340,7 +349,7 @@ int Generator::CodeCommonModelFile() { if (config_->support_parallel()) { cofs << "#include \"" << kThreadWrapper << "\"\n"; } - if (config_->target() != kCortex_M) { + if (config_->target() != kCortex_M && !config_->dynamic_shape()) { cofs << "#include \"src/allocator.h\"\n"; } CodeMSModelCalcWorkspaceSize(cofs, ctx_, *config_); @@ -369,7 +378,7 @@ int Generator::CodeModelHandleHFile() { "#define MINDSPORE_LITE_MICRO_LIBRARY_INCLUDE_MODEL_HANDLE_H_\n\n" << "#include \"c_api/model_c.h\"\n\n"; for (int i = 0; i <= ctx_->GetCurModelIndex(); ++i) { - ofs << "extern MSModelHandle model" << std::to_string(i) << "; // " << ctx_->model_name() << "\n"; + ofs << "extern OH_AI_ModelHandle model" << std::to_string(i) << "; // " << ctx_->model_name() << "\n"; } ofs << "\n#endif // MINDSPORE_LITE_MICRO_LIBRARY_INCLUDE_MODEL_HANDLE_H_\n"; return RET_OK; @@ -386,7 +395,7 @@ int Generator::CodeMSModelImplement() { ofs << "#include \"c_api/model_c.h\"\n"; ofs << "#include \"src/model.h\"\n"; ofs << "#include \"src/model" << ctx_->GetCurModelIndex() << "/" << net_inc_hfile_ << "\"\n"; - if (config_->target() != kCortex_M) { + if (config_->target() != kCortex_M && !config_->dynamic_shape()) { ofs << "#include \"src/allocator.h\"\n"; } if (config_->support_parallel()) { @@ -399,33 +408,37 @@ int Generator::CodeMSModelImplement() { ofs << "#define GRAPH_OUTPUTS_SIZE " << ctx_->graph_outputs().size() << "\n"; ofs << "#define WEIGHT_BUF_SIZE " << ctx_->weight_buffer_size() << "\n"; } - ofs << "MSStatus MSModelBuild" << ctx_->GetCurModelIndex() << "(MSModelHandle model, const void *model_data,\n" - << " size_t data_size, const MSContextHandle model_context);\n"; + ofs << "OH_AI_Status OH_AI_ModelBuild" << ctx_->GetCurModelIndex() << "(OH_AI_ModelHandle model, const void *model_data,\n" + << " size_t data_size, const OH_AI_ContextHandle model_context);\n"; + ofs << "OH_AI_Status OH_AI_ModelResize" << ctx_->GetCurModelIndex() << "(OH_AI_ModelHandle model, \n" + << " const OH_AI_TensorHandleArray inputs, OH_AI_ShapeInfo *shape_infos, size_t shape_info_num);\n"; if (config_->code_mode() == CodeMode::Inference) { - ofs << "MSStatus MSModelPredict" << ctx_->GetCurModelIndex() - << "(MSModelHandle model, const MSTensorHandleArray inputs,\n" - << " MSTensorHandleArray *output,\n" - << " const MSKernelCallBackC before,\n" - << " const MSKernelCallBackC after);\n"; + ofs << "OH_AI_Status OH_AI_ModelPredict" << ctx_->GetCurModelIndex() + << "(OH_AI_ModelHandle model, const OH_AI_TensorHandleArray inputs,\n" + << " OH_AI_TensorHandleArray *output,\n" + << " const OH_AI_KernelCallBack before,\n" + << " const OH_AI_KernelCallBack after);\n"; } else { - ofs << "MSStatus MSModelRunStep" << ctx_->GetCurModelIndex() - << "(MSModelHandle model,\n" - " const MSKernelCallBackC before,\n" - " const MSKernelCallBackC after);\n"; - ofs << "MSStatus MSModelSetTrainMode" << 
ctx_->GetCurModelIndex() << "(MSModelHandle model, bool train);\n"; - ofs << "MSStatus MSModelExportWeight" << ctx_->GetCurModelIndex() - << "(MSModelHandle model, const char *export_path);\n"; - } + ofs << "OH_AI_Status MSModelRunStep" << ctx_->GetCurModelIndex() + << "(OH_AI_ModelHandle model,\n" + " const OH_AI_KernelCallBack before,\n" + " const OH_AI_KernelCallBack after);\n"; + ofs << "OH_AI_Status MSModelSetTrainMode" << ctx_->GetCurModelIndex() << "(OH_AI_ModelHandle model, bool train);\n"; + ofs << "OH_AI_Status MSModelExportWeight" << ctx_->GetCurModelIndex() + << "(OH_AI_ModelHandle model, const char *export_path);\n"; + } + ofs << "void Reset" << ctx_->GetCurModelIndex() << "();\n"; ofs << "void MSModelSetWorkspace" << ctx_->GetCurModelIndex() - << "(MSModelHandle model, void *workspace, size_t workspace_size);\n"; - ofs << "size_t MSModelCalcWorkspaceSize" << ctx_->GetCurModelIndex() << "(MSModelHandle model);\n"; + << "(OH_AI_ModelHandle model, void *workspace, size_t workspace_size);\n"; + ofs << "size_t MSModelCalcWorkspaceSize" << ctx_->GetCurModelIndex() << "(OH_AI_ModelHandle model);\n"; ofs << "static MicroModel gModel" << ctx_->GetCurModelIndex() << " = {.runtime_buffer = NULL,\n" << " .train_mode = false,\n" << " .inputs = {" << ctx_->graph_inputs().size() << ", NULL},\n" << " .outputs = {" << ctx_->graph_outputs().size() << ", NULL},\n" - << " .build = MSModelBuild" << ctx_->GetCurModelIndex() << ",\n"; + << " .build = OH_AI_ModelBuild" << ctx_->GetCurModelIndex() << ",\n" + << " .resize = OH_AI_ModelResize" << ctx_->GetCurModelIndex() << ",\n"; if (config_->code_mode() == CodeMode::Inference) { - ofs << " .predict = MSModelPredict" << ctx_->GetCurModelIndex() << ",\n"; + ofs << " .predict = OH_AI_ModelPredict" << ctx_->GetCurModelIndex() << ",\n"; } else { ofs << " .run_step = MSModelRunStep" << ctx_->GetCurModelIndex() << ",\n" << " .set_train_mode = MSModelSetTrainMode" << ctx_->GetCurModelIndex() << ",\n" @@ -439,11 +452,16 @@ int Generator::CodeMSModelImplement() { ofs << " .set_work_space = NULL,\n" << " .calc_work_space = NULL,\n"; } - ofs << " .free_resource = FreeResource" << ctx_->GetCurModelIndex() << "};\n"; - ofs << "MSModelHandle model" << ctx_->GetCurModelIndex() << " = &gModel" << ctx_->GetCurModelIndex() << ";\n\n"; - + ofs << " .free_resource = Reset" << ctx_->GetCurModelIndex() << "};\n"; + ofs << "OH_AI_ModelHandle model" << ctx_->GetCurModelIndex() << " = &gModel" << ctx_->GetCurModelIndex() << ";\n\n"; + auto &dynamic_symbols = config_->dynamic_symbols(); + for (size_t i = 0; i < dynamic_symbols.size(); ++i) { + ofs << "static int store" << ctx_->GetCurModelIndex() << "_" << i << " = -1;\n"; + } + CodeResetImplement(ofs, ctx_, *config_); CodeMSModelCreate(ofs, ctx_, *config_); CodeMSModelBuild(ofs, ctx_->GetCurModelIndex(), weight_size_, *config_); + CodeMSModelResize(ofs, ctx_, *config_); CodeCopyOutputsImplement(ofs, ctx_); if (config_->target() == kCortex_M) { CodeCortexCalcWorkspaceSize(ofs, ctx_); @@ -483,6 +501,8 @@ int Generator::CodeWeightFile() { if (config_->target() != kCortex_M) { cofs << "unsigned char *" << ctx_->buffer_name() << " = 0; \n"; cofs << "unsigned char *" << ctx_->weight_name() << " = 0; \n"; + cofs << "int *" << kShapePrefixName << " = 0; \n"; + cofs << "int *" << kOffsetPrefixName << " = 0; \n"; std::string net_file = model_dir_ + "net" + std::to_string(ctx_->GetCurModelIndex()) + ".bin"; SaveDataToNet(ctx_, net_file, config_->keep_original_weight(), &weight_size_); } else { @@ -598,8 +618,10 @@ int 
Generator::CreateCommonFiles() { MS_CHECK_RET_CODE(CodeStaticContent(), "code static content failed."); MS_CHECK_RET_CODE(CodeModelHandleHFile(), "code model_handle h file failed."); MS_CHECK_RET_CODE(CodeCommonModelFile(), "code common model file failed."); + if (!config_->dynamic_shape()) { + MS_CHECK_RET_CODE(CodeAllocatorFile(), "code allocator file failed."); + } MS_CHECK_RET_CODE(CodeRegKernelHFile(), "code registered kernel header file failed."); - MS_CHECK_RET_CODE(CodeAllocatorFile(), "code allocator file failed."); MS_CHECK_RET_CODE(CodeSourceCMakeFile(), "code net cmake file failed."); return RET_OK; } diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/base/reshape_dynamic_base_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/base/reshape_dynamic_base_coder.cc new file mode 100644 index 00000000..108ba227 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/base/reshape_dynamic_base_coder.cc @@ -0,0 +1,116 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "coder/opcoders/base/reshape_dynamic_base_coder.h" +#include +#include "coder/opcoders/serializers/serializer.h" +#include "include/errorcode.h" +#include "tools/common/string_util.h" +#include "coder/utils/coder_utils.h" + +using mindspore::schema::PrimitiveType_ExpandDims; +using mindspore::schema::PrimitiveType_Flatten; +using mindspore::schema::PrimitiveType_FlattenGrad; +using mindspore::schema::PrimitiveType_Reshape; +using mindspore::schema::PrimitiveType_Squeeze; +using mindspore::schema::PrimitiveType_Unsqueeze; + +namespace mindspore::lite::micro { +int ReshapeDynamicBaseCoder::Prepare(CoderContext *const context) { + if (input_tensors_.size() == C2NUM) { + MS_CHECK_TRUE_MSG(input_tensors_[SECOND_INPUT]->IsConst(), RET_NOT_SUPPORT, + "Currently, only support the first input of reshape is non-const when shape is dynamical."); + + MS_CHECK_TRUE_MSG(input_tensors_[SECOND_INPUT]->data_type() == kNumberTypeInt32 || + input_tensors_[SECOND_INPUT]->data_type() == kNumberTypeInt, + RET_ERROR, "The data-type of Reshape's second input must be int."); + } + return RET_OK; +} + +int ReshapeDynamicBaseCoder::DoCode(CoderContext *const context) { + Serializer coder; + + int data_item_size = static_cast(lite::DataTypeSize(input_tensor_->data_type())); + auto in_shape = shape_info_container_->GetTemplateShape(input_tensor_); + int64_t const_part = 1; + std::string non_const_part; + for (const auto &item : in_shape) { + if (IsNumber(item)) { + const_part *= std::stoi(item); + } else { + if (!non_const_part.empty()) { + non_const_part += " * "; + } + non_const_part += item; + } + } + std::string size = std::to_string(const_part * data_item_size) + " * " + non_const_part; + std::string input_data = dynamic_mem_manager_->GetVarTensorAddr(input_tensor_); + MS_CHECK_TRUE_MSG(!input_data.empty(), RET_ERROR, "pointer is not allocated by the allocator"); + std::string output_data = 
dynamic_mem_manager_->GetVarTensorAddr(output_tensor_); + MS_CHECK_TRUE_MSG(!output_data.empty(), RET_ERROR, "pointer is not allocated by the allocator"); + coder.CodeFunction("memcpy", output_data, input_data, size); + + context->AppendCode(coder.str()); + return RET_OK; +} + +REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_Reshape, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeInt32, PrimitiveType_Reshape, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_Reshape, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_Reshape, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_Flatten, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeInt32, PrimitiveType_Flatten, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeInt8, PrimitiveType_Flatten, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_ExpandDims, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeInt32, PrimitiveType_ExpandDims, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_ExpandDims, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_ExpandDims, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeInt8, PrimitiveType_ExpandDims, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_Squeeze, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeInt32, PrimitiveType_Squeeze, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_Squeeze, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_Squeeze, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeInt8, PrimitiveType_Squeeze, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_Unsqueeze, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_Unsqueeze, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_Unsqueeze, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeInt32, PrimitiveType_Unsqueeze, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeInt8, PrimitiveType_Unsqueeze, + CPUOpCoderCreator) +} // namespace mindspore::lite::micro diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/base/reshape_dynamic_base_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/base/reshape_dynamic_base_coder.h new file mode 100644 index 00000000..aaae22eb --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/base/reshape_dynamic_base_coder.h @@ -0,0 +1,38 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_BASE_RESHAPE_DYNAMIC_BASE_CODER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_BASE_RESHAPE_DYNAMIC_BASE_CODER_H_ + +#include "tools/converter/micro/coder/opcoders/op_coder.h" +#include "tools/converter/micro/coder/shape_info_container.h" +#include "tools/converter/micro/coder/dynamic_mem_manager.h" + +namespace mindspore::lite::micro { +class ReshapeDynamicBaseCoder final : public OperatorCoder { + public: + ReshapeDynamicBaseCoder(const std::vector &in_tensors, const std::vector &out_tensors, + const LiteGraph::Node *node, size_t node_index, Target target) + : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {} + + ~ReshapeDynamicBaseCoder() override = default; + + int Prepare(CoderContext *const context) override; + + int DoCode(CoderContext *const context) override; +}; +} // namespace mindspore::lite::micro +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_BASE_RESHAPE_DYNAMIC_BASE_CODER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/base/strided_slice_dynamic_base_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/base/strided_slice_dynamic_base_coder.cc new file mode 100644 index 00000000..4b2b0abe --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/base/strided_slice_dynamic_base_coder.cc @@ -0,0 +1,115 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "coder/opcoders/base/strided_slice_dynamic_base_coder.h" +#include +#include +#include "mindspore/lite/src/common/log_util.h" +#include "coder/opcoders/file_collector.h" +#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h" +#include "coder/opcoders/parallel.h" +#include "coder/utils/coder_utils.h" +#include "tools/common/string_util.h" +#include "base/float16.h" + +using mindspore::schema::PrimitiveType_StridedSlice; + +namespace mindspore::lite::micro { +namespace { +size_t GetInnerSize(TypeId type_id, size_t inner_elements) { + switch (type_id) { + case kNumberTypeInt8: + return inner_elements * sizeof(int8_t); + case kNumberTypeFloat32: + return inner_elements * sizeof(float); + case kNumberTypeInt32: + return inner_elements * sizeof(int32_t); + case kNumberTypeFloat16: + return inner_elements * sizeof(float16); + default: + MS_LOG(ERROR) << "Not supported data type: " << type_id; + return 0; + } +} +} // namespace + +int StridedSliceDynamicBaseCoder::Prepare(CoderContext *context) { + CHECK_LESS_RETURN(input_tensors_.size(), C2NUM); + for (size_t i = 1; i < input_tensors_.size(); ++i) { + MS_CHECK_TRUE_MSG(input_tensors_[i]->IsConst(), RET_PARAM_INVALID, + "The " << i << " input of strided slice should be const."); + MS_CHECK_TRUE_MSG(input_tensors_[i]->data_type() == kNumberTypeInt32, RET_PARAM_INVALID, + "The " << i << " input tensor data type should be int32."); + } + CHECK_LESS_RETURN(output_tensors_.size(), C1NUM); + strided_slice_param_ = reinterpret_cast(parameter_); + CHECK_NULL_RETURN(strided_slice_param_); + auto begin_tensor = input_tensors_.at(1); + input_shape_ = shape_info_container_->GetTemplateShape(input_tensor_); + if (input_shape_.size() > DIMENSION_8D || begin_tensor->shape().size() > DIMENSION_8D) { + MS_LOG(ERROR) << "StridedSlice not support input rank or begin num exceeds " << DIMENSION_8D; + return RET_ERROR; + } + dynamic_param_.in_shape_ = "{"; + for (size_t i = 0; i < input_shape_.size(); ++i) { + dynamic_param_.in_shape_ += input_shape_[i] + ", "; + } + dynamic_param_.in_shape_ += "}"; + return RET_OK; +} + +int StridedSliceDynamicBaseCoder::DoCode(CoderContext *ctx) { + inner_size_ = GetInnerSize(input_tensor_->data_type(), inner_); + Collect(ctx, + { + "nnacl/fp32/strided_slice_fp32.h", + }, + { + "strided_slice_fp32.c", + }); + switch (input_tensor_->data_type()) { + case kNumberTypeInt8: + strided_slice_param_->data_type = ::kNumberTypeInt8; + break; + case kNumberTypeFloat32: + strided_slice_param_->data_type = ::kNumberTypeFloat32; + break; + case kNumberTypeInt32: + strided_slice_param_->data_type = ::kNumberTypeInt32; + break; + case kNumberTypeFloat16: + strided_slice_param_->data_type = ::kNumberTypeFloat16; + break; + default: + MS_LOG(ERROR) << "Not supported data type: " << input_tensor_->data_type(); + return RET_ERROR; + } + nnacl::NNaclFp32Serializer code; + code.CodeStruct("strided_slice_parameter", *strided_slice_param_, dynamic_param_); + std::string input_data = GetTensorAddr(input_tensor_, input_tensor_->IsConst(), dynamic_mem_manager_, allocator_); + std::string output_data = GetTensorAddr(output_tensor_, output_tensor_->IsConst(), dynamic_mem_manager_, allocator_); + code.CodeFunction("DoStridedSlice", input_data, output_data, "&strided_slice_parameter"); + ctx->AppendCode(code.str()); + return RET_OK; +} + +REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_StridedSlice, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeFloat16, 
PrimitiveType_StridedSlice, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeInt32, PrimitiveType_StridedSlice, + CPUOpCoderCreator) +} // namespace mindspore::lite::micro diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/base/strided_slice_dynamic_base_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/base/strided_slice_dynamic_base_coder.h new file mode 100644 index 00000000..d41cff4f --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/base/strided_slice_dynamic_base_coder.h @@ -0,0 +1,45 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_BASE_STRIDED_SLICE_BASE_CODER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_BASE_STRIDED_SLICE_BASE_CODER_H_ +#include +#include "coder/opcoders/op_coder.h" +#include "coder/opcoders/nnacl/dynamic_parameter/strided_slice_dynamic_parameter.h" +#include "nnacl/strided_slice_parameter.h" + +namespace mindspore::lite::micro { +class StridedSliceDynamicBaseCoder final : public OperatorCoder { + public: + StridedSliceDynamicBaseCoder(const std::vector &in_tensors, const std::vector &out_tensors, + const LiteGraph::Node *node, size_t node_index, Target target) + : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {} + + ~StridedSliceDynamicBaseCoder() override = default; + + int Prepare(CoderContext *context) override; + + int DoCode(CoderContext *context) override; + + private: + StridedSliceParameter *strided_slice_param_{nullptr}; + StridedSliceDynamicParameter dynamic_param_; + size_t inner_{1}; + size_t inner_size_{1}; + std::vector input_shape_; + std::vector output_shape_; +}; +} // namespace mindspore::lite::micro +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_BASE_STRIDED_SLICE_BASE_CODER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/arithmetic_dynamic_parameter.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/arithmetic_dynamic_parameter.h new file mode 100644 index 00000000..1e9e4f8d --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/arithmetic_dynamic_parameter.h @@ -0,0 +1,43 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_ARITHMETIC_DYNAMIC_PARAMETER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_ARITHMETIC_DYNAMIC_PARAMETER_H_ +#include + +typedef struct ArithmeticDynamicParameter { + std::string in_shape0_; + std::string in_elements_num0_; + std::string in_shape1_; + std::string in_elements_num1_; + + std::string out_shape_; + std::string out_elements_num_; + + std::string in_strides0_; + std::string in_strides1_; + std::string out_strides_; + + std::string multiples0_; + std::string multiples1_; +} ArithmeticDynamicParameter; + +typedef struct BroadcastDynamicShapeInfo { + std::string input_shape_; + std::string output_shape_; +} BroadcastDynamicShapeInfo; + +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_ARITHMETIC_DYNAMIC_PARAMETER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/conv_dynamic_parameter.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/conv_dynamic_parameter.h new file mode 100644 index 00000000..a05ab848 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/conv_dynamic_parameter.h @@ -0,0 +1,26 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_CONV_DYNAMIC_PARAMETER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_CONV_DYNAMIC_PARAMETER_H_ +#include + +typedef struct ConvDynamicParameter { + std::string input_batch_; + std::string output_batch_; +} ConvDynamicParameter; + +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_CONV_DYNAMIC_PARAMETER_H_ \ No newline at end of file diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/dynamic_lstm_parameter.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/dynamic_lstm_parameter.h new file mode 100644 index 00000000..970a863a --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/dynamic_lstm_parameter.h @@ -0,0 +1,28 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_DYNAMIC_LSTM_PARAMETER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_DYNAMIC_LSTM_PARAMETER_H_ + +typedef struct DynamicLstmParameter { + std::string seq_len_; + std::string batch_; + std::string input_row_align_; + std::string state_row_align_; + std::string output_step_; +} DynamicLstmParameter; + +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_DYNAMIC_LSTM_PARAMETER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/matmul_dynamic_parameter.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/matmul_dynamic_parameter.h new file mode 100644 index 00000000..d99b0cf9 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/matmul_dynamic_parameter.h @@ -0,0 +1,25 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_MATMUL_DYNAMIC_PARAMETER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_MATMUL_DYNAMIC_PARAMETER_H_ + +typedef struct MatmulDynamicParameter { + std::string row_; + std::string batch_; +} MatmulDynamicParameter; + +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_MATMUL_DYNAMIC_PARAMETER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/pooling_dynamic_parameter.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/pooling_dynamic_parameter.h new file mode 100644 index 00000000..f2636e55 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/pooling_dynamic_parameter.h @@ -0,0 +1,33 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_POOLING_DYNAMIC_PARAMETER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_POOLING_DYNAMIC_PARAMETER_H_ +#include + +typedef struct PoolingDynamicParameter { + int avg_mode_; + bool global_; + int window_w_; + int window_h_; + int stride_w_; + int stride_h_; + + std::string input_batch_; + std::string output_batch_; +} PoolingDynamicParameter; + +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_POOLING_DYNAMIC_PARAMETER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/scale_dynamic_parameter.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/scale_dynamic_parameter.h new file mode 100644 index 00000000..e8728383 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/scale_dynamic_parameter.h @@ -0,0 +1,26 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_SCALE_DYNAMIC_PARAMETER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_SCALE_DYNAMIC_PARAMETER_H_ +#include + +typedef struct ScaleDynamicParameter { + std::string outer_size_; + std::string axis_size_; + std::string inner_size_; +} ScaleDynamicParameter; +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_SCALE_DYNAMIC_PARAMETER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/slice_dynamic_parameter.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/slice_dynamic_parameter.h new file mode 100644 index 00000000..f17993d4 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/slice_dynamic_parameter.h @@ -0,0 +1,27 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_SLICE_DYNAMIC_PARAMETER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_SLICE_DYNAMIC_PARAMETER_H_ +#include + +typedef struct SliceDynamicParameter { + std::string shape_; + std::string size_; + std::string end_; +} SliceDynamicParameter; + +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_SLICE_DYNAMIC_PARAMETER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/softmax_dynamic_parameter.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/softmax_dynamic_parameter.h new file mode 100644 index 00000000..92dfaf21 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/softmax_dynamic_parameter.h @@ -0,0 +1,26 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_SOFTMAX_DYNAMIC_PARAMETER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_SOFTMAX_DYNAMIC_PARAMETER_H_ +#include + +typedef struct SoftmaxDynamicParameter { + std::string input_shape_; + std::string element_size_; +} SoftmaxDynamicParameter; + +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_SOFTMAX_DYNAMIC_PARAMETER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/split_dynamic_parameter.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/split_dynamic_parameter.h new file mode 100644 index 00000000..b97097ad --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/split_dynamic_parameter.h @@ -0,0 +1,26 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_SPLIT_DYNAMIC_PARAMETER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_SPLIT_DYNAMIC_PARAMETER_H_ +#include + +typedef struct SplitDynamicParameter { + std::string strides_; + std::string split_count_; +} SplitDynamicParameter; + +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_SPLIT_DYNAMIC_PARAMETER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/strided_slice_dynamic_parameter.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/strided_slice_dynamic_parameter.h new file mode 100644 index 00000000..202ee7dd --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/strided_slice_dynamic_parameter.h @@ -0,0 +1,25 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_STRIDED_SLICE_DYNAMIC_PARAMETER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_STRIDED_SLICE_DYNAMIC_PARAMETER_H_ +#include + +typedef struct StridedSliceDynamicParameter { + std::string in_shape_; +} StridedSliceDynamicParameter; + +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_STRIDED_SLICE_DYNAMIC_PARAMETER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/transpose_dynamic_parameter.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/transpose_dynamic_parameter.h new file mode 100644 index 00000000..ed4f21f2 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/dynamic_parameter/transpose_dynamic_parameter.h @@ -0,0 +1,28 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_TRANSPOSE_DYNAMIC_PARAMETER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_TRANSPOSE_DYNAMIC_PARAMETER_H_ +#include + +typedef struct TransposeDynamicParameter { + // shape correlative + std::string strides_; + std::string out_strides_; + std::string data_num_; +} TransposeDynamicParameter; + +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_DYNAMIC_PARAMETER_TRANSPOSE_DYNAMIC_PARAMETER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/activation_dynamic_fp16_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/activation_dynamic_fp16_coder.cc new file mode 100644 index 00000000..86048179 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/activation_dynamic_fp16_coder.cc @@ -0,0 +1,93 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "coder/opcoders/nnacl/fp16/activation_dynamic_fp16_coder.h" +#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h" +#include "coder/opcoders/file_collector.h" +#include "coder/utils/coder_utils.h" +#include "tools/common/string_util.h" + +using mindspore::schema::PrimitiveType_Activation; + +namespace mindspore::lite::micro::nnacl { +int ActivationDynamicFP16Coder::Prepare(CoderContext *const context) { + MS_CHECK_TRUE_MSG(input_tensor_->data_type() == kNumberTypeFloat16, RET_INPUT_PARAM_INVALID, + "Input tensor data type is invalid."); + MS_CHECK_TRUE_MSG(output_tensor_->data_type() == kNumberTypeFloat16, RET_INPUT_PARAM_INVALID, + "Output tensor data type is invalid."); + return RET_OK; +} + +int ActivationDynamicFP16Coder::DoCode(CoderContext *const context) { + Collect(context, + { + "nnacl/fp16/activation_fp16.h", + }, + { + "activation_fp16.c", + }); + NNaclFp32Serializer code; + // attribute + auto *activation_parameter = reinterpret_cast(parameter_); + MS_CHECK_PTR(activation_parameter); + auto in_shape = shape_info_container_->GetTemplateShape(input_tensor_); + count_ = AccumulateShape(in_shape, 0, in_shape.size()); + input_data_ = dynamic_mem_manager_->GetVarTensorAddr(input_tensor_); + MS_CHECK_TRUE_MSG(!input_data_.empty(), RET_ERROR, "pointer is not allocated by the allocator"); + output_data_ = dynamic_mem_manager_->GetVarTensorAddr(output_tensor_); + MS_CHECK_TRUE_MSG(!output_data_.empty(), RET_ERROR, "pointer is not allocated by the allocator"); + input_data_ = "(float16_t *)(" + input_data_ + ")"; + output_data_ = "(float16_t *)(" + output_data_ + ")"; + + switch (activation_parameter->type_) { + case schema::ActivationType_RELU: + code.CodeFunction("ReluFp16", input_data_, output_data_, count_); + break; + case schema::ActivationType_RELU6: + code.CodeFunction("Relu6Fp16", input_data_, output_data_, count_); + break; + case schema::ActivationType_LEAKY_RELU: + code.CodeFunction("LReluFp16", input_data_, 
output_data_, count_, activation_parameter->alpha_); + break; + case schema::ActivationType_SIGMOID: + code.CodeFunction("SigmoidFp16", input_data_, output_data_, count_); + break; + case schema::ActivationType_TANH: + code.CodeFunction("TanhFp16", input_data_, output_data_, count_); + break; + case schema::ActivationType_HSWISH: + code.CodeFunction("HSwishFp16", input_data_, output_data_, count_); + break; + case schema::ActivationType_SWISH: + code.CodeFunction("SwishFp16", input_data_, output_data_, count_); + break; + case schema::ActivationType_HSIGMOID: + code.CodeFunction("HSigmoidFp16", input_data_, output_data_, count_); + break; + case schema::ActivationType_ELU: + code.CodeFunction("EluFp16", input_data_, output_data_, count_, activation_parameter->alpha_); + break; + default: + MS_LOG(ERROR) << "Activation type error"; + return RET_ERROR; + } + MS_LOG(DEBUG) << "ActivationFP16Code has been called"; + context->AppendCode(code.str()); + return lite::RET_OK; +} + +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_Activation, + CPUOpCoderCreator) +} // namespace mindspore::lite::micro::nnacl diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/activation_dynamic_fp16_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/activation_dynamic_fp16_coder.h new file mode 100644 index 00000000..c881567f --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/activation_dynamic_fp16_coder.h @@ -0,0 +1,37 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_ACTIVATION_DYNAMIC_FP16_CODER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_ACTIVATION_DYNAMIC_FP16_CODER_H_ + +#include +#include "tools/converter/micro/coder/opcoders/nnacl/fp32/activation_dynamic_fp32_coder.h" + +namespace mindspore::lite::micro::nnacl { +class ActivationDynamicFP16Coder final : public ActivationDynamicFP32Coder { + public: + ActivationDynamicFP16Coder(const std::vector &in_tensors, const std::vector &out_tensors, + const LiteGraph::Node *node, size_t node_index, Target target) + : ActivationDynamicFP32Coder(in_tensors, out_tensors, node, node_index, target) {} + + ~ActivationDynamicFP16Coder() override = default; + + int Prepare(CoderContext *const context) override; + + int DoCode(CoderContext *const context) override; +}; +} // namespace mindspore::lite::micro::nnacl +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_ACTIVATION_DYNAMIC_FP16_CODER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/arithmetic_dynamic_fp16_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/arithmetic_dynamic_fp16_coder.cc new file mode 100644 index 00000000..7050b8b0 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/arithmetic_dynamic_fp16_coder.cc @@ -0,0 +1,369 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "coder/opcoders/nnacl/fp16/arithmetic_dynamic_fp16_coder.h" +#include "coder/opcoders/file_collector.h" +#include "coder/opcoders/parallel.h" +#include "coder/log.h" +#include "coder/utils/coder_utils.h" +#include "tools/common/string_util.h" + +namespace mindspore::lite::micro::nnacl { +namespace { +std::string wrap_void(const std::string &a) { return "(void *)(" + a + ")"; } +} // namespace + +void ArithmeticDynamicFP16Coder::InitFunTable() { + fun_table_ = { + {PrimitiveType_MulFusion, schema::ActivationType_RELU, "ElementMulReluFp16", "", "", "", ""}, + {PrimitiveType_MulFusion, schema::ActivationType_RELU6, "ElementMulRelu6Fp16", "", "", "", ""}, + {PrimitiveType_MulFusion, schema::ActivationType_NO_ACTIVATION, "ElementMulFp16", "", "", "", ""}, + {PrimitiveType_AddFusion, schema::ActivationType_RELU, "ElementAddReluFp16", "", "", "", ""}, + {PrimitiveType_AddFusion, schema::ActivationType_RELU6, "ElementAddRelu6Fp16", "", "", "", ""}, + {PrimitiveType_AddFusion, schema::ActivationType_NO_ACTIVATION, "ElementAddFp16", "", "", "", ""}, + {PrimitiveType_SubFusion, schema::ActivationType_RELU, "ElementSubReluFp16", "", "", "", ""}, + {PrimitiveType_SubFusion, schema::ActivationType_RELU6, "ElementSubRelu6Fp16", "", "", "", ""}, + {PrimitiveType_SubFusion, schema::ActivationType_NO_ACTIVATION, "ElementSubFp16", "", "", "", ""}, + {PrimitiveType_DivFusion, schema::ActivationType_RELU, "ElementDivReluFp16", "", "", "", ""}, + {PrimitiveType_DivFusion, schema::ActivationType_RELU6, "ElementDivRelu6Fp16", "", "", "", ""}, + {PrimitiveType_DivFusion, schema::ActivationType_NO_ACTIVATION, "ElementDivFp16", "", "", "", ""}, + {PrimitiveType_RealDiv, schema::ActivationType_RELU, "ElementDivReluFp16", "", "", "", ""}, + {PrimitiveType_RealDiv, schema::ActivationType_RELU6, "ElementDivRelu6Fp16", "", "", "", ""}, + {PrimitiveType_RealDiv, schema::ActivationType_NO_ACTIVATION, "ElementDivFp16", "", "", "", ""}, + {PrimitiveType_LogicalAnd, schema::ActivationType_NO_ACTIVATION, "ElementLogicalAndFp16", "", "", "", ""}, + {PrimitiveType_LogicalOr, schema::ActivationType_NO_ACTIVATION, "ElementLogicalOrFp16", "", "", "", ""}, + {PrimitiveType_Maximum, schema::ActivationType_NO_ACTIVATION, "ElementMaximumFp16", "", "", "", ""}, + {PrimitiveType_Minimum, schema::ActivationType_NO_ACTIVATION, "ElementMinimumFp16", "", "", "", ""}, + {PrimitiveType_FloorMod, schema::ActivationType_NO_ACTIVATION, "ElementFloorModFp16", "", "", "", ""}, + {PrimitiveType_FloorDiv, schema::ActivationType_NO_ACTIVATION, "ElementFloorDivFp16", "", "", "", ""}, + {PrimitiveType_SquaredDifference, schema::ActivationType_NO_ACTIVATION, "ElementSquaredDifferenceFp16", "", "", "", + ""}}; +} + +int ArithmeticDynamicFP16Coder::Prepare(CoderContext *const context) { + CHECK_LESS_RETURN(input_tensors_.size(), C2NUM); + CHECK_LESS_RETURN(output_tensors_.size(), 1); + for (size_t i = 0; i < input_tensors_.size(); ++i) { + MS_CHECK_TRUE_MSG(input_tensors_[i]->data_type() == kNumberTypeFloat16, RET_INPUT_PARAM_INVALID, + "Tensor data type is invalid"); + } + MS_CHECK_TRUE_MSG(output_tensor_->data_type() == kNumberTypeFloat16, RET_INPUT_PARAM_INVALID, + "Tensor data type is invalid"); + filter_tensor_ = input_tensors_.at(SECOND_INPUT); + MS_CHECK_PTR(filter_tensor_); + param_ = reinterpret_cast(parameter_); + MS_CHECK_PTR(param_); + auto primitive_type = param_->op_parameter_.type_; + if (primitive_type == schema::PrimitiveType_Eltwise) { + switch (param_->eltwise_mode_) { + case schema::EltwiseMode_PROD: + primitive_type = 
schema::PrimitiveType_MulFusion; + break; + case schema::EltwiseMode_SUM: + primitive_type = schema::PrimitiveType_AddFusion; + break; + case schema::EltwiseMode_MAXIMUM: + primitive_type = schema::PrimitiveType_Maximum; + break; + default: + MS_LOG(ERROR) << "Eltwise mode not support, mode:" << param_->eltwise_mode_; + return RET_ERROR; + } + } + InitRunFunction(primitive_type); + InitDynamicParams(); + ResetStatus(); + CalcMultiplesAndStrides(); + return RET_OK; +} + +int ArithmeticDynamicFP16Coder::DoCode(CoderContext *const context) { + input0_ptr_str_ = GetTensorAddr(input_tensor_, input_tensor_->IsConst(), dynamic_mem_manager_, allocator_); + input1_ptr_str_ = GetTensorAddr(filter_tensor_, filter_tensor_->IsConst(), dynamic_mem_manager_, allocator_); + output_ptr_str_ = GetTensorAddr(output_tensor_, output_tensor_->IsConst(), dynamic_mem_manager_, allocator_); + NNaclFp32Serializer code; + Collect(context, + { + "nnacl/fp16/arithmetic_fp16.h", + "nnacl/base/broadcast_to.h", + }, + { + "arithmetic_fp16.c", + "arithmetic_base.c", + "broadcast_to.c", + }); + + // all elements eltwise calculation + arithmetic_func_str_ = wrap_void(arithmetic_run_); + // run broadcast + auto in0_shape = shape_info_container_->GetTemplateShape(input_tensor_); + std::vector in1_shape; + if (filter_tensor_->IsConst()) { + for (auto dim : filter_tensor_->shape()) { + in1_shape.emplace_back(std::to_string(dim)); + } + } else { + in1_shape = shape_info_container_->GetTemplateShape(filter_tensor_); + } + auto out_shape = shape_info_container_->GetTemplateShape(output_tensor_); + broadcast_info_.output_shape_size_ = static_cast(out_shape_.size()); + if (in0_shape != out_shape) { + broadcast_info_.input_shape_size_ = static_cast(in0_shape.size()); + dynamic_shape_info_.input_shape_ = dynamic_param_.in_shape0_; + dynamic_shape_info_.output_shape_ = dynamic_param_.out_shape_; + code.CodeStruct("in0_broadcast_info", broadcast_info_, dynamic_shape_info_); + code.CodeFunction("BroadcastToSize16", input0_ptr_str_, "&in0_broadcast_info", output_ptr_str_); + input0_ptr_str_ = output_ptr_str_; + } + if (in1_shape != out_shape) { + broadcast_info_.input_shape_size_ = static_cast(in1_shape.size()); + dynamic_shape_info_.input_shape_ = dynamic_param_.in_shape1_; + dynamic_shape_info_.output_shape_ = dynamic_param_.out_shape_; + code.CodeStruct("in1_broadcast_info", broadcast_info_, dynamic_shape_info_); + auto temp = output_ptr_str_; + if (input0_ptr_str_ == output_ptr_str_) { + std::map> real_nums; + size_t scene_num = 0; + for (auto &dim_template : out_shape) { + auto dim_nums = shape_info_container_->GetRealNums(dim_template); + MS_CHECK_TRUE_MSG(!dim_nums.empty(), RET_ERROR, "Dynamic shape's num must be greater than 0."); + real_nums[dim_template] = dim_nums; + scene_num = std::max(scene_num, dim_nums.size()); + } + for (size_t i = 0; i < scene_num; ++i) { + int out_element_num = 1; + for (size_t j = 0; j < out_shape.size(); ++j) { + if (IsNumber(out_shape[j])) { + out_element_num *= std::stoi(out_shape[j]); + } else { + out_element_num *= real_nums[out_shape[j]][i % real_nums[out_shape[j]].size()]; + } + } + int workspace = out_element_num * DataTypeSize(kNumberTypeFloat16); + temp = dynamic_mem_manager_->AllocWorkSpace(workspace, i); + MS_CHECK_TRUE_MSG(!temp.empty(), RET_ERROR, "Arithmetic cannot alloc workspace."); + } + } + code.CodeFunction("BroadcastToSize16", input1_ptr_str_, "&in1_broadcast_info", temp); + input1_ptr_str_ = temp; + } + return ExecuteCode("(float16_t *)(" + input0_ptr_str_ + ")", "(float16_t *)(" 
+ input1_ptr_str_ + ")", + "(float16_t *)(" + output_ptr_str_ + ")", dynamic_param_.out_elements_num_, context, &code); +} + +void ArithmeticDynamicFP16Coder::InitDynamicParams() { + auto in0_shape = shape_info_container_->GetTemplateShape(input_tensor_); + std::vector in1_shape; + if (filter_tensor_->IsConst()) { + for (auto dim : filter_tensor_->shape()) { + in1_shape.emplace_back(std::to_string(dim)); + } + } else { + in1_shape = shape_info_container_->GetTemplateShape(filter_tensor_); + } + auto out_shape = shape_info_container_->GetTemplateShape(output_tensor_); + dynamic_param_.in_shape0_ = "{"; + dynamic_param_.in_shape1_ = "{"; + dynamic_param_.out_shape_ = "{"; + for (auto shape : in0_shape) { + dynamic_param_.in_shape0_ += shape + ", "; + } + for (auto shape : in1_shape) { + dynamic_param_.in_shape1_ += shape + ", "; + } + for (auto shape : out_shape) { + dynamic_param_.out_shape_ += shape + ", "; + } + dynamic_param_.in_shape0_ += "}"; + dynamic_param_.in_shape1_ += "}"; + dynamic_param_.out_shape_ += "}"; + dynamic_param_.in_elements_num0_ = AccumulateShape(in0_shape, 0, in0_shape.size()); + dynamic_param_.in_elements_num1_ = AccumulateShape(in1_shape, 0, in1_shape.size()); + dynamic_param_.out_elements_num_ = AccumulateShape(out_shape, 0, out_shape.size()); +} + +void ArithmeticDynamicFP16Coder::InitRunFunction(int primitive_type) { + InitFunTable(); + for (size_t i = 0; i < fun_table_.size(); i++) { + if (fun_table_[i].primitive_type_ == primitive_type && fun_table_[i].activation_type_ == param_->activation_type_) { + arithmetic_run_ = fun_table_[i].func_; + arithmetic_run_int_ = fun_table_[i].int_func_; + arithmetic_run_bool_ = fun_table_[i].bool_func_; + arithmetic_opt_run_ = fun_table_[i].opt_func_; + arithmetic_opt_run_int_ = fun_table_[i].opt_int_func_; + } + } + arithmetic_func_type_ = kArithmeticFuncFloat; +} + +void ArithmeticDynamicFP16Coder::ResetStatus() { + auto input_shape = shape_info_container_->GetTemplateShape(input_tensor_); + std::vector filter_shape; + if (filter_tensor_->IsConst()) { + for (auto dim : filter_tensor_->shape()) { + filter_shape.emplace_back(std::to_string(dim)); + } + } else { + filter_shape = shape_info_container_->GetTemplateShape(filter_tensor_); + } + auto dim_num = input_shape.size() >= filter_shape.size() ? 
input_shape.size() : filter_shape.size(); + for (size_t i = 0; i < dim_num - input_shape.size(); ++i) { + in0_shape_.emplace_back("1"); + } + in0_shape_.insert(in0_shape_.end(), input_shape.begin(), input_shape.end()); + for (size_t i = 0; i < dim_num - filter_shape.size(); ++i) { + in1_shape_.emplace_back("1"); + } + in1_shape_.insert(in1_shape_.end(), filter_shape.begin(), filter_shape.end()); +} + +void ArithmeticDynamicFP16Coder::CalcMultiplesAndStrides() { + out_shape_ = shape_info_container_->GetTemplateShape(output_tensor_); + dynamic_param_.multiples0_ = "{"; + dynamic_param_.multiples1_ = "{"; + for (size_t i = 0; i < param_->ndim_; i++) { + if (in0_shape_[i] != "0") { + dynamic_param_.multiples0_ += out_shape_[i] + " / " + in0_shape_[i] + ", "; + } + if (in1_shape_[i] != "0") { + dynamic_param_.multiples1_ += out_shape_[i] + " / " + in1_shape_[i] + ", "; + } + } + dynamic_param_.multiples0_ += "}"; + dynamic_param_.multiples1_ += "}"; + + // cal strides + in0_strides_.resize(param_->ndim_); + in1_strides_.resize(param_->ndim_); + out_strides_.resize(param_->ndim_); + ComputeStrides(in0_shape_, in0_strides_); + ComputeStrides(in1_shape_, in1_strides_); + ComputeStrides(out_shape_, out_strides_); + dynamic_param_.in_strides0_ = "{"; + dynamic_param_.in_strides1_ = "{"; + dynamic_param_.out_strides_ = "{"; + for (size_t i = 0; i < param_->ndim_; ++i) { + dynamic_param_.in_strides0_ += in0_strides_[i] + ", "; + dynamic_param_.in_strides1_ += in1_strides_[i] + ", "; + dynamic_param_.out_strides_ += out_strides_[i] + ", "; + } + dynamic_param_.in_strides0_ += "}"; + dynamic_param_.in_strides1_ += "}"; + dynamic_param_.out_strides_ += "}"; +} + +void ArithmeticDynamicFP16Coder::ComputeStrides(const std::vector &shape, + std::vector &strides) { + std::string stride = "1"; + for (int i = param_->ndim_ - 1; i >= 0; i--) { + strides[i] = stride; + stride += "*=" + shape[i]; + } +} + +int ArithmeticDynamicFP16Coder::ExecuteCode(const std::string &input0, const std::string &input1, + const std::string &output, const std::string size, + CoderContext *const context, NNaclFp32Serializer *const code) { + if (arithmetic_func_str_.empty()) { + return RET_ERROR; + } + for (size_t i = 0; i < fun_table_.size(); i++) { + if (fun_table_[i].primitive_type_ == param_->op_parameter_.type_ && + fun_table_[i].activation_type_ == param_->activation_type_) { + code->CodeFunction(fun_table_[i].func_, input0, input1, output, size); + break; + } + } + context->AppendCode(code->str()); + return RET_OK; +} + +REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_AddFusion, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_MulFusion, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_SubFusion, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_DivFusion, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_RealDiv, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_LogicalAnd, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_LogicalOr, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_Maximum, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_Minimum, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_FloorDiv, + CPUOpCoderCreator) 
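// The kARM32 registrations in this block and the kARM64 registrations below bind every listed
// fp16 arithmetic primitive to the same dynamic coder; CPUOpCoderCreator is presumed (from the
// surrounding file) to be instantiated with ArithmeticDynamicFP16Coder in each entry.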
+REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_FloorMod, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_SquaredDifference, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_Equal, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_NotEqual, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_Less, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_LessEqual, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_Greater, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_GreaterEqual, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_Eltwise, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_AddFusion, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_MulFusion, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_SubFusion, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_DivFusion, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_RealDiv, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_LogicalAnd, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_LogicalOr, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_Maximum, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_Minimum, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_FloorDiv, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_FloorMod, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_SquaredDifference, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_Equal, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_NotEqual, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_Less, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_LessEqual, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_Greater, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_GreaterEqual, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_Eltwise, + CPUOpCoderCreator) +} // namespace mindspore::lite::micro::nnacl diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/arithmetic_dynamic_fp16_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/arithmetic_dynamic_fp16_coder.h new file mode 100644 index 00000000..87e43687 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/arithmetic_dynamic_fp16_coder.h @@ -0,0 +1,132 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_ARITHMETIC_DYNAMIC_FP16_CODER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_ARITHMETIC_DYNAMIC_FP16_CODER_H_ + +#include +#include +#include "coder/opcoders/op_coder.h" +#include "nnacl/base/cast_base.h" +#include "nnacl/arithmetic_parameter.h" +#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h" +#include "coder/opcoders/nnacl/dynamic_parameter/arithmetic_dynamic_parameter.h" +#include "nnacl/broadcast_to_parameter.h" + +namespace mindspore::lite::micro::nnacl { +using mindspore::schema::PrimitiveType_AddFusion; +using mindspore::schema::PrimitiveType_DivFusion; +using mindspore::schema::PrimitiveType_Eltwise; +using mindspore::schema::PrimitiveType_Equal; +using mindspore::schema::PrimitiveType_FloorDiv; +using mindspore::schema::PrimitiveType_FloorMod; +using mindspore::schema::PrimitiveType_Greater; +using mindspore::schema::PrimitiveType_GreaterEqual; +using mindspore::schema::PrimitiveType_Less; +using mindspore::schema::PrimitiveType_LessEqual; +using mindspore::schema::PrimitiveType_LogicalAnd; +using mindspore::schema::PrimitiveType_LogicalOr; +using mindspore::schema::PrimitiveType_Maximum; +using mindspore::schema::PrimitiveType_Minimum; +using mindspore::schema::PrimitiveType_Mod; +using mindspore::schema::PrimitiveType_MulFusion; +using mindspore::schema::PrimitiveType_NotEqual; +using mindspore::schema::PrimitiveType_RealDiv; +using mindspore::schema::PrimitiveType_SquaredDifference; +using mindspore::schema::PrimitiveType_SubFusion; + +class ArithmeticDynamicFP16Coder final : public OperatorCoder { + typedef struct { + int primitive_type_; + int activation_type_; + std::string func_; + std::string int_func_; + std::string bool_func_; + std::string opt_func_; + std::string opt_int_func_; + } ARITHMETIC_FUNC_INFO_FP16; + + // typedef struct MATRIC_INFO { + // bool is_const{false}; + // bool is_valid{false}; + // void *data{nullptr}; + // int64_t inner_size{1}; // the element num of once batch + // std::vector shape; + // std::vector batch_post_sum; + // void Reset() { + // is_valid = false; + // data = nullptr; + // inner_size = 1; + // shape.clear(); + // batch_post_sum.clear(); + // } + // } MATRIC_INFO; + + public: + ArithmeticDynamicFP16Coder(const std::vector &in_tensors, const std::vector &out_tensors, + const LiteGraph::Node *node, size_t node_index, Target target) + : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {} + + ~ArithmeticDynamicFP16Coder() override = default; + + int DoCode(CoderContext *const context) override; + + private: + int Prepare(CoderContext *const context) override; + + void InitFunTable(); + + void InitRunFunction(int primitive_type); + + void InitDynamicParams(); + + void ResetStatus(); + + void CalcMultiplesAndStrides(); + + void ComputeStrides(const std::vector &shape, std::vector &strides); + + int ExecuteCode(const std::string &input0, const std::string &input1, const std::string &output, + const std::string size, CoderContext *const context, NNaclFp32Serializer *const 
code); + + std::vector fun_table_; + ArithmeticFuncType arithmetic_func_type_{kArithmeticFuncUnknow}; + ArithmeticParameter *param_{nullptr}; + ArithmeticDynamicParameter dynamic_param_; + BroadcastShapeInfo broadcast_info_; + BroadcastDynamicShapeInfo dynamic_shape_info_; + Tensor *filter_tensor_{nullptr}; + std::string input0_ptr_str_; + std::string input1_ptr_str_; + std::string output_ptr_str_; + std::string arithmetic_run_; + std::string arithmetic_run_int_; + std::string arithmetic_opt_run_; + std::string arithmetic_opt_run_int_; + std::string arithmetic_run_bool_; + std::string arithmetic_func_str_; + std::vector in0_shape_; + std::vector in1_shape_; + std::vector out_shape_; + std::vector in0_strides_; + std::vector in1_strides_; + std::vector out_strides_; + // MATRIC_INFO a_matric_; + // MATRIC_INFO b_matric_; + // MATRIC_INFO c_matric_; +}; +} // namespace mindspore::lite::micro::nnacl +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_ARITHMETIC_DYNAMIC_FP16_CODER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/concat_dynamic_fp16_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/concat_dynamic_fp16_coder.cc new file mode 100644 index 00000000..bf8bd06b --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/concat_dynamic_fp16_coder.cc @@ -0,0 +1,92 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "coder/opcoders/nnacl/fp16/concat_dynamic_fp16_coder.h" +#include +#include +#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h" +#include "coder/opcoders/file_collector.h" +#include "coder/opcoders/parallel.h" +#include "coder/utils/coder_utils.h" + +using mindspore::schema::PrimitiveType_Concat; + +namespace mindspore::lite::micro::nnacl { +int ConcatDynamicFP16Coder::Prepare(CoderContext *const context) { + for (size_t i = 0; i < input_tensors_.size(); ++i) { + MS_CHECK_TRUE_MSG(input_tensors_.at(i)->data_type() == kNumberTypeFloat16, RET_INPUT_PARAM_INVALID, + "input tensor data type is invalid."); + } + concat_param_ = reinterpret_cast(parameter_); + MS_CHECK_PTR(concat_param_); + auto input_shape = shape_info_container_->GetTemplateShape(input_tensor_); + axis_ = + concat_param_->axis_ >= 0 ? 
concat_param_->axis_ : static_cast(input_shape.size()) + concat_param_->axis_; + return RET_OK; +} + +int ConcatDynamicFP16Coder::DoCode(CoderContext *const context) { + Collect(context, + { + "nnacl/base/concat_base.h", + }, + { + "concat_base.c", + }); + + size_t input_num = input_tensors_.size(); + + NNaclFp32Serializer code; + code << "\t\tvoid *inputs_addr[] = {"; + for (size_t i = 0; i < input_num; ++i) { + code << "(void *)(" + << GetTensorAddr(input_tensors_.at(i), input_tensors_.at(i)->IsConst(), dynamic_mem_manager_, allocator_) + << "), "; + } + code << "};\n"; + + size_t i; + for (i = 0; i < input_num; ++i) { + code << "\t\tint shape_" << i << "[] = {"; + auto in_shape = shape_info_container_->GetTemplateShape(input_tensors_.at(i)); + for (auto &shape : in_shape) { + code << shape << ", "; + } + code << "};\n"; + } + + auto out_shape = shape_info_container_->GetTemplateShape(output_tensor_); + code << "\t\tint shape_" << i << "[] = {"; + for (auto &shape : out_shape) { + code << shape << ", "; + } + code << "};\n"; + + code << "\t\tint *inputs_output_shape[] = {"; + for (i = 0; i <= input_num; ++i) { + code << "shape_" << i << ", "; + } + code << "};\n"; + std::string output_data = GetTensorAddr(output_tensor_, output_tensor_->IsConst(), dynamic_mem_manager_, allocator_); + code.CodeFunction("Concat", "inputs_addr", input_num, axis_, "inputs_output_shape", out_shape.size(), output_data, 0, + 1, sizeof(uint16_t)); + context->AppendCode(code.str()); + return RET_OK; +} + +REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_Concat, CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_Concat, CPUOpCoderCreator) + +} // namespace mindspore::lite::micro::nnacl diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/concat_dynamic_fp16_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/concat_dynamic_fp16_coder.h new file mode 100644 index 00000000..bd1b7ff6 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/concat_dynamic_fp16_coder.h @@ -0,0 +1,40 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_CONCAT_DYNAMIC_FP16_CODER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_CONCAT_DYNAMIC_FP16_CODER_H_ + +#include +#include "coder/opcoders/op_coder.h" +#include "nnacl/concat_parameter.h" + +namespace mindspore::lite::micro::nnacl { +class ConcatDynamicFP16Coder final : public OperatorCoder { + public: + ConcatDynamicFP16Coder(const std::vector &in_tensors, const std::vector &out_tensors, + const LiteGraph::Node *node, size_t node_index, Target target) + : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {} + ~ConcatDynamicFP16Coder() override = default; + + int Prepare(CoderContext *const context) override; + int DoCode(CoderContext *const context) override; + + private: + int axis_{0}; + ConcatParameter *concat_param_{nullptr}; +}; +} // namespace mindspore::lite::micro::nnacl +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_CONCAT_DYNAMIC_FP16_CODER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/conv2d_delegate_dynamic_fp16_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/conv2d_delegate_dynamic_fp16_coder.cc new file mode 100644 index 00000000..2f4e42e7 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/conv2d_delegate_dynamic_fp16_coder.cc @@ -0,0 +1,155 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "coder/opcoders/nnacl/fp16/conv2d_delegate_dynamic_fp16_coder.h" +#include "src/common/version_manager.h" +#include "src/common/tensor_util.h" +#include "src/common/ops/populate/populate_register.h" +#include "nnacl/fp32/winograd_utils.h" +#include "nnacl/base/conv_common_base.h" +#include "nnacl/infer/conv2d_infer.h" +#include "coder/shape_info_container.h" +#include "coder/opcoders/nnacl/fp16/convolution_dynamic_fp16_coder.h" +#include "coder/opcoders/nnacl/fp16/convolution_1x1_dynamic_fp16_coder.h" + +using mindspore::schema::PrimitiveType_Conv2DFusion; +namespace mindspore::lite::micro::nnacl { +int ConvDelegateDynamicFP16Coder::Prepare(CoderContext *const context) { + for (size_t i = 0; i < input_tensors_.size(); ++i) { + MS_CHECK_TRUE_MSG(input_tensors_[i]->data_type() == kNumberTypeFloat16, RET_INPUT_PARAM_INVALID, + "Input tensor data type is invalid"); + } + for (size_t i = 0; i < output_tensors_.size(); ++i) { + MS_CHECK_TRUE_MSG(output_tensors_[i]->data_type() == kNumberTypeFloat16, RET_INPUT_PARAM_INVALID, + "Output tensor data type is invalid"); + } + // Update shape info of input and output + ConvDynamicParameter dynamic_param; + SetInputOutputShapeInfo(reinterpret_cast(parameter_), dynamic_param, input_tensor_, output_tensor_); + if (conv_coder_ == nullptr) { + // need to select actual execute coder here + conv_coder_ = + CPUConvFP16DynamicCoderSelect(input_tensors_, output_tensors_, node_, node_index(), target_, schema_version_); + MS_CHECK_PTR(conv_coder_); + ConvParameter *op_parameter = static_cast(malloc(sizeof(ConvParameter))); + if (op_parameter == nullptr) { + MS_LOG(ERROR) << "malloc ConvParameter failed."; + return RET_ERROR; + } + if (memcpy_s(op_parameter, sizeof(ConvParameter), parameter_, sizeof(ConvParameter)) != EOK) { + MS_LOG(ERROR) << "memcpy_s failed."; + free(op_parameter); + return RET_ERROR; + } + conv_coder_->set_type(GetPrimitiveType(node_->primitive_, schema_version_)); + conv_coder_->set_thread_num(thread_num_); + conv_coder_->set_parameter(reinterpret_cast(op_parameter)); + conv_coder_->set_shape_info_container(shape_info_container_); + conv_coder_->set_dynamic_mem_manager(dynamic_mem_manager_); + } + return conv_coder_->Prepare(context); +} + +int ConvDelegateDynamicFP16Coder::DoCode(CoderContext *const context) { return conv_coder_->DoCode(context); } + +void ConvDelegateDynamicFP16Coder::SetInputOutputShapeInfo(ConvParameter *conv_param, + ConvDynamicParameter &dynamic_param, + const lite::Tensor *input, const lite::Tensor *output) { + dynamic_param.input_batch_ = shape_info_container_->GetTemplateShape(input_tensor_).at(0); + conv_param->input_h_ = input->Height(); + conv_param->input_w_ = input->Width(); + conv_param->input_channel_ = input->Channel(); + dynamic_param.output_batch_ = shape_info_container_->GetTemplateShape(output_tensor_).at(0); + conv_param->output_h_ = output->Height(); + conv_param->output_w_ = output->Width(); + conv_param->output_channel_ = output->Channel(); +} + +std::unique_ptr CPUConvFP16DynamicCoderSelect(const std::vector &in_tensors, + const std::vector &out_tensors, + const LiteGraph::Node *node, size_t node_index, + Target target, int schema_version) { + const void *primitive = node->primitive_; + if (primitive == nullptr) { + return nullptr; + } + ParameterGen paramGen = PopulateRegistry::GetInstance()->GetParameterCreator( + GetPrimitiveType(node->primitive_, schema_version), schema_version); + MS_CHECK_PTR_RET_NULL(paramGen); + auto conv_param = reinterpret_cast(paramGen(node->primitive_)); + 
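//  The ConvParameter created just above from the node's primitive is only inspected to choose a
//  coder: kernel_h_/kernel_w_ decide between the 1x1 and the generic dynamic conv coder, and the
//  parameter is freed again before the chosen coder is constructed; the selected coder re-reads
//  its own parameters in Prepare().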
MS_CHECK_PTR_RET_NULL(conv_param); + int kernel_h = conv_param->kernel_h_; + int kernel_w = conv_param->kernel_w_; + conv_param->input_h_ = in_tensors.at(kInputIndex)->Height(); + conv_param->input_w_ = in_tensors.at(kInputIndex)->Width(); + conv_param->input_channel_ = in_tensors.at(kInputIndex)->Channel(); + conv_param->output_h_ = out_tensors.at(kOutputIndex)->Height(); + conv_param->output_w_ = out_tensors.at(kOutputIndex)->Width(); + conv_param->output_channel_ = out_tensors.at(kOutputIndex)->Channel(); + conv_param->op_parameter_.thread_num_ = 1; + free(conv_param); + std::unique_ptr coder; + if (kernel_h == 1 && kernel_w == 1) { + MS_LOG(DEBUG) << "create Convolution1x1DynamicFP16CPUKernel"; + coder = CPUOpCoderCreator(in_tensors, out_tensors, node, node_index, target, + schema_version); + } else { + MS_LOG(DEBUG) << "create ConvolutionDynamicFP16Coder"; + coder = + CPUOpCoderCreator(in_tensors, out_tensors, node, node_index, target, schema_version); + } + return coder; +} + +std::unique_ptr CreateConvDelegateFp16(const std::vector &in_tensors, + const std::vector &out_tensors, + const LiteGraph::Node *node, size_t node_index, Target target, + int schema_version) { + return CPUOpCoderCreator(in_tensors, out_tensors, node, node_index, target, + schema_version); +} + +std::unique_ptr CPUConv2DFusionDynamicFP16CoderCreator(const std::vector &in_tensors, + const std::vector &out_tensors, + const LiteGraph::Node *node, size_t node_index, + Target target, int schema_version) { + const void *primitive = node->primitive_; + if (primitive == nullptr) { + return nullptr; + } + ParameterGen param_gen = PopulateRegistry::GetInstance()->GetParameterCreator( + GetPrimitiveType(node->primitive_, schema_version), schema_version); + if (param_gen == nullptr) { + MS_LOG(ERROR) << "parameter generator is null"; + return nullptr; + } + auto conv_param = reinterpret_cast(param_gen(node->primitive_)); + std::unique_ptr coder; + if (conv_param->group_ == 1) { + coder = CreateConvDelegateFp16(in_tensors, out_tensors, node, node_index, target, schema_version); + } else { + // GroupConv + MS_LOG(ERROR) << "currently, only support conv_param->group_ == 1 in dynamic coder scene"; + return nullptr; + } + return coder; +} + +REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_Conv2DFusion, + CPUConv2DFusionDynamicFP16CoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_Conv2DFusion, + CPUConv2DFusionDynamicFP16CoderCreator) +} // namespace mindspore::lite::micro::nnacl diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/conv2d_delegate_dynamic_fp16_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/conv2d_delegate_dynamic_fp16_coder.h new file mode 100644 index 00000000..c352c469 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/conv2d_delegate_dynamic_fp16_coder.h @@ -0,0 +1,56 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_CONV2D_DELEGATE_DYNAMIC_FP16_CODER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_CONV2D_DELEGATE_DYNAMIC_FP16_CODER_H_ +#include +#include +#include "coder/opcoders/op_coder.h" +#include "coder/opcoders/nnacl/dynamic_parameter/conv_dynamic_parameter.h" +#include "nnacl/conv_parameter.h" + +namespace mindspore::lite::micro::nnacl { +class ConvDelegateDynamicFP16Coder : public OperatorCoder { + public: + ConvDelegateDynamicFP16Coder(const std::vector &in_tensors, const std::vector &out_tensors, + const LiteGraph::Node *node, size_t node_index, Target target) + : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {} + + ~ConvDelegateDynamicFP16Coder() override = default; + int Prepare(CoderContext *const context) override; + int DoCode(CoderContext *const context) override; + + protected: + std::unique_ptr conv_coder_ = nullptr; + ConvParameter *conv_param_{nullptr}; + ConvDynamicParameter dynamic_param_; + + private: + void SetInputOutputShapeInfo(ConvParameter *conv_param, ConvDynamicParameter &dynamic_param, + const lite::Tensor *input, const lite::Tensor *output); +}; + +std::unique_ptr CPUConvFP16DynamicCoderSelect(const std::vector &in_tensors, + const std::vector &out_tensors, + const LiteGraph::Node *node, size_t node_index, + Target target, int schema_version); + +std::unique_ptr CPUConv2DFusionDynamicFP16CoderCreator(const std::vector &in_tensors, + const std::vector &out_tensors, + const LiteGraph::Node *node, size_t node_index, + Target target, int schema_version); +} // namespace mindspore::lite::micro::nnacl +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_CONV2D_DELEGATE_DYNAMIC_FP16_CODER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/convolution_1x1_dynamic_fp16_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/convolution_1x1_dynamic_fp16_coder.cc new file mode 100644 index 00000000..c682b2ed --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/convolution_1x1_dynamic_fp16_coder.cc @@ -0,0 +1,252 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "coder/opcoders/nnacl/fp16/convolution_1x1_dynamic_fp16_coder.h" +#include +#include +#include "nnacl/fp32/winograd_utils.h" +#include "coder/opcoders/file_collector.h" +#include "coder/opcoders/parallel.h" +#include "coder/utils/coder_utils.h" + +namespace mindspore::lite::micro::nnacl { +int Convolution1x1DynamicFP16Coder::Prepare(CoderContext *const context) { + CHECK_LESS_RETURN(input_tensors_.size(), C2NUM); + CHECK_LESS_RETURN(output_tensors_.size(), 1); + for (size_t i = 0; i < input_tensors_.size(); ++i) { + MS_CHECK_TRUE_MSG(input_tensors_[i]->data_type() == kNumberTypeFloat16, RET_INPUT_PARAM_INVALID, + "Tensor data type is invalid"); + } + for (size_t i = 0; i < output_tensors_.size(); ++i) { + MS_CHECK_TRUE_MSG(output_tensors_[i]->data_type() == kNumberTypeFloat16, RET_PARAM_INVALID, + "Tensor data type is invalid"); + } + if (target_ == kARM64) { + row_tile_ = (output_tensor_->format() == NC4HW4) ? C16NUM : C12NUM; + col_tile_ = (output_tensor_->format() == NC4HW4) ? C8NUM : C16NUM; + } + if (matmul_param_ == nullptr) { + matmul_param_ = new (std::nothrow) MatMulParameter(); + if (matmul_param_ == nullptr) { + MS_LOG(ERROR) << "Init matmul_param_ failed."; + return RET_ERROR; + } + } + conv_param_ = reinterpret_cast(parameter_); + filter_tensor_ = input_tensors_.at(kWeightIndex); + MS_CHECK_PTR(filter_tensor_); + if (input_tensors_.size() == kInputSize2) { + bias_tensor_ = input_tensors_.at(kBiasIndex); + MS_CHECK_PTR(bias_tensor_); + } else { + MS_CHECK_TRUE(input_tensors_.size() == kInputSize1, "wrong input size"); + } + dynamic_param_.input_batch_ = shape_info_container_->GetTemplateShape(input_tensor_)[0]; + conv_param_->input_h_ = input_tensor_->Height(); + conv_param_->input_w_ = input_tensor_->Width(); + conv_param_->input_channel_ = input_tensor_->Channel(); + dynamic_param_.output_batch_ = shape_info_container_->GetTemplateShape(output_tensor_)[0]; + conv_param_->output_h_ = output_tensor_->Height(); + conv_param_->output_w_ = output_tensor_->Width(); + conv_param_->output_channel_ = output_tensor_->Channel(); + MS_CHECK_RET_CODE(InitWeightBias(context), "Init weight bias failed."); + MS_CHECK_RET_CODE(InitMatmulParam(), "Init matmul param failed."); + MS_CHECK_RET_CODE(InitTmpBuffer(context), "Init tmp buffer failed."); + return RET_OK; +} + +int Convolution1x1DynamicFP16Coder::DoCode(CoderContext *const context) { + CollectFilesForFunc(context); + NNaclFp32Serializer code; + MS_CHECK_RET_CODE(ComputeWorkspace(), "ComputeWorkspace failed."); + auto tmp_input_str = "(float16_t *)(" + allocator_->GetRuntimeAddr(static_cast(tmp_input_)) + ")"; + auto input_str = + "(float16_t *)(" + GetTensorAddr(input_tensor_, input_tensor_->IsConst(), dynamic_mem_manager_, allocator_) + ")"; + auto output_str = + "(float16_t *)(" + GetTensorAddr(output_tensor_, output_tensor_->IsConst(), dynamic_mem_manager_, allocator_) + ")"; + auto packed_weight_str = allocator_->GetRuntimeAddr(static_cast(packed_weight_)); + + code << " for (int batch_index = 0; batch_index < " << dynamic_param_.input_batch_ << "; batch_index++) {\n"; + output_ptr_ = output_str + " + batch_index * " + std::to_string(matmul_param_->row_ * matmul_param_->col_); + auto batch_in = input_str + " + batch_index * " + + std::to_string(conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_channel_); + if (pre_trans_input_) { + code.CodeStruct("conv_parameter", *conv_param_, dynamic_param_); + code.CodeFunction("Conv1x1InputPack", batch_in, tmp_input_str, "&conv_parameter", 
DataTypeSize(data_type_)); + } else { + tmp_input_str = batch_in; + } + + if (output_tensor_->format() == NC4HW4) { + code.CodeFunction(target_ == kARM64 ? "RowMajor2Col16MajorFp16Opt" : "RowMajor2Col12MajorFp16Opt", tmp_input_str, + "(float16_t *)(" + pack_input_str_ + ")", matmul_param_->row_, matmul_param_->deep_); + } else { + code.CodeFunction("RowMajor2Col12MajorFp16Opt", tmp_input_str, "(float16_t *)(" + pack_input_str_ + ")", + matmul_param_->row_, matmul_param_->deep_); + } + + if (output_tensor_->format() == NC4HW4) { + code.CodeStruct("matmul_param", *matmul_param_); + code.CodeFunction("Conv1x1OutNc8hw8MultiThreadByWeightFp16", tmp_input_str, + "(float16_t *)(" + pack_input_str_ + ")", packed_weight_str, bias_data_, output_ptr_, + kDefaultTaskId, "&matmul_param"); + } else { + code.CodeFunction(target_ == kARM64 ? "MatMul12x16Fp16Opt" : "MatMul12x8A32Fp16", + "(float16_t *)(" + pack_input_str_ + ")", packed_weight_str, output_ptr_, bias_data_, + matmul_param_->act_type_, matmul_param_->deep_, matmul_param_->row_, matmul_param_->col_, + matmul_param_->col_, OutType_Nhwc); + } + code << " }\n"; + context->AppendCode(code.str()); + return RET_OK; +} + +Convolution1x1DynamicFP16Coder::~Convolution1x1DynamicFP16Coder() { + FreeTmpBuffer(); + if (matmul_param_ != nullptr) { + delete matmul_param_; + matmul_param_ = nullptr; + } + return; +} + +void Convolution1x1DynamicFP16Coder::FreeTmpBuffer() { + if (pre_trans_input_ && tmp_input_ != nullptr) { + free(tmp_input_); + tmp_input_ = nullptr; + } + return; +} + +int Convolution1x1DynamicFP16Coder::ComputeWorkspace() { + pack_input_size_ = matmul_param_->row_align_ * matmul_param_->deep_ * DataTypeSize(data_type_); + auto input_shape = shape_info_container_->GetTemplateShape(input_tensor_); + size_t scene_num = 0; + for (auto &dim_template : input_shape) { + auto dim_nums = shape_info_container_->GetRealNums(dim_template); + MS_CHECK_TRUE_MSG(!dim_nums.empty(), RET_ERROR, "Dynamic shape's num must be greater than 0."); + scene_num = std::max(scene_num, dim_nums.size()); + } + for (size_t i = 0; i < scene_num; ++i) { + pack_input_str_ = dynamic_mem_manager_->AllocWorkSpace(pack_input_size_, i); + MS_CHECK_TRUE_MSG(!pack_input_str_.empty(), RET_ERROR, "Convolution cannot alloc workspace."); + } + return RET_OK; +} + +int Convolution1x1DynamicFP16Coder::InitMatmulParam() { + matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_; + matmul_param_->col_ = conv_param_->output_channel_; + matmul_param_->deep_ = conv_param_->input_channel_; + matmul_param_->row_align_ = UP_ROUND(matmul_param_->row_, row_tile_); + matmul_param_->col_align_ = UP_ROUND(matmul_param_->col_, col_tile_); + matmul_param_->act_type_ = conv_param_->act_type_; + return RET_OK; +} + +int Convolution1x1DynamicFP16Coder::InitWeightBias(CoderContext *const context) { + auto input_channel = filter_tensor_->Channel(); + auto output_channel = filter_tensor_->Batch(); + MS_CHECK_TRUE_RET(input_channel > 0 && output_channel > 0, RET_ERROR); + pack_weight_size_ = input_channel * UP_ROUND(output_channel, col_tile_) * DataTypeSize(data_type_); + packed_weight_ = allocator_->Malloc(data_type_, kOnlineSize, kOnlinePackWeight); + MS_CHECK_PTR(packed_weight_); + + NNaclFp32Serializer init_code; + std::string ori_weight_addr = allocator_->GetRuntimeAddr(filter_tensor_); + size_t w_buf_size = 0; + w_buf_size += pack_weight_size_; + auto packed_weight_str = MemoryAllocator::GetInstance()->GetRuntimeAddr(static_cast(packed_weight_)); + 
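//  Weight packing is deferred to the target: the calls emitted below go into the generated init
//  code, so the device packs the filter at model-init time. A rough sketch of the emitted code on
//  the ARM64 / non-NC4HW4 path (identifier names are assumptions, not taken from this diff):
//    packed_weight = weight_buf + weight_offset;
//    RowMajor2Col16MajorFp16Opt(origin_weight, packed_weight, output_channel, input_channel);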
init_code.CodeBufferOffsetExpression(packed_weight_, context->weight_name(), context->weight_offset_name(), + context->weight_size_name(), pack_weight_size_); + if (target_ == kARM64 && output_tensor_->format() != NC4HW4) { + init_code.CodeFunction("RowMajor2Col16MajorFp16Opt", ori_weight_addr, packed_weight_str, output_channel, + input_channel); + } else { + init_code.CodeFunction("ColMajor2Row8MajorFp16", ori_weight_addr, packed_weight_str, input_channel, output_channel, + true); + } + bias_data_size_ = UP_ROUND(output_channel, col_tile_) * DataTypeSize(data_type_); + if (input_tensors_.size() == kInputSize2) { + bias_data_ = + allocator_->Malloc(data_type_, kOnlineSize, kOnlinePackWeight, bias_tensor_->tensor_name() + "_online_pack"); + MS_CHECK_PTR(bias_data_); + init_code.CodeBufferOffsetExpression(bias_data_, context->weight_name(), context->weight_offset_name(), + context->weight_size_name(), bias_data_size_); + w_buf_size += bias_data_size_; + auto bias_data_str = MemoryAllocator::GetInstance()->GetRuntimeAddr(static_cast(bias_data_)); + std::string bias_tensor_str = allocator_->GetRuntimeAddr(bias_tensor_); + init_code.CodeFunction("memcpy", bias_data_str, bias_tensor_str, bias_tensor_->Size()); + } else { + bias_data_ = allocator_->Malloc(data_type_, kOnlineSize, kOnlinePackWeight, node_->name_ + "_bias_online_pack"); + MS_CHECK_PTR(bias_data_); + init_code.CodeFunction("memset", bias_data_, 0, bias_data_size_); + } + context->AppendInitWeightSizeCode(w_buf_size); + context->AppendInitCode(init_code.str()); + return RET_OK; +} + +int Convolution1x1DynamicFP16Coder::InitTmpBuffer(CoderContext *const context) { + NNaclFp32Serializer init_code; + pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 || + conv_param_->stride_w_ != 1); + size_t w_size = 0; + if (pre_trans_input_) { + tmp_input_size_ = matmul_param_->row_ * matmul_param_->deep_ * DataTypeSize(data_type_); + tmp_input_ = allocator_->Malloc(data_type_, kOnlineSize, kOnlinePackWeight); + MS_CHECK_PTR(tmp_input_); + w_size += tmp_input_size_; + auto tmp_input_str = allocator_->GetRuntimeAddr(static_cast(tmp_input_)); + init_code.CodeBufferOffsetExpression(tmp_input_, context->weight_name(), context->weight_offset_name(), + context->weight_size_name(), tmp_input_size_); + init_code.CodeFunction("memset", tmp_input_, 0, tmp_input_size_); + } + context->AppendInitWeightSizeCode(w_size); + context->AppendInitCode(init_code.str()); + return RET_OK; +} + +void Convolution1x1DynamicFP16Coder::CollectFilesForFunc(CoderContext *const context) { + if (target_ == kARM64) { + Collect(context, {}, {}, + { + "MatmulFp16.S", + "MatmulFp16Opt.S", + "Matmul12X16Fp16.S", + }); + } else { + Collect(context, {}, {}, + { + "Matmul12x8Fp16.S", + }); + } + Collect(context, + { + "nnacl/fp16/matmul_fp16.h", + "nnacl/conv_parameter.h", + "nnacl/op_base.h", + "nnacl/fp16/conv_fp16.h", + "nnacl/base/conv1x1_base.h", + }, + { + "common_func.c", + "matmul_fp16.c", + "conv_fp16.c", + "conv1x1_base.c", + }); +} +} // namespace mindspore::lite::micro::nnacl diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/convolution_1x1_dynamic_fp16_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/convolution_1x1_dynamic_fp16_coder.h new file mode 100644 index 00000000..558eea53 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/convolution_1x1_dynamic_fp16_coder.h @@ -0,0 +1,68 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + 
* Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_CONVOLUTION_1X1_DYNAMIC_FP16_CODER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_CONVOLUTION_1X1_DYNAMIC_FP16_CODER_H_ + +#include +#include +#include "nnacl/conv_parameter.h" +#include "nnacl/matmul_parameter.h" +#include "coder/opcoders/op_coder.h" +#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h" +#include "coder/opcoders/nnacl/dynamic_parameter/conv_dynamic_parameter.h" +#include "base/float16.h" + +namespace mindspore::lite::micro::nnacl { +class Convolution1x1DynamicFP16Coder final : public OperatorCoder { + public: + Convolution1x1DynamicFP16Coder(const std::vector &in_tensors, const std::vector &out_tensors, + const LiteGraph::Node *node, size_t node_index, Target target) + : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {} + ~Convolution1x1DynamicFP16Coder() override; + + int Prepare(CoderContext *const context) override; + + int DoCode(CoderContext *const context) override; + + private: + void CollectFilesForFunc(CoderContext *const context); + int InitWeightBias(CoderContext *const context); + int InitMatmulParam(); + int InitTmpBuffer(CoderContext *const context); + void FreeTmpBuffer(); + int ComputeWorkspace(); + MatMulParameter *matmul_param_{nullptr}; + ConvParameter *conv_param_{nullptr}; + ConvDynamicParameter dynamic_param_; + Tensor *filter_tensor_{nullptr}; + Tensor *bias_tensor_{nullptr}; + int row_tile_{C12NUM}; + int col_tile_{C8NUM}; + void *packed_weight_{nullptr}; + void *bias_data_{nullptr}; + std::string pack_input_str_; + void *tmp_input_{nullptr}; + size_t pack_weight_size_{0}; + size_t bias_data_size_{0}; + size_t tmp_input_size_{0}; + size_t pack_input_size_{0}; + bool pre_trans_input_{false}; + std::string output_ptr_; + TypeId data_type_ = kNumberTypeFloat16; +}; +} // namespace mindspore::lite::micro::nnacl +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_CONVOLUTION_1X1_DYNAMIC_FP16_CODER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/convolution_dynamic_fp16_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/convolution_dynamic_fp16_coder.cc new file mode 100644 index 00000000..c917b89a --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/convolution_dynamic_fp16_coder.cc @@ -0,0 +1,172 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "coder/opcoders/nnacl/fp16/convolution_dynamic_fp16_coder.h" +#include +#include +#include "coder/opcoders/nnacl/fp32/convolution_winograd_fp32_coder.h" +#include "nnacl/fp32/winograd_utils.h" +#include "coder/opcoders/file_collector.h" +#include "coder/log.h" +#include "coder/opcoders/parallel.h" +#include "coder/utils/coder_utils.h" +#include "base/float16.h" + +using mindspore::schema::PrimitiveType_Conv2DFusion; +namespace mindspore::lite::micro::nnacl { +int ConvolutionDynamicFP16Coder::Prepare(CoderContext *const context) { + CHECK_LESS_RETURN(input_tensors_.size(), C2NUM); + CHECK_LESS_RETURN(output_tensors_.size(), 1); + if (target_ == kARM64) { + row_tile_ = C16NUM; + } + conv_param_ = reinterpret_cast(parameter_); + MS_CHECK_PTR(conv_param_); + dynamic_param_.input_batch_ = shape_info_container_->GetTemplateShape(input_tensor_)[0]; + conv_param_->input_h_ = input_tensor_->Height(); + conv_param_->input_w_ = input_tensor_->Width(); + conv_param_->input_channel_ = input_tensor_->Channel(); + dynamic_param_.output_batch_ = shape_info_container_->GetTemplateShape(output_tensor_)[0]; + conv_param_->output_h_ = output_tensor_->Height(); + conv_param_->output_w_ = output_tensor_->Width(); + conv_param_->output_channel_ = output_tensor_->Channel(); + conv_param_->thread_num_ = 1; + MS_CHECK_RET_CODE(InitWeightBias(context), "Init weight bias failed."); + MS_CHECK_RET_CODE(InitTmpBuffer(), "Init tmp buffer failed."); + return RET_OK; +} + +int ConvolutionDynamicFP16Coder::InitTmpBuffer() { + int uint_size = conv_param_->kernel_h_ * conv_param_->kernel_w_ * conv_param_->input_channel_ * row_tile_ * + conv_param_->thread_num_; + packed_input_size_ = uint_size * DataTypeSize(data_type_); + auto input_shape = shape_info_container_->GetTemplateShape(input_tensor_); + size_t scene_num = 0; + for (auto &dim_template : input_shape) { + auto dim_nums = shape_info_container_->GetRealNums(dim_template); + MS_CHECK_TRUE_MSG(!dim_nums.empty(), RET_ERROR, "Dynamic shape's num must be greater than 0."); + scene_num = std::max(scene_num, dim_nums.size()); + } + for (size_t i = 0; i < scene_num; ++i) { + packed_input_str_ = dynamic_mem_manager_->AllocWorkSpace(packed_input_size_ * 2, i); + MS_CHECK_TRUE_MSG(!packed_input_str_.empty(), RET_ERROR, "Convolution cannot alloc workspace."); + } + col_major_input_str_ = packed_input_str_ + " + " + std::to_string(packed_input_size_); + return RET_OK; +} + +int ConvolutionDynamicFP16Coder::InitWeightBias(CoderContext *const context) { + filter_tensor_ = input_tensors_.at(kWeightIndex); + CHECK_NULL_RETURN(filter_tensor_); + auto shape = filter_tensor_->shape(); + if (std::find(shape.begin(), shape.end(), -1) != shape.end()) { + MS_LOG(WARNING) << "The shape of weight tensor is not ready, the weight and bias would be inited in runtime."; + return RET_OK; + } + int in_channel = filter_tensor_->Channel(); + int out_channel = filter_tensor_->Batch(); + MS_CHECK_TRUE_RET(in_channel > 0 && out_channel > 0, RET_ERROR); + conv_param_->input_channel_ = in_channel; + conv_param_->output_channel_ = out_channel; + int oc8 = UP_ROUND(out_channel, col_tile_); + int kernel_plane = filter_tensor_->Height() * filter_tensor_->Width(); + pack_weight_size_ = oc8 * in_channel * kernel_plane * DataTypeSize(data_type_); + // init weight + packed_weight_ = allocator_->Malloc(data_type_, kOnlineSize, kOnlinePackWeight); + MS_CHECK_PTR(packed_weight_); + 
NNaclFp32Serializer init_code; + std::string ori_weight_addr = allocator_->GetRuntimeAddr(filter_tensor_); + size_t w_buf_size = 0; + w_buf_size += pack_weight_size_; + auto packed_weight_str = allocator_->GetRuntimeAddr(static_cast(packed_weight_)); + init_code.CodeBufferOffsetExpression(packed_weight_, context->weight_name(), context->weight_offset_name(), + context->weight_size_name(), pack_weight_size_); + init_code.CodeFunction("RowMajor2Col8MajorFp16", ori_weight_addr, packed_weight_str, out_channel, + in_channel * kernel_plane, false); + if (input_tensors_.size() == C3NUM) { + bias_tensor_ = input_tensors_.at(kBiasIndex); + MS_CHECK_PTR(bias_tensor_); + bias_data_ = + allocator_->Malloc(data_type_, kOnlineSize, kOnlinePackWeight, bias_tensor_->tensor_name() + "_online_pack"); + MS_CHECK_PTR(bias_data_); + } else { + bias_data_ = allocator_->Malloc(data_type_, kOnlineSize, kOnlinePackWeight, node_->name_ + "_bias_online_pack"); + MS_CHECK_PTR(bias_data_); + } + auto bias_data_size = static_cast(oc8 * DataTypeSize(data_type_)); + w_buf_size += bias_data_size; + init_code.CodeBufferOffsetExpression(bias_data_, context->weight_name(), context->weight_offset_name(), + context->weight_size_name(), bias_data_size); + bias_data_str_ = allocator_->GetRuntimeAddr(bias_data_); + if (input_tensors_.size() == C3NUM) { + auto origin_bias_str = allocator_->GetRuntimeAddr(bias_tensor_); + init_code.CodeFunction("memcpy", bias_data_str_, origin_bias_str, bias_tensor_->Size()); + } else { + init_code.CodeFunction("memset", bias_data_str_, 0, bias_data_size); + } + context->AppendInitWeightSizeCode(w_buf_size); + context->AppendInitCode(init_code.str()); + return RET_OK; +} + +void ConvolutionDynamicFP16Coder::CollectFilesForFunc(CoderContext *const context) { + Collect(context, {}, {}, + { + "MatmulFp16.S", + "MatmulFp16Opt.S", + "MatVecMulFp16.S", + "Matmul12X16Fp16.S", + }); + Collect(context, + { + "nnacl/fp16/matmul_fp16.h", + "nnacl/conv_parameter.h", + "nnacl/op_base.h", + "nnacl/fp16/conv_fp16.h", + }, + { + "common_func.c", + "matmul_fp16.c", + "pack_fp16.c", + "conv_fp16.c", + }); +} + +int ConvolutionDynamicFP16Coder::DoCode(CoderContext *const context) { + CollectFilesForFunc(context); + NNaclFp32Serializer code; + // call the op function + auto packed_weight_str = allocator_->GetRuntimeAddr(static_cast(packed_weight_)); + auto input_str = + "(float16_t *)(" + GetTensorAddr(input_tensor_, input_tensor_->IsConst(), dynamic_mem_manager_, allocator_) + ")"; + auto output_str = + "(float16_t *)(" + GetTensorAddr(output_tensor_, output_tensor_->IsConst(), dynamic_mem_manager_, allocator_) + ")"; + // code.CodeFunction("memset", packed_input_str_, "0", packed_input_size_); + // code.CodeFunction("memset", col_major_input_str_, "0", packed_input_size_); + code.CodeStruct("conv_parameter", *conv_param_, dynamic_param_); + packed_input_str_ = "(float16_t *)(" + packed_input_str_ + ")"; + col_major_input_str_ = "(float16_t *)(" + col_major_input_str_ + ")"; + if (output_tensor_->format() == NC4HW4) { + code.CodeFunction("ConvOutNc8hw8Fp16", input_str, packed_input_str_, packed_weight_str, bias_data_str_, + col_major_input_str_, output_str, kDefaultTaskId, "&conv_parameter"); + } else { + code.CodeFunction("ConvFp16", input_str, packed_input_str_, packed_weight_str, bias_data_str_, col_major_input_str_, + output_str, kDefaultTaskId, "&conv_parameter"); + } + context->AppendCode(code.str()); + return RET_OK; +} +} // namespace mindspore::lite::micro::nnacl diff --git 
a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/convolution_dynamic_fp16_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/convolution_dynamic_fp16_coder.h new file mode 100644 index 00000000..29d70796 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/convolution_dynamic_fp16_coder.h @@ -0,0 +1,59 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_CONVOLUTION_DYNAMIC_FP16_CODER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_CONVOLUTION_DYNAMIC_FP16_CODER_H_ + +#include +#include +#include "nnacl/conv_parameter.h" +#include "coder/opcoders/op_coder.h" +#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h" +#include "coder/opcoders/nnacl/dynamic_parameter/conv_dynamic_parameter.h" + +namespace mindspore::lite::micro::nnacl { +class ConvolutionDynamicFP16Coder final : public OperatorCoder { + public: + ConvolutionDynamicFP16Coder(const std::vector &in_tensors, const std::vector &out_tensors, + const LiteGraph::Node *node, size_t node_index, Target target) + : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {} + + ~ConvolutionDynamicFP16Coder() override = default; + + int Prepare(CoderContext *const context) override; + int DoCode(CoderContext *const context) override; + + private: + void CollectFilesForFunc(CoderContext *const context); + int InitWeightBias(CoderContext *const context); + int InitTmpBuffer(); + ConvParameter *conv_param_{nullptr}; + ConvDynamicParameter dynamic_param_; + TypeId data_type_{kNumberTypeFloat16}; + int row_tile_{C12NUM}; + int col_tile_{C8NUM}; + Tensor *filter_tensor_{nullptr}; + Tensor *bias_tensor_{nullptr}; + size_t pack_weight_size_{0}; + size_t packed_input_size_{0}; + void *packed_weight_{nullptr}; + void *bias_data_{nullptr}; + std::string packed_input_str_; + std::string col_major_input_str_; + std::string bias_data_str_; +}; +} // namespace mindspore::lite::micro::nnacl +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_CONVOLUTION_DYNAMIC_FP16_CODER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/lstm_mindir_dynamic_fp16_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/lstm_mindir_dynamic_fp16_coder.cc new file mode 100644 index 00000000..8c4cc31b --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/lstm_mindir_dynamic_fp16_coder.cc @@ -0,0 +1,366 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "coder/opcoders/nnacl/fp16/lstm_mindir_dynamic_fp16_coder.h" +#include +#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h" +#include "coder/opcoders/file_collector.h" +#include "coder/utils/coder_utils.h" +#include "tools/common/string_util.h" + +using mindspore::schema::PrimitiveType_LSTM; + +namespace mindspore::lite::micro::nnacl { +namespace { +constexpr size_t kMindirInputTensorNum = 4; +} // namespace + +int LstmMindirDynamicFP16Coder::Prepare(CoderContext *const context) { + CHECK_NULL_RETURN(context); + CHECK_NOT_EQUAL_RETURN(input_tensors_.size(), kMindirInputTensorNum); + for (auto in : input_tensors_) { + MS_CHECK_TRUE_MSG(in != nullptr, RET_INPUT_TENSOR_ERROR, "LstmMindirDynamicFP16Coder input is a nullptr."); + MS_CHECK_TRUE_MSG(in->data_type() == kNumberTypeFloat16, RET_INPUT_TENSOR_ERROR, + "LstmMindirDynamicFP16Coder input must be fp16."); + MS_CHECK_TRUE_MSG(in->shape().size() == C3NUM, RET_INPUT_TENSOR_ERROR, + "LstmMindirDynamicFP16Coder input must be 3D."); + } + MS_CHECK_TRUE_MSG(input_tensors_[FOURTH_INPUT]->IsConst(), RET_INPUT_TENSOR_ERROR, + "LstmMindirDynamicFP16Coder last three inputs must be all constant."); + lstm_param_ = reinterpret_cast(parameter_); + return InitParam(); +} + +int LstmMindirDynamicFP16Coder::DoCode(CoderContext *const context) { + Collect(context, + { + "nnacl/lstm_parameter.h", + "nnacl/fp16/lstm_fp16.h", + }, + {"lstm_fp16.c", "activation_fp16.c", "arithmetic_fp16.c", "matmul_fp16.c", "pack_fp16.c"}, + {"MatmulBaseFp16Neon.S"}); + + auto ret = InitInputWeightBias(context); + MS_CHECK_TRUE_MSG(ret == RET_OK, ret, "Lstm InitInputWeightBias failed."); + ret = InitStateWeightBias(context); + MS_CHECK_TRUE_MSG(ret == RET_OK, ret, "Lstm InitStateWeightBias failed."); + ret = InitProjectWeight(context); + MS_CHECK_TRUE_MSG(ret == RET_OK, ret, "Lstm InitProjectWeight failed."); + ret = ComputeWorkSpace(); + MS_CHECK_TRUE_MSG(ret == RET_OK, ret, "Lstm ComputeWorkSpace failed."); + CreateBufferAddrStr(); + NNaclFp32Serializer code; + code << "float16_t *buffer[7] = {"; + for (const auto &buf : buffers_str_) { + code << "(float16_t *)(" << buf << "), "; + } + code << "};\n"; + + auto input1 = dynamic_mem_manager_->GetVarTensorAddr(input_tensors_[FIRST_INPUT]); + auto hidden_init = input_tensors_[SECOND_INPUT]->IsConst() + ? allocator_->GetRuntimeAddr(input_tensors_[SECOND_INPUT], true) + : dynamic_mem_manager_->GetVarTensorAddr(input_tensors_[SECOND_INPUT]); + auto cell_init = input_tensors_[THIRD_INPUT]->IsConst() + ? 
allocator_->GetRuntimeAddr(input_tensors_[THIRD_INPUT], true) + : dynamic_mem_manager_->GetVarTensorAddr(input_tensors_[THIRD_INPUT]); + auto output1 = dynamic_mem_manager_->GetVarTensorAddr(output_tensors_[FIRST_INPUT]); + auto hidden_output = dynamic_mem_manager_->GetVarTensorAddr(output_tensors_[SECOND_INPUT]); + auto cell_output = dynamic_mem_manager_->GetVarTensorAddr(output_tensors_[THIRD_INPUT]); + MS_CHECK_TRUE_MSG(!input1.empty() && !hidden_init.empty() && !cell_init.empty() && !output1.empty() && + !hidden_output.empty() && !cell_output.empty(), + RET_ERROR, "Lstm cannot get addr."); + code.CodeStruct("lstm_param", *lstm_param_, dynamic_lstm_param_); + auto input_shape2 = shape_info_container_->GetTemplateShape(input_tensors_[SECOND_INPUT]); + int64_t const_part = 1; + std::string non_const_part; + for (const auto &item : input_shape2) { + if (IsNumber(item)) { + const_part *= std::stoi(item); + } else { + if (!non_const_part.empty()) { + non_const_part += " * "; + } + non_const_part += item; + } + } + code.CodeFunction("memcpy", hidden_output, hidden_init, + non_const_part + " * " + std::to_string(const_part * DataTypeSize(kNumberTypeFloat16))); + auto input_shape3 = shape_info_container_->GetTemplateShape(input_tensors_[THIRD_INPUT]); + const_part = 1; + non_const_part = ""; + for (const auto &item : input_shape3) { + if (IsNumber(item)) { + const_part *= std::stoi(item); + } else { + if (!non_const_part.empty()) { + non_const_part += " * "; + } + non_const_part += item; + } + } + code.CodeFunction("memcpy", cell_output, cell_init, + non_const_part + " * " + std::to_string(const_part * DataTypeSize(kNumberTypeFloat16))); + auto weight_i_str = MemoryAllocator::GetInstance()->GetRuntimeAddr(static_cast(weight_i_ptr_)); + auto weight_h_str = MemoryAllocator::GetInstance()->GetRuntimeAddr(static_cast(weight_h_ptr_)); + auto weight_pro_str = MemoryAllocator::GetInstance()->GetRuntimeAddr(static_cast(weight_project_ptr_)); + auto input_bias_str = MemoryAllocator::GetInstance()->GetRuntimeAddr(static_cast(input_bias_)); + auto state_bias_str = MemoryAllocator::GetInstance()->GetRuntimeAddr(static_cast(hh_bias_)); + auto pro_bias_str = MemoryAllocator::GetInstance()->GetRuntimeAddr(static_cast(project_bias_)); + + code.CodeFunction("LstmFp16", "(float16_t *)(" + output1 + ")", "(float16_t *)(" + input1 + ")", weight_i_str, + weight_h_str, input_bias_str, state_bias_str, weight_pro_str, pro_bias_str, + "(float16_t *)(" + hidden_output + ")", "(float16_t *)(" + cell_output + ")", "buffer", + "&lstm_param"); + context->AppendCode(code.str()); + return RET_OK; +} + +int LstmMindirDynamicFP16Coder::InitParam() { + auto in_shape1 = shape_info_container_->GetTemplateShape(input_tensors_[FIRST_INPUT]); + MS_CHECK_TRUE_MSG(in_shape1.size() == C3NUM, RET_INPUT_TENSOR_ERROR, "LstmMindir first input's dim must be 3D."); + dynamic_lstm_param_.batch_ = in_shape1[1]; + dynamic_lstm_param_.seq_len_ = in_shape1[0]; + MS_CHECK_TRUE_MSG(IsNumber(in_shape1[C2NUM]), RET_NOT_SUPPORT, + "LstmMindir doesn't support input_size is dynamical in micro."); + lstm_param_->input_size_ = std::atoi(in_shape1[C2NUM].c_str()); + + auto h_init_shape = input_tensors_[SECOND_INPUT]->shape(); + auto c_init_shape = input_tensors_[THIRD_INPUT]->shape(); + lstm_param_->hidden_size_ = c_init_shape.back(); + lstm_param_->output_size_ = h_init_shape.back(); + + lstm_param_->output_step_ = lstm_param_->bidirectional_ ? 
C2NUM * lstm_param_->batch_ * lstm_param_->output_size_
+ : lstm_param_->batch_ * lstm_param_->output_size_;
+ weight_segment_num_ = lstm_param_->bidirectional_ ? C8NUM : C4NUM;
+ dynamic_lstm_param_.input_row_align_ =
+ "(" + dynamic_lstm_param_.batch_ + " * " + dynamic_lstm_param_.seq_len_ + " + 3) / 4 * 4";
+ lstm_param_->input_col_align_ = UP_ROUND(lstm_param_->hidden_size_, C4NUM);
+
+ dynamic_lstm_param_.state_row_align_ = "(" + dynamic_lstm_param_.batch_ + " + 3) / 4 * 4";
+ lstm_param_->state_col_align_ = UP_ROUND(lstm_param_->hidden_size_, C4NUM);
+ lstm_param_->proj_col_align_ = UP_ROUND(lstm_param_->project_size_, C4NUM);
+ dynamic_lstm_param_.output_step_ =
+ std::to_string((lstm_param_->bidirectional_ ? C2NUM : C1NUM) * lstm_param_->output_size_) + " * " +
+ dynamic_lstm_param_.batch_;
+ size_t scale = lstm_param_->bidirectional_ ? C2NUM : C1NUM;
+ hi_size_ = scale * C4NUM * lstm_param_->hidden_size_ * lstm_param_->input_size_;
+ hh_size_ = scale * C4NUM * lstm_param_->hidden_size_ * lstm_param_->output_size_;
+ hp_size_ = scale * lstm_param_->project_size_ * lstm_param_->hidden_size_;
+ bias_size_ = scale * C8NUM * lstm_param_->hidden_size_;
+ auto real_whole_size = input_tensors_[FOURTH_INPUT]->ElementsNum();
+ gpu_state_ = (hi_size_ + hh_size_ + hp_size_ + bias_size_) == static_cast(real_whole_size);
+ if (gpu_state_) {
+ MS_LOG(ERROR) << "LstmMindirDynamicFP16Coder doesn't support models exported from GPU.";
+ return RET_NOT_SUPPORT;
+ }
+ if (hi_size_ + hh_size_ + hp_size_ == static_cast(real_whole_size)) {
+ bias_size_ = 0;
+ return RET_OK;
+ }
+ bias_size_ /= C2NUM;
+ if ((hi_size_ + hh_size_ + hp_size_ + bias_size_) != static_cast(real_whole_size)) {
+ MS_LOG(ERROR) << "Bias of LstmMindir exported from CPU only exists in the hi-part.";
+ return RET_INPUT_TENSOR_ERROR;
+ }
+ return RET_OK;
+}
+
+int LstmMindirDynamicFP16Coder::InitInputWeightBias(CoderContext *const context) {
+ NNaclFp32Serializer init_code;
+
+ size_t weight_hi_size =
+ weight_segment_num_ * lstm_param_->input_col_align_ * lstm_param_->input_size_ * DataTypeSize(data_type_);
+ weight_i_ptr_ = allocator_->Malloc(data_type_, kOnlineSize, kOnlinePackWeight);
+ MS_CHECK_PTR(weight_i_ptr_);
+
+ size_t w_buf_size = 0;
+
+ init_code.CodeBufferOffsetExpression(weight_i_ptr_, context->weight_name(), context->weight_offset_name(),
+ context->weight_size_name(), weight_hi_size);
+ w_buf_size += weight_hi_size;
+ auto weight_i_str = MemoryAllocator::GetInstance()->GetRuntimeAddr(input_tensors_[FOURTH_INPUT]);
+ MS_CHECK_TRUE_MSG(!weight_i_str.empty(), RET_INPUT_TENSOR_ERROR, "Lstm cannot get weight.");
+ auto packed_weight_i_str = MemoryAllocator::GetInstance()->GetRuntimeAddr(reinterpret_cast(weight_i_ptr_));
+ init_code << " int32_t order[4] = {0, 2, 3, 1};\n";
+ init_code.CodeFunction("PackLstmWeightFp16", packed_weight_i_str, weight_i_str, weight_segment_num_,
+ lstm_param_->input_size_, lstm_param_->hidden_size_, lstm_param_->input_col_align_, "order");
+
+ auto bias_stride = hi_size_ + hh_size_ + hp_size_;
+ input_bias_ = allocator_->Malloc(data_type_, kOnlineSize, kOnlinePackWeight);
+ MS_CHECK_PTR(input_bias_);
+ size_t bias_i_size = weight_segment_num_ * lstm_param_->input_col_align_ * DataTypeSize(data_type_);
+ w_buf_size += bias_i_size;
+ init_code.CodeBufferOffsetExpression(input_bias_, context->weight_name(), context->weight_offset_name(),
+ context->weight_size_name(), bias_i_size);
+ auto input_bias_str = MemoryAllocator::GetInstance()->GetRuntimeAddr(reinterpret_cast(input_bias_));
+
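+ // The packed input bias buffer is zero-filled first; PackLstmBiasFp16 below only overwrites it when the
+ // combined weight tensor actually carries an hi-part bias (bias_size_ != 0).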
init_code.CodeFunction("memset", input_bias_str, 0, bias_i_size); + if (bias_size_ != 0) { + init_code.CodeFunction("PackLstmBiasFp16", input_bias_str, weight_i_str + " + " + std::to_string(bias_stride), + weight_segment_num_, lstm_param_->hidden_size_, lstm_param_->input_col_align_, + lstm_param_->bidirectional_, "order"); + } + + context->AppendInitWeightSizeCode(w_buf_size); + context->AppendInitCode(init_code.str()); + return RET_OK; +} + +int LstmMindirDynamicFP16Coder::InitStateWeightBias(CoderContext *const context) { + NNaclFp32Serializer init_code; + + size_t weight_hh_size = + weight_segment_num_ * lstm_param_->state_col_align_ * lstm_param_->project_size_ * DataTypeSize(data_type_); + weight_h_ptr_ = allocator_->Malloc(data_type_, kOnlineSize, kOnlinePackWeight); + MS_CHECK_PTR(weight_h_ptr_); + + size_t w_buf_size = 0; + + init_code.CodeBufferOffsetExpression(weight_h_ptr_, context->weight_name(), context->weight_offset_name(), + context->weight_size_name(), weight_hh_size); + w_buf_size += weight_hh_size; + auto weight_hh_str = MemoryAllocator::GetInstance()->GetRuntimeAddr(input_tensors_[FOURTH_INPUT]); + MS_CHECK_TRUE_MSG(!weight_hh_str.empty(), RET_INPUT_TENSOR_ERROR, "Lstm cannot get weight."); + auto packed_weight_hh_str = + MemoryAllocator::GetInstance()->GetRuntimeAddr(reinterpret_cast(weight_h_ptr_)); + init_code << " int32_t order[4] = {0, 2, 3, 1};\n"; + init_code.CodeFunction("PackLstmWeightFp16", packed_weight_hh_str, weight_hh_str + " + " + std::to_string(hi_size_), + weight_segment_num_, lstm_param_->project_size_, lstm_param_->hidden_size_, + lstm_param_->state_col_align_, "order"); + + hh_bias_ = allocator_->Malloc(data_type_, kOnlineSize, kOnlinePackWeight); + MS_CHECK_PTR(hh_bias_); + size_t bias_hh_size = weight_segment_num_ * lstm_param_->state_col_align_ * DataTypeSize(data_type_); + w_buf_size += bias_hh_size; + init_code.CodeBufferOffsetExpression(hh_bias_, context->weight_name(), context->weight_offset_name(), + context->weight_size_name(), bias_hh_size); + auto hh_bias_str = MemoryAllocator::GetInstance()->GetRuntimeAddr(reinterpret_cast(hh_bias_)); + init_code.CodeFunction("memset", hh_bias_str, 0, bias_hh_size); + + context->AppendInitWeightSizeCode(w_buf_size); + context->AppendInitCode(init_code.str()); + return RET_OK; +} + +int LstmMindirDynamicFP16Coder::InitProjectWeight(CoderContext *const context) { + if (hp_size_ == 0) { + return RET_OK; + } + + NNaclFp32Serializer init_code; + size_t w_buf_size = 0; + int scale = lstm_param_->bidirectional_ ? 
C2NUM : C1NUM; + int col_align = UP_ROUND(lstm_param_->project_size_, C8NUM); + size_t weight_pro_size = scale * lstm_param_->hidden_size_ * col_align * DataTypeSize(data_type_); + weight_project_ptr_ = allocator_->Malloc(data_type_, kOnlineSize, kOnlinePackWeight); + MS_CHECK_PTR(weight_project_ptr_); + init_code.CodeBufferOffsetExpression(weight_project_ptr_, context->weight_name(), context->weight_offset_name(), + context->weight_size_name(), weight_pro_size); + w_buf_size += weight_pro_size; + auto weight_hp_str = MemoryAllocator::GetInstance()->GetRuntimeAddr(input_tensors_[FOURTH_INPUT]); + MS_CHECK_TRUE_MSG(!weight_hp_str.empty(), RET_INPUT_TENSOR_ERROR, "Lstm cannot get weight."); + auto weight_pro_str = + MemoryAllocator::GetInstance()->GetRuntimeAddr(reinterpret_cast(weight_project_ptr_)); + init_code.CodeFunction("PackLstmWeightFp16", weight_pro_str, + weight_hp_str + " + " + std::to_string(hi_size_ + hh_size_), scale, lstm_param_->hidden_size_, + lstm_param_->project_size_, col_align, "NULL"); + + size_t bias_pro_size = col_align * DataTypeSize(data_type_); + project_bias_ = allocator_->Malloc(data_type_, kOnlineSize, kOnlinePackWeight); + MS_CHECK_PTR(project_bias_); + init_code.CodeBufferOffsetExpression(project_bias_, context->weight_name(), context->weight_offset_name(), + context->weight_size_name(), bias_pro_size); + w_buf_size += bias_pro_size; + auto bias_pro_str = MemoryAllocator::GetInstance()->GetRuntimeAddr(reinterpret_cast(project_bias_)); + init_code.CodeFunction("memset", bias_pro_str, 0, bias_pro_size); + + context->AppendInitWeightSizeCode(w_buf_size); + context->AppendInitCode(init_code.str()); + return RET_OK; +} + +int LstmMindirDynamicFP16Coder::ComputeWorkSpace() { + auto in_shape1 = shape_info_container_->GetTemplateShape(input_tensors_[FIRST_INPUT]); + auto seq_lens = shape_info_container_->GetRealNums(in_shape1[0]); + MS_CHECK_TRUE_MSG(!seq_lens.empty(), RET_ERROR, "Lstm cannot get seq_len"); + auto batches = shape_info_container_->GetRealNums(in_shape1[1]); + MS_CHECK_TRUE_MSG(!batches.empty(), RET_ERROR, "Lstm cannot get batch"); + size_t scene_num = seq_lens.size() > batches.size() ? seq_lens.size() : batches.size(); + for (size_t i = 0; i < scene_num; ++i) { + int seq_len = seq_lens[i % seq_lens.size()]; + int batch = batches[i % batches.size()]; + size_t buffer1 = + seq_len * batch <= C3NUM ? 0 : seq_len * batch * lstm_param_->input_size_ * DataTypeSize(data_type_); + size_t buffer2 = C4NUM * seq_len * batch * lstm_param_->hidden_size_ * DataTypeSize(data_type_); + size_t buffer3 = batch <= C3NUM ? 0 : batch * lstm_param_->output_size_ * DataTypeSize(data_type_); + size_t buffer4 = C4NUM * batch * lstm_param_->hidden_size_ * DataTypeSize(data_type_); + size_t buffer5 = (lstm_param_->zoneout_cell_ >= -FLT_EPSILON && lstm_param_->zoneout_cell_ <= FLT_EPSILON) + ? 0 + : batch * lstm_param_->hidden_size_ * DataTypeSize(data_type_); + size_t buffer6 = (lstm_param_->zoneout_hidden_ >= -FLT_EPSILON && lstm_param_->zoneout_hidden_ <= FLT_EPSILON) + ? 0 + : batch * lstm_param_->output_size_ * DataTypeSize(data_type_); + size_t buffer7 = (batch <= C3NUM || lstm_param_->project_size_ == 0) + ? 
0
+ : batch * lstm_param_->hidden_size_ * DataTypeSize(data_type_);
+ auto whole_size = buffer1 + buffer2 + buffer3 + buffer4 + buffer5 + buffer6 + buffer7;
+ buffers_start_ = dynamic_mem_manager_->AllocWorkSpace(whole_size, i);
+ MS_CHECK_TRUE_MSG(!buffers_start_.empty(), RET_ERROR, "Lstm cannot alloc workspace.");
+ }
+
+ return RET_OK;
+}
+
+void LstmMindirDynamicFP16Coder::CreateBufferAddrStr() {
+ auto in_shape1 = shape_info_container_->GetTemplateShape(input_tensors_[FIRST_INPUT]);
+ auto seq_len = in_shape1[0];
+ auto batch = in_shape1[1];
+ auto input_row_align = "(" + seq_len + " * " + batch + " + 3) / 4 * 4";
+ auto state_row_align = "(" + batch + " + 3) / 4 * 4";
+ buffers_str_.push_back("(" + seq_len + " * " + batch + " <= 3) ? NULL : " + buffers_start_);
+ auto offset = "((" + seq_len + " * " + batch + " <= 3) ? 0 : (" + seq_len + " * " + batch + ") * " +
+ std::to_string(lstm_param_->input_size_ * DataTypeSize(data_type_)) + ")";
+ buffers_str_.push_back(buffers_start_ + " + " + offset);
+ offset = "(" + offset + " + " + seq_len + " * " + batch + " * " +
+ std::to_string(C4NUM * lstm_param_->hidden_size_ * DataTypeSize(data_type_)) + ")";
+ buffers_str_.push_back(batch + " <= 3 ? NULL : (" + buffers_start_ + " + " + offset + ")");
+ offset = "(" + offset + " + (" + batch + " <= 3 ? 0 : (" + batch + ") * " +
+ std::to_string(lstm_param_->output_size_ * DataTypeSize(data_type_)) + "))";
+ buffers_str_.push_back(buffers_start_ + " + " + offset);
+ offset = "(" + offset + " + " + batch + " * " +
+ std::to_string(C4NUM * lstm_param_->hidden_size_ * DataTypeSize(data_type_)) + ")";
+ if (lstm_param_->zoneout_cell_ < -FLT_EPSILON || lstm_param_->zoneout_cell_ > FLT_EPSILON) {
+ buffers_str_.push_back(buffers_start_ + " + " + offset);
+ offset =
+ "(" + offset + " + " + batch + " * " + std::to_string(lstm_param_->hidden_size_ * DataTypeSize(data_type_)) + ")";
+ } else {
+ buffers_str_.emplace_back("NULL");
+ }
+ if (lstm_param_->zoneout_hidden_ < -FLT_EPSILON || lstm_param_->zoneout_hidden_ > FLT_EPSILON) {
+ buffers_str_.push_back(buffers_start_ + " + " + offset);
+ offset =
+ "(" + offset + " + " + batch + " * " + std::to_string(lstm_param_->output_size_ * DataTypeSize(data_type_)) + ")";
+ } else {
+ buffers_str_.emplace_back("NULL");
+ }
+ if (lstm_param_->project_size_ == 0) {
+ buffers_str_.emplace_back("NULL");
+ } else {
+ buffers_str_.emplace_back(batch + " <= 3 ? NULL : " + "(" + buffers_start_ + " + " + offset + ")");
+ }
+}
+REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_LSTM,
+ CPUOpCoderCreator)
+} // namespace mindspore::lite::micro::nnacl
diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/lstm_mindir_dynamic_fp16_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/lstm_mindir_dynamic_fp16_coder.h
new file mode 100644
index 00000000..1084fa82
--- /dev/null
+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/lstm_mindir_dynamic_fp16_coder.h
@@ -0,0 +1,66 @@
+/**
+ * Copyright 2023 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_LSTM_DYNAMIC_FP16_CODER_H +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_LSTM_DYNAMIC_FP16_CODER_H + +#include +#include +#include "nnacl/lstm_parameter.h" +#include "coder/opcoders/nnacl/dynamic_parameter/dynamic_lstm_parameter.h" +#include "coder/opcoders/op_coder.h" + +namespace mindspore::lite::micro::nnacl { + +class LstmMindirDynamicFP16Coder : public OperatorCoder { + public: + LstmMindirDynamicFP16Coder(const std::vector &in_tensors, const std::vector &out_tensors, + const LiteGraph::Node *node, size_t node_index, Target target) + : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {} + + ~LstmMindirDynamicFP16Coder() override = default; + + int Prepare(CoderContext *const context) override; + int DoCode(CoderContext *const context) override; + + private: + int InitParam(); + int ComputeWorkSpace(); + void CreateBufferAddrStr(); + int InitInputWeightBias(CoderContext *const context); + int InitStateWeightBias(CoderContext *const context); + int InitProjectWeight(CoderContext *const context); + bool gpu_state_{false}; + TypeId data_type_{kNumberTypeFloat16}; + int weight_segment_num_{0}; + size_t hi_size_{0}; + size_t hh_size_{0}; + size_t hp_size_{0}; + size_t bias_size_{0}; + void *weight_i_ptr_{nullptr}; + void *weight_h_ptr_{nullptr}; + void *weight_project_ptr_{nullptr}; + void *input_bias_{nullptr}; + void *hh_bias_{nullptr}; + void *project_bias_{nullptr}; + LstmParameter *lstm_param_{nullptr}; + DynamicLstmParameter dynamic_lstm_param_; + std::string buffers_start_; + std::vector buffers_str_; +}; +} // namespace mindspore::lite::micro::nnacl + +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_LSTM_DYNAMIC_FP16_CODER_H diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_base_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_base_coder.cc new file mode 100644 index 00000000..f6c56f86 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_base_coder.cc @@ -0,0 +1,228 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_base_coder.h"
+#include
+#include
+#include "tools/converter/micro/coder/log.h"
+#include "tools/converter/micro/coder/opcoders/file_collector.h"
+#include "base/float16.h"
+#include "tools/common/string_util.h"
+#include "coder/utils/coder_utils.h"
+
+using mindspore::schema::PrimitiveType_MatMulFusion;
+
+namespace mindspore::lite::micro::nnacl {
+int MatMulDynamicFP16BaseCoder::Prepare(CoderContext *const context) {
+ row_tile_ = C1NUM;
+ col_tile_ = C4NUM;
+ auto ret = InitAShape();
+ MS_CHECK_TRUE_MSG(ret == RET_OK, RET_ERROR, "init matrix A's info failed");
+ ret = InitBShape();
+ MS_CHECK_TRUE_MSG(ret == RET_OK, RET_ERROR, "init matrix B's info failed");
+ params_->col_align_ = UP_ROUND(params_->col_, col_tile_);
+ return RET_OK;
+}
+
+int MatMulDynamicFP16BaseCoder::DoCode(CoderContext *const context) {
+ CollectFilesForTarget(context);
+ auto ret = InitMatrixB(context);
+ MS_CHECK_TRUE_MSG(ret == RET_OK, ret, "InitMatrixB failed.");
+ ret = InitBiasData(context);
+ MS_CHECK_TRUE_MSG(ret == RET_OK, ret, "InitBiasData failed.");
+
+ ret = ComputeWorkSpace();
+ MS_CHECK_TRUE_MSG(ret == RET_OK, ret, "Matmul alloc workspace failed.");
+ auto input_a_str = dynamic_mem_manager_->GetVarTensorAddr(input_tensor_);
+ MS_CHECK_TRUE_MSG(!input_a_str.empty(), RET_ERROR, "Matmul cannot get matrixA");
+ auto output_str = dynamic_mem_manager_->GetVarTensorAddr(output_tensor_);
+ MS_CHECK_TRUE_MSG(!output_str.empty(), RET_ERROR, "Matmul cannot get output");
+ NNaclFp32Serializer code;
+ if (params_->a_transpose_) {
+ code << " if (" << dynamic_params_.row_ << " == 1) {\n";
+ code << " if (" << dynamic_params_.batch_ << " <= 3) {\n";
+ code.CodeFunction("MatmulFp16OptV2", "(float16_t *)(" + input_a_str + ")", input_b_pack_str_,
+ "(float16_t *)(" + output_str + ")", bias_str_, params_->act_type_, params_->deep_,
+ dynamic_params_.batch_, params_->col_, params_->col_, OutType_Nhwc);
+ code << " } else {\n";
+ code.CodeFunction("RowMajor2ColLadder12MajorFp16", "(float16_t *)(" + input_a_str + ")",
+ "(float16_t *)(" + buffer_start_ + ")", dynamic_params_.batch_, params_->deep_);
+ code.CodeFunction("MatmulFp16OptV2", "(float16_t *)(" + buffer_start_ + ")", input_b_pack_str_,
+ "(float16_t *)(" + output_str + ")", bias_str_, params_->act_type_, params_->deep_,
+ dynamic_params_.batch_, params_->col_, params_->col_, OutType_Nhwc);
+ code << " }\n";
+ code << " } else {\n";
+ code << " int in_stride = " << dynamic_params_.row_ << " * " << params_->deep_ << ";\n";
+ code << " int out_stride = " << dynamic_params_.row_ << " * " << params_->col_ << ";\n";
+ code << " for (int i = 0; i < " << dynamic_params_.batch_ << "; ++i) {\n";
+ code.CodeFunction("RowMajor2RowLadder12MajorFp16", "(float16_t *)(" + input_a_str + ")" + " + in_stride * i",
+ "(float16_t *)(" + buffer_start_ + ")", params_->deep_, dynamic_params_.row_);
+ code.CodeFunction("MatmulFp16OptV2", "(float16_t *)(" + buffer_start_ + ")", input_b_pack_str_,
+ "(float16_t *)(" + output_str + ")" + " + out_stride * i", bias_str_, params_->act_type_,
+ params_->deep_, dynamic_params_.row_, params_->col_, OutType_Nhwc);
+ code << " }\n";
+ } else {
+ code << " if (" << dynamic_params_.batch_ << " * " << dynamic_params_.row_ << " <= 3) {\n";
+ code.CodeFunction("MatmulFp16OptV2", "(float16_t *)(" + input_a_str + ")", input_b_pack_str_,
+ "(float16_t *)(" + output_str + ")", bias_str_, params_->act_type_, params_->deep_,
+ dynamic_params_.batch_ + " * " +
dynamic_params_.row_, params_->col_, params_->col_, + OutType_Nhwc); + code << " } else {\n"; + code.CodeFunction("RowMajor2ColLadder12MajorFp16", "(float16_t *)(" + input_a_str + ")", + "(float16_t *)(" + buffer_start_ + ")", dynamic_params_.batch_ + " * " + dynamic_params_.row_, + params_->deep_); + code.CodeFunction("MatmulFp16OptV2", "(float16_t *)(" + buffer_start_ + ")", input_b_pack_str_, + "(float16_t *)(" + output_str + ")", bias_str_, params_->act_type_, params_->deep_, + dynamic_params_.batch_ + " * " + dynamic_params_.row_, params_->col_, params_->col_, + OutType_Nhwc); + } + code << " }\n"; + context->AppendCode(code.str()); + return RET_OK; +} + +int MatMulDynamicFP16BaseCoder::InitMatrixB(CoderContext *const context) { + NNaclFp32Serializer init_code; + if (b_pack_ptr_ != nullptr) { + return RET_OK; + } + auto b_pack_ptr_size = static_cast(params_->col_align_ * params_->deep_ * DataTypeSize(data_type_)); + b_pack_ptr_ = allocator_->GetSharedWeightAddr(filter_tensor_); + if (b_pack_ptr_ == nullptr) { + b_pack_ptr_ = allocator_->Malloc(data_type_, b_pack_ptr_size, kOnlinePackWeight, + filter_tensor_->tensor_name() + "_online_pack"); + allocator_->MarkSharedWeight(filter_tensor_, b_pack_ptr_); + } + MS_CHECK_PTR(b_pack_ptr_); + std::string input_b_str = allocator_->GetRuntimeAddr(filter_tensor_); + input_b_pack_str_ = allocator_->GetRuntimeAddr(static_cast(b_pack_ptr_)); + init_code.CodeBufferOffsetExpression(b_pack_ptr_, context->weight_name(), context->weight_offset_name(), + context->weight_size_name(), b_pack_ptr_size); + if (b_batch_ == C1NUM) { + if (params_->b_transpose_) { + init_code.CodeFunction("RowMajor2ColNMajorFp16", input_b_str, input_b_pack_str_, params_->col_, params_->deep_, + "false"); + } else { + init_code.CodeFunction("RowMajor2RowNMajorFp16", input_b_str, input_b_pack_str_, params_->deep_, params_->col_, + "false"); + } + } else { + init_code << " for (int i = 0; i < " << b_batch_ << "; i++) {\n" + << " float16_t *src = " << input_b_str << " + i * " << params_->deep_ * params_->col_ << ";\n" + << " float16_t *dst = " << input_b_pack_str_ << " + i * " << params_->deep_ * params_->col_align_ + << ";\n"; + if (params_->b_transpose_) { + init_code << " RowMajor2ColNMajorFp16(src, dst, " << params_->col_ << ", " << params_->deep_ << ", false);\n"; + } else { + init_code << " RowMajor2RowNMajorFp16(src, dst, " << params_->deep_ << ", " << params_->col_ << ", false);\n"; + } + init_code << " }\n"; + } + context->AppendInitWeightSizeCode(b_pack_ptr_size); + context->AppendInitCode(init_code.str()); + return RET_OK; +} + +int MatMulDynamicFP16BaseCoder::InitBiasData(CoderContext *const context) { + NNaclFp32Serializer init_code; + if (bias_ptr_ != nullptr) { + return RET_OK; + } + auto bias_pack_ptr_size = static_cast(params_->col_align_ * DataTypeSize(data_type_)); + if (input_tensors_.size() == C3NUM) { + bias_ptr_ = + allocator_->Malloc(data_type_, kOnlineSize, kOnlinePackWeight, bias_tensor_->tensor_name() + "_online_pack"); + MS_CHECK_PTR(bias_ptr_); + } else { + bias_ptr_ = allocator_->Malloc(data_type_, kOnlineSize, kOnlinePackWeight, node_->name_ + "_bias_online_pack"); + MS_CHECK_PTR(bias_ptr_); + } + init_code.CodeBufferOffsetExpression(bias_ptr_, context->weight_name(), context->weight_offset_name(), + context->weight_size_name(), bias_pack_ptr_size); + bias_str_ = allocator_->GetRuntimeAddr(bias_ptr_); + if (input_tensors_.size() == DIMENSION_3D) { + auto origin_bias_str = allocator_->GetRuntimeAddr(bias_tensor_); + init_code.CodeFunction("memcpy", 
bias_str_, origin_bias_str, bias_tensor_->Size()); + } else { + init_code.CodeFunction("memset", bias_str_, 0, bias_pack_ptr_size); + } + context->AppendInitWeightSizeCode(bias_pack_ptr_size); + context->AppendInitCode(init_code.str()); + return RET_OK; +} + +int MatMulDynamicFP16BaseCoder::ComputeWorkSpace() { + auto a_shape = shape_info_container_->GetTemplateShape(input_tensor_); + std::map> real_nums; + size_t scene_num = 0; + for (auto &dim_template : a_shape) { + auto dim_nums = shape_info_container_->GetRealNums(dim_template); + MS_CHECK_TRUE_MSG(!dim_nums.empty(), RET_ERROR, "Dynamic shape's num must be greater than 0."); + real_nums[dim_template] = dim_nums; + scene_num = std::max(scene_num, dim_nums.size()); + } + for (size_t i = 0; i < scene_num; ++i) { + std::vector real_shape(a_shape.size()); + for (size_t j = 0; j < a_shape.size(); ++j) { + if (IsNumber(a_shape[j])) { + real_shape[j] = std::stoi(a_shape[j]); + } else { + real_shape[j] = real_nums[a_shape[j]][i % real_nums[a_shape[j]].size()]; + } + } + int a_batch = 1; + for (size_t j = 0; j < a_shape.size() - C2NUM; ++j) { + MS_CHECK_INT_MUL_NOT_OVERFLOW(a_batch, real_shape[j], RET_ERROR); + a_batch *= real_shape[j]; + } + int row = params_->a_transpose_ ? real_shape.back() : real_shape[real_shape.size() - C2NUM]; + int deep = params_->a_transpose_ ? real_shape[real_shape.size() - C2NUM] : real_shape.back(); + MS_CHECK_TRUE_MSG(deep == params_->deep_, RET_INPUT_TENSOR_ERROR, + "Matmul's matrixA doesn't match matrixB, becase their deeps are not same."); + int workspace = 0; + if (params_->a_transpose_) { + workspace = (row == 1 ? (a_batch <= C3NUM ? 0 : UP_ROUND(a_batch, row_tile_)) : UP_ROUND(row, row_tile_)) * deep; + } else { + workspace = (a_batch * row <= C3NUM ? 0 : UP_ROUND(a_batch * row, row_tile_)) * deep; + } + buffer_start_ = dynamic_mem_manager_->AllocWorkSpace(workspace, i); + MS_CHECK_TRUE_MSG(!buffer_start_.empty(), RET_ERROR, "Matmul cannot alloc workspace."); + } + return RET_OK; +} + +int MatMulDynamicFP16BaseCoder::CollectFilesForTarget(CoderContext *const context) { + Collect(context, + { + "nnacl/fp16/pack_fp16.h", + "nnacl/fp16/matmul_fp16.h", + }, + { + "pack_fp16.c", + "matmul_fp16.c", + }); + if (target_ == kARM32) { + Collect(context, {}, {}, + { + "Matmul12x8Fp16.S", + "MatVecMulFp16.S", + }); + } else if (target_ == kARM64) { + Collect(context, {}, {}, {"MatmulFp16OptV2.S"}); + } + return RET_OK; +} +} // namespace mindspore::lite::micro::nnacl diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_base_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_base_coder.h new file mode 100644 index 00000000..f73cfff7 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_base_coder.h @@ -0,0 +1,73 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_MATMUL_FP16_BASE_CODER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_MATMUL_FP16_BASE_CODER_H_ + +#include +#include +#include "tools/converter/micro/coder/opcoders/op_coder.h" +#include "tools/converter/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h" +#include "nnacl/matmul_parameter.h" +#include "tools/converter/micro/coder/shape_info_container.h" +#include "tools/converter/micro/coder/dynamic_mem_manager.h" +#include "base/float16.h" +#include "coder/opcoders/nnacl/dynamic_parameter/matmul_dynamic_parameter.h" + +namespace mindspore::lite::micro::nnacl { +class MatMulDynamicFP16BaseCoder : public OperatorCoder { + public: + MatMulDynamicFP16BaseCoder(const std::vector &in_tensors, const std::vector &out_tensors, + const LiteGraph::Node *node, size_t node_index, Target target) + : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {} + + ~MatMulDynamicFP16BaseCoder() override = default; + + int Prepare(CoderContext *const context) override; + + int DoCode(CoderContext *const context) override; + + private: + int InitBiasData(CoderContext *const context); + int InitMatrixB(CoderContext *const context); + int CollectFilesForTarget(CoderContext *const context); + int ComputeWorkSpace(); + + protected: + virtual int InitAShape() = 0; + virtual int InitBShape() = 0; + + protected: + Tensor *filter_tensor_{nullptr}; + Tensor *bias_tensor_{nullptr}; + MatMulParameter *params_{nullptr}; + MatmulDynamicParameter dynamic_params_; + void *a_pack_ptr_ = nullptr; + void *b_pack_ptr_ = nullptr; + void *bias_ptr_{nullptr}; + int col_tile_{0}; + int row_tile_{0}; + size_t a_pack_ptr_size_{0}; + TypeId data_type_{kNumberTypeFloat16}; + int a_batch_; + int b_batch_; + std::string buffer_start_; + std::string bias_str_; + std::string input_a_pack_str_; + std::string input_b_pack_str_; +}; +} // namespace mindspore::lite::micro::nnacl +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_MATMUL_FP16_BASE_CODER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_coder.cc new file mode 100644 index 00000000..24cf7120 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_coder.cc @@ -0,0 +1,100 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_coder.h" +#include +#include "coder/log.h" +#include "coder/opcoders/file_collector.h" +#include "tools/common/string_util.h" +#include "coder/utils/coder_utils.h" + +using mindspore::schema::PrimitiveType_MatMulFusion; + +namespace mindspore::lite::micro::nnacl { +int MatMulDynamicFP16Coder::InitAShape() { + auto a_shape = shape_info_container_->GetTemplateShape(input_tensor_); + auto a_shape_size = a_shape.size(); + MS_CHECK_TRUE_MSG(a_shape_size >= DIMENSION_2D, RET_NOT_SUPPORT, "Matmul's a_shape_size must be not less than two."); + int64_t const_part = 1; + std::string non_const_part; + for (size_t i = 0; i < a_shape_size - C2NUM; ++i) { + if (IsNumber(a_shape[i])) { + const_part *= std::atoi(a_shape[i].c_str()); + } else { + if (!non_const_part.empty()) { + non_const_part += " * "; + } + non_const_part += a_shape[i]; + } + } + dynamic_params_.batch_ = non_const_part + " * " + std::to_string(const_part); + dynamic_params_.row_ = params_->a_transpose_ ? a_shape[a_shape.size() - C1NUM] : a_shape[a_shape.size() - C2NUM]; + return RET_OK; +} + +int MatMulDynamicFP16Coder::InitBShape() { + std::vector b_shape = filter_tensor_->shape(); + MS_CHECK_TRUE_MSG(b_shape.size() >= DIMENSION_2D, RET_NOT_SUPPORT, + "Matmul's b_shape_size must be not less than two."); + int batch = 1; + for (size_t i = 0; i < b_shape.size() - DIMENSION_2D; ++i) { + batch *= b_shape[i]; + } + if (batch != 1) { + MS_LOG(ERROR) << "Currently, Matmul only support matrixB's batch is 1."; + } + b_batch_ = batch; + params_->col_ = params_->b_transpose_ ? b_shape[b_shape.size() - C2NUM] : b_shape[b_shape.size() - C1NUM]; + params_->col_8_ = UP_ROUND(params_->col_, C8NUM); + params_->deep_ = params_->b_transpose_ ? b_shape[b_shape.size() - C1NUM] : b_shape[b_shape.size() - C2NUM]; + return RET_OK; +} + +int MatMulDynamicFP16Coder::Prepare(CoderContext *const context) { + for (size_t i = 0; i < input_tensors_.size(); ++i) { + MS_CHECK_TRUE_MSG(input_tensors_[i]->data_type() == kNumberTypeFloat16, RET_INPUT_PARAM_INVALID, + "Input tensor data type is invalid."); + } + MS_CHECK_TRUE_MSG(output_tensor_->data_type() == kNumberTypeFloat16, RET_INPUT_PARAM_INVALID, + "Input tensor data type is invalid."); + MS_CHECK_TRUE_MSG(input_tensors_.size() == C2NUM || input_tensors_.size() == C3NUM, RET_INPUT_PARAM_INVALID, + "MatMul's input-num must be 2 or 3."); + MS_CHECK_TRUE_MSG(input_tensors_[SECOND_INPUT]->IsConst(), RET_NOT_SUPPORT, + "Currently, only support the first input of matmul is non-const when shape is dynamical."); + if (input_tensors_.size() == C3NUM) { + MS_CHECK_TRUE_MSG(input_tensors_[THIRD_INPUT]->IsConst(), RET_NOT_SUPPORT, + "Currently, only support the first input of matmul is non-const when shape is dynamical."); + } + params_ = reinterpret_cast(parameter_); + filter_tensor_ = input_tensors_.at(kWeightIndex); + MS_CHECK_PTR(filter_tensor_); + if (input_tensors_.size() == kInputSize2) { + bias_tensor_ = input_tensors_.at(kBiasIndex); + MS_CHECK_PTR(bias_tensor_); + MS_CHECK_PTR(bias_tensor_->data()); + } + params_->a_const_ = (input_tensor_->data() != nullptr); + params_->b_const_ = (filter_tensor_->data() != nullptr); + MS_CHECK_RET_CODE(MatMulDynamicFP16BaseCoder::Prepare(context), "MatMulDynamicFP16Coder prepare failed"); + return RET_OK; +} + +int MatMulDynamicFP16Coder::DoCode(CoderContext *const context) { return MatMulDynamicFP16BaseCoder::DoCode(context); } + +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_MatMulFusion, + 
CPUOpCoderCreator) +// REG_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_MatMulFusion, CPUOpCoderCreator) +} // namespace mindspore::lite::micro::nnacl diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_coder.h new file mode 100644 index 00000000..1a16798c --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_coder.h @@ -0,0 +1,44 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_MATMUL_DYNAMIC_FP16_CODER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_MATMUL_DYNAMIC_FP16_CODER_H_ + +#include +#include "tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_dynamic_fp16_base_coder.h" +#include "nnacl/matmul_parameter.h" +#include "tools/converter/micro/coder/shape_info_container.h" +#include "tools/converter/micro/coder/dynamic_mem_manager.h" + +namespace mindspore::lite::micro::nnacl { +class MatMulDynamicFP16Coder final : public MatMulDynamicFP16BaseCoder { + public: + MatMulDynamicFP16Coder(const std::vector &in_tensors, const std::vector &out_tensors, + const LiteGraph::Node *node, size_t node_index, Target target) + : MatMulDynamicFP16BaseCoder(in_tensors, out_tensors, node, node_index, target) {} + + ~MatMulDynamicFP16Coder() override = default; + + int Prepare(CoderContext *const context) override; + + int DoCode(CoderContext *const context) override; + + private: + int InitAShape() override; + int InitBShape() override; +}; +} // namespace mindspore::lite::micro::nnacl +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_MATMUL_DYNAMIC_FP16_CODER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_fp16_base_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_fp16_base_coder.cc index 67f633fe..415e912d 100644 --- a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_fp16_base_coder.cc +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/matmul_fp16_base_coder.cc @@ -102,14 +102,15 @@ std::string MatMulFP16BaseCoder::InitMatrixA(NNaclFp32Serializer *const code, NN if (a_batch_ == 1) { if (params_.a_transpose_) { if (target_ == kARM64) { - pack_code.CodeFunction("RowMajor2RowNMajorFp16", input_a_str, input_a_pack_str, params_.deep_, params_.row_); + pack_code.CodeFunction("RowMajor2RowNMajorFp16", input_a_str, input_a_pack_str, params_.deep_, params_.row_, + "false"); } else { pack_code.CodeFunction("RowMajor2Row12MajorFp16", input_a_str, input_a_pack_str, params_.deep_, params_.row_, false); } } else { if (target_ == kARM64) { - pack_code.CodeFunction("RowMajor2ColNMajorFp16", input_a_str, input_a_pack_str, params_.row_, params_.deep_); + pack_code.CodeFunction("RowMajor2ColNMajorFp16", input_a_str, 
input_a_pack_str, params_.row_, params_.deep_, false); } else { pack_code.CodeFunction("RowMajor2Col12MajorFp16", input_a_str, input_a_pack_str, params_.row_, params_.deep_, false); @@ -122,13 +123,13 @@ std::string MatMulFP16BaseCoder::InitMatrixA(NNaclFp32Serializer *const code, NN << ";\n"; if (params_.a_transpose_) { if (target_ == kARM64) { - pack_code << " RowMajor2RowNMajorFp16(src, dst, " << params_.deep_ << ", " << params_.row_ << ");\n"; + pack_code << " RowMajor2RowNMajorFp16(src, dst, " << params_.deep_ << ", " << params_.row_ << ", false);\n"; } else { pack_code << " RowMajor2Row12MajorFp16(src, dst, " << params_.deep_ << ", " << params_.row_ << ", false);\n"; } } else { if (target_ == kARM64) { - pack_code << " RowMajor2ColNMajorFp16(src, dst, " << params_.row_ << ", " << params_.deep_ << ");\n"; + pack_code << " RowMajor2ColNMajorFp16(src, dst, " << params_.row_ << ", " << params_.deep_ << ", false);\n"; } else { pack_code << " RowMajor2Col12MajorFp16(src, dst, " << params_.row_ << ", " << params_.deep_ << ", false);\n"; } diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/pooling_dynamic_fp16_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/pooling_dynamic_fp16_coder.cc new file mode 100644 index 00000000..c565f5b2 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/pooling_dynamic_fp16_coder.cc @@ -0,0 +1,89 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "coder/opcoders/nnacl/fp16/pooling_dynamic_fp16_coder.h" +#include +#include +#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h" +#include "coder/log.h" +#include "coder/opcoders/parallel.h" +#include "coder/opcoders/file_collector.h" +#include "coder/utils/coder_utils.h" + +using mindspore::schema::PrimitiveType_AvgPoolFusion; +using mindspore::schema::PrimitiveType_MaxPoolFusion; + +namespace mindspore::lite::micro::nnacl { +int PoolingDynamicFP16Coder::Prepare(CoderContext *const context) { + if (input_tensor_->data_type() != kNumberTypeFloat16 || output_tensor_->data_type() != kNumberTypeFloat16) { + MS_LOG(ERROR) << "Tensor data type is invalid"; + return lite::RET_INPUT_PARAM_INVALID; + } + param_ = reinterpret_cast(parameter_); + MS_CHECK_PTR(param_); + dynamic_param_.input_batch_ = shape_info_container_->GetTemplateShape(input_tensor_)[0]; + compute_.input_channel_ = input_tensor_->Channel(); + compute_.input_h_ = input_tensor_->Height(); + compute_.input_w_ = input_tensor_->Width(); + dynamic_param_.output_batch_ = shape_info_container_->GetTemplateShape(output_tensor_)[0]; + compute_.output_channel_ = output_tensor_->Channel(); + compute_.output_h_ = output_tensor_->Height(); + compute_.output_w_ = output_tensor_->Width(); + if (param_->global_) { + param_->window_h_ = compute_.input_h_; + param_->window_w_ = compute_.input_w_; + } + return RET_OK; +} + +int PoolingDynamicFP16Coder::DoCode(CoderContext *const context) { + Collect(context, + { + "nnacl/fp16/pooling_fp16.h", + }, + { + "pooling_fp16.c", + }); + NNaclFp32Serializer code; + code.CodeStruct("pooling_parameter", *param_); + code.CodeStruct("pooling_compute", compute_, dynamic_param_); + + auto input_data = + "(float16_t *)(" + GetTensorAddr(input_tensor_, input_tensor_->IsConst(), dynamic_mem_manager_, allocator_) + ")"; + auto output_data = + "(float16_t *)(" + GetTensorAddr(output_tensor_, output_tensor_->IsConst(), dynamic_mem_manager_, allocator_) + ")"; + if (param_->pool_mode_ == PoolMode_MaxPool) { + code.CodeFunction("MaxPoolingFp16", input_data, output_data, "&pooling_parameter", "&pooling_compute", + kDefaultTaskId, param_->op_parameter_.thread_num_); + } else if (param_->pool_mode_ == PoolMode_AvgPool) { + code.CodeFunction("AvgPoolingFp16", input_data, output_data, "&pooling_parameter", "&pooling_compute", + kDefaultTaskId, param_->op_parameter_.thread_num_); + } else { + MS_LOG(ERROR) << "Unsupported pooling mode."; + return lite::RET_ERROR; + } + context->AppendCode(code.str()); + return lite::RET_OK; +} + +REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_AvgPoolFusion, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_AvgPoolFusion, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_MaxPoolFusion, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_MaxPoolFusion, + CPUOpCoderCreator) +} // namespace mindspore::lite::micro::nnacl diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/pooling_dynamic_fp16_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/pooling_dynamic_fp16_coder.h new file mode 100644 index 00000000..7b138b61 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/pooling_dynamic_fp16_coder.h @@ -0,0 +1,44 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * 
you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_POOLING_DYNAMIC_FP16_CODER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_POOLING_DYNAMIC_FP16_CODER_H_ + +#include +#include "coder/opcoders/op_coder.h" +#include "coder/opcoders/nnacl/dynamic_parameter/pooling_dynamic_parameter.h" +#include "nnacl/pooling_parameter.h" +#include "nnacl/kernel/pooling.h" + +namespace mindspore::lite::micro::nnacl { +class PoolingDynamicFP16Coder final : public OperatorCoder { + public: + PoolingDynamicFP16Coder(const std::vector &in_tensors, const std::vector &out_tensors, + const LiteGraph::Node *node, size_t node_index, Target target) + : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {} + ~PoolingDynamicFP16Coder() override = default; + + int Prepare(CoderContext *const context) override; + + int DoCode(CoderContext *const context) override; + + private: + PoolingParameter *param_{nullptr}; + PoolingComputeParam compute_; + PoolingDynamicParameter dynamic_param_; +}; +} // namespace mindspore::lite::micro::nnacl +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_POOLING_DYNAMIC_FP16_CODER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/scale_dynamic_fp16_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/scale_dynamic_fp16_coder.cc new file mode 100644 index 00000000..733cf49d --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/scale_dynamic_fp16_coder.cc @@ -0,0 +1,128 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "coder/opcoders/nnacl/fp16/scale_dynamic_fp16_coder.h" +#include +#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h" +#include "coder/opcoders/file_collector.h" +#include "coder/opcoders/parallel.h" +#include "coder/utils/coder_utils.h" + +using mindspore::schema::PrimitiveType_ScaleFusion; + +namespace mindspore::lite::micro::nnacl { +int ScaleDynamicFP16Coder::Prepare(CoderContext *const context) { + for (size_t i = 0; i < input_tensors_.size(); ++i) { + MS_CHECK_TRUE_MSG(input_tensors_[i]->data_type() == kNumberTypeFloat16, RET_INPUT_PARAM_INVALID, + "Input tensor data type should be fp16, now is " << input_tensors_[i]->data_type()); + } + MS_CHECK_TRUE_MSG(output_tensor_->data_type() == kNumberTypeFloat16, RET_INPUT_PARAM_INVALID, + "Output tensor data type should be fp16, now is " << output_tensor_->data_type()); + + scale_param_ = reinterpret_cast(parameter_); + MS_CHECK_PTR(scale_param_); + scale_struct_.base_.param_ = parameter_; + if (input_tensors_.size() < DIMENSION_2D || input_tensors_.size() > DIMENSION_3D) { + MS_LOG(ERROR) << "inputs to Scale operator should be 2 or 3, but " << input_tensors_.size() << " is given."; + return RET_ERROR; + } + scale_tensor_ = input_tensors_.at(kWeightIndex); + MS_CHECK_PTR(scale_tensor_); + MS_CHECK_RET_CODE(CalculateParameter(), "Scale fp16 CalculateParameter failed."); + return RET_OK; +} + +int ScaleDynamicFP16Coder::DoCode(CoderContext *const context) { + // init struct ScaleParameters + Collect(context, + { + "nnacl/kernel/scale.h", + "nnacl/fp16/scale_fp16.h", + }, + { + "scale_fp16.c", + }); + + NNaclFp32Serializer code; + code.CodeStruct("scale_struct", scale_struct_, dynamic_param_); + + auto scale = GetTensorAddr(scale_tensor_, scale_tensor_->IsConst(), dynamic_mem_manager_, allocator_); + std::string offset{"NULL"}; + if (input_tensors_.size() == DIMENSION_3D) { + auto offset_tensor = input_tensors_.at(kBiasIndex); + offset = GetTensorAddr(offset_tensor, offset_tensor->IsConst(), dynamic_mem_manager_, allocator_); + } + std::string input_str = + "(float16_t *)(" + GetTensorAddr(input_tensor_, input_tensor_->IsConst(), dynamic_mem_manager_, allocator_) + ")"; + std::string output_str = + "(float16_t *)(" + GetTensorAddr(output_tensor_, output_tensor_->IsConst(), dynamic_mem_manager_, allocator_) + ")"; + switch (scale_param_->activation_type_) { + case schema::ActivationType_RELU6: + code.CodeFunction("DoScaleRelu6Fp16", input_str, output_str, scale, offset, kDefaultTaskId, "&scale_struct"); + break; + case schema::ActivationType_RELU: + code.CodeFunction("Fp16DoScaleRelu", input_str, output_str, scale, offset, kDefaultTaskId, "&scale_struct"); + break; + case schema::ActivationType_NO_ACTIVATION: + code.CodeFunction("DoScaleFp16", input_str, output_str, scale, offset, kDefaultTaskId, "&scale_struct"); + break; + default: + MS_LOG(ERROR) << "Scale does not support activation type " << scale_param_->activation_type_; + return RET_ERROR; + } + context->AppendCode(code.str()); + return RET_OK; +} + +int ScaleDynamicFP16Coder::CalculateParameter() { + auto in_shape = shape_info_container_->GetTemplateShape(input_tensor_); + std::vector scale_shape; + if (scale_tensor_->IsConst()) { + for (auto dim : scale_tensor_->shape()) { + scale_shape.emplace_back(std::to_string(dim)); + } + } else { + scale_shape = shape_info_container_->GetTemplateShape(scale_tensor_); + } + if (scale_param_->axis_ < 0) { + scale_struct_.axis_ = scale_param_->axis_ + in_shape.size(); + } + if (scale_shape.size() + 
scale_struct_.axis_ > in_shape.size()) { + MS_LOG(ERROR) << "Scale tensor shape is incorrect."; + return RET_ERROR; + } + dynamic_param_.outer_size_ = AccumulateShape(in_shape, 0, scale_struct_.axis_); + if (scale_tensor_->IsConst() && scale_tensor_->shape().size() == 1) { + dynamic_param_.axis_size_ = in_shape.at(scale_struct_.axis_); + } else { + dynamic_param_.axis_size_ = "{"; + for (size_t i = 0; i < scale_shape.size(); i++) { + if (in_shape.at(i + scale_struct_.axis_) != scale_shape.at(i)) { + MS_LOG(ERROR) << "Scale tensor shape is incorrect."; + return RET_ERROR; + } + dynamic_param_.axis_size_ += in_shape.at(i + scale_struct_.axis_) + ", "; + } + dynamic_param_.axis_size_ += "}"; + } + dynamic_param_.inner_size_ = AccumulateShape(in_shape, scale_struct_.axis_ + scale_shape.size(), in_shape.size()); + return RET_OK; +} + +REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_ScaleFusion, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_ScaleFusion, + CPUOpCoderCreator) +} // namespace mindspore::lite::micro::nnacl diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/scale_dynamic_fp16_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/scale_dynamic_fp16_coder.h new file mode 100644 index 00000000..02ec35ba --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/scale_dynamic_fp16_coder.h @@ -0,0 +1,46 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_SCALE_DYNAMIC_FP16_CODER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_SCALE_DYNAMIC_FP16_CODER_H_ + +#include +#include "coder/opcoders/op_coder.h" +#include "coder/opcoders/nnacl/dynamic_parameter/scale_dynamic_parameter.h" +#include "nnacl/kernel/scale.h" +#include "nnacl/scale_parameter.h" + +namespace mindspore::lite::micro::nnacl { +class ScaleDynamicFP16Coder final : public OperatorCoder { + public: + ScaleDynamicFP16Coder(const std::vector &in_tensors, const std::vector &out_tensors, + const LiteGraph::Node *node, size_t node_index, Target target) + : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {} + ~ScaleDynamicFP16Coder() override = default; + + int Prepare(CoderContext *const context) override; + + int DoCode(CoderContext *const context) override; + + private: + int CalculateParameter(); + ScaleParameter *scale_param_{nullptr}; + ScaleStruct scale_struct_; + ScaleDynamicParameter dynamic_param_; + Tensor *scale_tensor_{nullptr}; +}; +} // namespace mindspore::lite::micro::nnacl +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_SCALE_DYNAMIC_FP16_CODER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/slice_dynamic_fp16_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/slice_dynamic_fp16_coder.cc new file mode 100644 index 00000000..1c6969b2 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/slice_dynamic_fp16_coder.cc @@ -0,0 +1,160 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "coder/opcoders/nnacl/fp16/slice_dynamic_fp16_coder.h"
+#include "coder/opcoders/file_collector.h"
+#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h"
+#include "coder/utils/coder_utils.h"
+
+using mindspore::schema::PrimitiveType_SliceFusion;
+
+namespace mindspore::lite::micro::nnacl {
+int SliceDynamicFP16Coder::Prepare(CoderContext *const context) {
+  CHECK_LESS_RETURN(input_tensors_.size(), C3NUM);
+  CHECK_LESS_RETURN(output_tensors_.size(), 1);
+  CHECK_NULL_RETURN(input_tensors_[FIRST_INPUT]);
+  CHECK_NULL_RETURN(input_tensors_[SECOND_INPUT]);
+  CHECK_NULL_RETURN(input_tensors_[THIRD_INPUT]);
+  CHECK_NULL_RETURN(output_tensor_);
+  param_ = reinterpret_cast<SliceParameter *>(parameter_);
+  CHECK_NULL_RETURN(param_);
+  MS_CHECK_TRUE_MSG(input_tensors_[SECOND_INPUT]->IsConst() && input_tensors_[THIRD_INPUT]->IsConst(), RET_NOT_SUPPORT,
+                    "The second and third inputs of slice are non-const.");
+  MS_CHECK_TRUE_MSG(input_tensors_[SECOND_INPUT]->data_type() == kNumberTypeInt32 &&
+                      input_tensors_[THIRD_INPUT]->data_type() == kNumberTypeInt32,
+                    RET_INPUT_PARAM_INVALID, "The second and third input tensor data types need to be int32.");
+  if (input_tensor_->data_type() != kNumberTypeFloat16 || output_tensor_->data_type() != kNumberTypeFloat16) {
+    MS_LOG(ERROR) << "Tensor data type is invalid";
+    return lite::RET_INPUT_PARAM_INVALID;
+  }
+  return Init();
+}
+
+int SliceDynamicFP16Coder::DoCode(CoderContext *const context) {
+  Collect(context,
+          {
+            "nnacl/base/slice_base.h",
+          },
+          {
+            "slice_base.c",
+          });
+  NNaclFp32Serializer code;
+  code.CodeStruct("slice_param", *param_, dynamic_param_);
+  std::string input_data = GetTensorAddr(input_tensor_, input_tensor_->IsConst(), dynamic_mem_manager_, allocator_);
+  std::string output_data = GetTensorAddr(output_tensor_, output_tensor_->IsConst(), dynamic_mem_manager_, allocator_);
+  if (!support_parallel_) {
+    code.CodeFunction("DoSliceNoParallel", input_data, output_data, "&slice_param",
+                      DataTypeSize(input_tensor_->data_type()));
+  }
+  context->AppendCode(code.str());
+  return lite::RET_OK;
+}
+
+int SliceDynamicFP16Coder::Init() {
+  auto begin_tensor = input_tensors_[SECOND_INPUT];
+  auto size_tensor = input_tensors_[THIRD_INPUT];
+  data_shape_ = shape_info_container_->GetTemplateShape(input_tensor_);
+  MS_CHECK_TRUE_MSG(data_shape_.size() == static_cast<size_t>(begin_tensor->ElementsNum()), RET_ERROR,
+                    "The begin tensor is invalid.");
+  MS_CHECK_TRUE_MSG(data_shape_.size() == static_cast<size_t>(size_tensor->ElementsNum()), RET_ERROR,
+                    "The size tensor is invalid.");
+  auto begin = reinterpret_cast<int32_t *>(begin_tensor->data());
+  CHECK_NULL_RETURN(begin);
+  auto size = reinterpret_cast<int32_t *>(size_tensor->data());
+  CHECK_NULL_RETURN(size);
+  param_->param_length_ = static_cast<int>(data_shape_.size());
+  if (param_->param_length_ > DIMENSION_8D) {
+    MS_LOG(ERROR) << "input dimension num should be <= " << DIMENSION_8D;
+    return RET_ERROR;
+  }
+  dynamic_param_.shape_ = "{";
+  dynamic_param_.size_ = "{";
+  dynamic_param_.end_ = "{";
+  for (int i = 0; i < param_->param_length_; ++i) {
+    dynamic_param_.shape_ += data_shape_[i] + ", ";
+    param_->begin_[i] = begin[i];
+    if (size[i] < 0) {
+      std::string cur_size = data_shape_[i] + " - " + std::to_string(begin[i]);
+      slice_size_.emplace_back(cur_size);
+      dynamic_param_.size_ += cur_size + ", ";
+    } else {
+      slice_size_.emplace_back(std::to_string(size[i]));
+      dynamic_param_.size_ += std::to_string(size[i]) + ", ";
+    }
+    std::string cur_end = std::to_string(param_->begin_[i]) + " + " + slice_size_[i];
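+    // cur_end is kept as a C expression string ("begin + size"), so the generated micro code can
+    // resolve the slice end against runtime (dynamic) dimension values.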
+    end_.emplace_back(cur_end);
+    dynamic_param_.end_ += cur_end + ", ";
+  }
+  dynamic_param_.shape_ += "}";
+  dynamic_param_.size_ += "}";
+  dynamic_param_.end_ += "}";
+  if (param_->param_length_ < DIMENSION_8D) {
+    PadSliceParameterTo8D();
+  }
+  return RET_OK;
+}
+
+void SliceDynamicFP16Coder::PadSliceParameterTo8D() {
+  std::vector<int32_t> begin(DIMENSION_8D, 0);
+  std::vector<std::string> end(DIMENSION_8D, "");
+  std::vector<std::string> slice_size(DIMENSION_8D, "");
+  std::vector<std::string> data_shape(DIMENSION_8D, "");
+  for (int32_t i = 0; i < param_->param_length_; ++i) {
+    begin[i] = param_->begin_[i];
+    end[i] = end_[i];
+    // data_shape[i] must be filled before it is used in the slice_size expression below.
+    data_shape[i] = data_shape_[i];
+    slice_size[i] =
+      slice_size_[i] + " < 0 ? " + data_shape[i] + " - " + std::to_string(begin[i]) + " : " + slice_size_[i];
+  }
+  data_shape_.resize(DIMENSION_8D);
+  slice_size_.resize(DIMENSION_8D);
+  end_.resize(DIMENSION_8D);
+  int32_t real_index = param_->param_length_ - 1;
+  for (int32_t i = DIMENSION_8D - 1; i >= 0; --i) {
+    if (real_index >= 0) {
+      param_->begin_[i] = begin[real_index];
+      end_[i] = end[real_index];
+      slice_size_[i] = slice_size[real_index];
+      data_shape_[i] = data_shape[real_index--];
+    } else {
+      param_->begin_[i] = 0;
+      end_[i] = "1";
+      slice_size_[i] = "1";
+      data_shape_[i] = "1";
+    }
+  }
+  param_->param_length_ = DIMENSION_8D;
+  dynamic_param_.shape_.clear();
+  dynamic_param_.size_.clear();
+  dynamic_param_.end_.clear();
+  dynamic_param_.shape_ = "{";
+  dynamic_param_.size_ = "{";
+  dynamic_param_.end_ = "{";
+  for (int i = 0; i < DIMENSION_8D; ++i) {
+    dynamic_param_.end_ += end_[i] + ", ";
+    dynamic_param_.size_ += slice_size_[i] + ", ";
+    dynamic_param_.shape_ += data_shape_[i] + ", ";
+  }
+  dynamic_param_.shape_ += "}";
+  dynamic_param_.size_ += "}";
+  dynamic_param_.end_ += "}";
+}
+
+REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_SliceFusion,
+                           CPUOpCoderCreator<SliceDynamicFP16Coder>)
+REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_SliceFusion,
+                           CPUOpCoderCreator<SliceDynamicFP16Coder>)
+}  // namespace mindspore::lite::micro::nnacl
diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/slice_dynamic_fp16_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/slice_dynamic_fp16_coder.h
new file mode 100644
index 00000000..21b1b27b
--- /dev/null
+++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/slice_dynamic_fp16_coder.h
@@ -0,0 +1,51 @@
+/**
+ * Copyright 2023 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_SLICE_DYNAMIC_FP16_CODER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_SLICE_DYNAMIC_FP16_CODER_H_ + +#include +#include "mindspore/lite/tools/converter/micro/coder/opcoders/op_coder.h" +#include "coder/opcoders/nnacl/dynamic_parameter/slice_dynamic_parameter.h" +#include "nnacl/slice_parameter.h" +#include "nnacl/op_base.h" + +namespace mindspore::lite::micro::nnacl { +class SliceDynamicFP16Coder final : public OperatorCoder { + public: + SliceDynamicFP16Coder(const std::vector &in_tensors, const std::vector &out_tensors, + const LiteGraph::Node *node, size_t node_index, Target target) + : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {} + + ~SliceDynamicFP16Coder() override = default; + + int Prepare(CoderContext *const context) override; + + int DoCode(CoderContext *const context) override; + + protected: + int Init(); + void PadSliceParameterTo8D(); + SliceParameter *param_{nullptr}; + SliceDynamicParameter dynamic_param_; + std::vector in_shapes_; + std::vector out_shapes_; + std::vector data_shape_; + std::vector slice_size_; + std::vector end_; +}; +}; // namespace mindspore::lite::micro::nnacl +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_SLICE_DYNAMIC_FP16_CODER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/softmax_dynamic_fp16_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/softmax_dynamic_fp16_coder.cc new file mode 100644 index 00000000..1bd09fb5 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/softmax_dynamic_fp16_coder.cc @@ -0,0 +1,137 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "coder/opcoders/nnacl/fp16/softmax_dynamic_fp16_coder.h" +#include +#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h" +#include "schema/inner/ops_generated.h" +#include "coder/opcoders/file_collector.h" +#include "coder/utils/coder_utils.h" +#include "tools/common/string_util.h" +#include "base/float16.h" + +using mindspore::schema::PrimitiveType_LogSoftmax; +using mindspore::schema::PrimitiveType_Softmax; + +namespace mindspore::lite::micro::nnacl { +int SoftmaxDynamicFP16Coder::Prepare(CoderContext *const context) { + for (size_t i = 0; i < input_tensors_.size(); ++i) { + MS_CHECK_TRUE_MSG(input_tensors_[i]->data_type() == kNumberTypeFloat16, RET_INPUT_PARAM_INVALID, + "Input tensor data type is invalid"); + } + for (size_t i = 0; i < output_tensors_.size(); ++i) { + MS_CHECK_TRUE_MSG(output_tensors_[i]->data_type() == kNumberTypeFloat16, RET_INPUT_PARAM_INVALID, + "Output tensor data type is invalid"); + } + auto ret = Init(); + MS_CHECK_RET_CODE(ret, "Init failed!"); + return RET_OK; +} + +int SoftmaxDynamicFP16Coder::DoCode(CoderContext *const context) { + Collect(context, + { + "nnacl/fp16/softmax_fp16.h", + "nnacl/fp16/log_softmax_fp16.h", + }, + { + "softmax_fp16.c", + "log_softmax_fp16.c", + "exp_fp16.c", + }); + + auto ret = ComputeWorkSpace(); + MS_CHECK_RET_CODE(ret, "ComputeWorkSpace failed!"); + NNaclFp32Serializer code; + sum_data_str_ = "(float16_t *)(" + buffer_start_ + ")"; + auto primitive_type = param_->op_parameter_.type_; + std::string input_data = + "(float16_t *)(" + GetTensorAddr(input_tensor_, input_tensor_->IsConst(), dynamic_mem_manager_, allocator_) + ")"; + std::string output_data = + "(float16_t *)(" + GetTensorAddr(output_tensor_, output_tensor_->IsConst(), dynamic_mem_manager_, allocator_) + ")"; + code << " int input_shape[" << input_shape_.size() << "] = " << dynamic_param_.input_shape_ << ";\n"; + if (primitive_type == schema::PrimitiveType_Softmax) { + code.CodeFunction("SoftmaxFp16", input_data, output_data, sum_data_str_, softmax_struct_.axis_, + softmax_struct_.n_dim_, "&input_shape"); + } else { + code.CodeFunction("LogSoftmaxFp16", input_data, output_data, sum_data_str_, "&input_shape", softmax_struct_.n_dim_, + softmax_struct_.axis_); + } + context->AppendCode(code.str()); + return RET_OK; +} + +int SoftmaxDynamicFP16Coder::Init() { + param_ = reinterpret_cast(parameter_); + MS_CHECK_PTR(param_); + softmax_struct_.base_.param_ = parameter_; + input_shape_ = shape_info_container_->GetTemplateShape(input_tensor_); + size_t in_dims = input_shape_.size(); + softmax_struct_.n_dim_ = in_dims; + softmax_struct_.axis_ = param_->axis_ < 0 ? 
param_->axis_ + softmax_struct_.n_dim_ : param_->axis_; + dynamic_param_.element_size_ = AccumulateShape(input_shape_, 0, input_shape_.size()); + dynamic_param_.input_shape_ = "{"; + for (size_t i = 0; i < input_shape_.size(); ++i) { + dynamic_param_.input_shape_ += input_shape_[i] + ", "; + } + dynamic_param_.input_shape_ += "}"; + return RET_OK; +} + +int SoftmaxDynamicFP16Coder::ComputeWorkSpace() { + std::map> real_nums; + size_t scene_num = 0; + for (auto &dim_template : input_shape_) { + auto dim_nums = shape_info_container_->GetRealNums(dim_template); + MS_CHECK_TRUE_MSG(!dim_nums.empty(), RET_ERROR, "Dynamic shape's num must be greater than 0."); + real_nums[dim_template] = dim_nums; + scene_num = std::max(scene_num, dim_nums.size()); + } + for (size_t i = 0; i < scene_num; ++i) { + std::vector real_shape(input_shape_.size()); + for (size_t j = 0; j < input_shape_.size(); ++j) { + if (IsNumber(input_shape_[j])) { + real_shape[j] = std::stoi(input_shape_[j]); + } else { + real_shape[j] = real_nums[input_shape_[j]][i % real_nums[input_shape_[j]].size()]; + } + } + int out_plane_size = 1; + for (int j = 0; j < softmax_struct_.axis_; ++j) { + MS_CHECK_INT_MUL_NOT_OVERFLOW(out_plane_size, real_shape[j], RET_ERROR); + out_plane_size *= real_shape[j]; + } + int in_plane_size = 1; + for (int j = softmax_struct_.axis_ + 1; j < softmax_struct_.n_dim_; ++j) { + MS_CHECK_INT_MUL_NOT_OVERFLOW(in_plane_size, real_shape[j], RET_ERROR); + in_plane_size *= real_shape[j]; + } + int workspace = out_plane_size * in_plane_size * sizeof(float16); + buffer_start_ = dynamic_mem_manager_->AllocWorkSpace(workspace, i); + MS_CHECK_TRUE_MSG(!buffer_start_.empty(), RET_ERROR, "Softmax cannot alloc workspace."); + } + return RET_OK; +} + +REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_Softmax, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_Softmax, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_LogSoftmax, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_LogSoftmax, + CPUOpCoderCreator) +} // namespace mindspore::lite::micro::nnacl diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/softmax_dynamic_fp16_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/softmax_dynamic_fp16_coder.h new file mode 100644 index 00000000..913f5ad4 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/softmax_dynamic_fp16_coder.h @@ -0,0 +1,50 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_SOFTMAX_DYNAMIC_FP16_CODER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_SOFTMAX_DYNAMIC_FP16_CODER_H_ + +#include +#include +#include "coder/opcoders/op_coder.h" +#include "coder/opcoders/nnacl/dynamic_parameter/softmax_dynamic_parameter.h" +#include "nnacl/softmax_parameter.h" +#include "nnacl/kernel/softmax.h" + +namespace mindspore::lite::micro::nnacl { +class SoftmaxDynamicFP16Coder final : public OperatorCoder { + public: + SoftmaxDynamicFP16Coder(const std::vector &in_tensors, const std::vector &out_tensors, + const LiteGraph::Node *node, size_t node_index, Target target) + : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {} + ~SoftmaxDynamicFP16Coder() override = default; + + int Prepare(CoderContext *const context) override; + + int DoCode(CoderContext *const context) override; + + private: + int Init(); + int ComputeWorkSpace(); + SoftmaxParameter *param_{nullptr}; + SoftmaxStruct softmax_struct_; + SoftmaxDynamicParameter dynamic_param_; + std::vector input_shape_; + std::string buffer_start_; + std::string sum_data_str_; +}; +} // namespace mindspore::lite::micro::nnacl +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_SOFTMAX_DYNAMIC_FP16_CODER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/transpose_dynamic_fp16_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/transpose_dynamic_fp16_coder.cc new file mode 100644 index 00000000..59c8d8b8 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/transpose_dynamic_fp16_coder.cc @@ -0,0 +1,76 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "coder/opcoders/nnacl/fp16/transpose_dynamic_fp16_coder.h" +#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h" +#include "coder/opcoders/file_collector.h" +#include "coder/opcoders/parallel.h" +#include "coder/utils/coder_utils.h" + +using mindspore::schema::PrimitiveType_Transpose; +namespace mindspore::lite::micro::nnacl { +int TransposeDynamicFp16Coder::Prepare(CoderContext *const context) { + MS_CHECK_TRUE_MSG(input_tensor_->data_type() == kNumberTypeFloat16, RET_INPUT_PARAM_INVALID, + "Input tensor data type is invalid."); + MS_CHECK_TRUE_MSG(input_tensors_[SECOND_INPUT]->data_type() == kNumberTypeInt32, RET_INPUT_PARAM_INVALID, + "Perm tensor data type is invalid."); + MS_CHECK_TRUE_MSG( + output_tensor_->data_type() == kNumberTypeInt32 || output_tensor_->data_type() == kNumberTypeFloat16, + RET_INPUT_PARAM_INVALID, "Output tensor data type is invalid."); + MS_CHECK_TRUE_MSG(input_tensors_[SECOND_INPUT]->IsConst(), RET_NOT_SUPPORT, + "The second input of transpose is non-const."); + thread_num_ = 1; + MS_CHECK_RET_CODE(Init(), "init failed"); + return RET_OK; +} + +int TransposeDynamicFp16Coder::DoCode(CoderContext *const context) { + Collect(context, + { + "nnacl/transpose_parameter.h", + "nnacl/errorcode.h", + "nnacl/fp16/transpose_fp16.h", + }, + { + "transpose_fp16.c", + }); + + NNaclFp32Serializer code; + dims_ = static_cast(out_shapes_.size()); + code << "const int32_t output_shape[" << dims_ << "] = {"; + for (size_t i = 0; i < out_shapes_.size(); ++i) { + code << out_shapes_[i] << ", "; + } + code << "};\n"; + code.CodeStruct("trans_param", *param_, dynamic_param_); + auto input_str = dynamic_mem_manager_->GetVarTensorAddr(input_tensor_); + auto output_str = dynamic_mem_manager_->GetVarTensorAddr(output_tensor_); + if (param_->num_axes_ > DIMENSION_6D) { + code.CodeFunction("TransposeDimsFp16", input_str, output_str, "output_shape", "trans_param.perm_", + "trans_param.strides_", "trans_param.out_strides_", "trans_param.num_axes_", kDefaultTaskId, + kDefaultThreadNum); + } else { + code.CodeFunction("DoTransposeFp16", input_str, output_str, "output_shape", "trans_param.perm_", + "trans_param.strides_", "trans_param.out_strides_", "trans_param.data_num_", + "trans_param.num_axes_"); + } + context->AppendCode(code.str()); + return RET_OK; +} + +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_Transpose, + CPUOpCoderCreator) +} // namespace mindspore::lite::micro::nnacl diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/transpose_dynamic_fp16_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/transpose_dynamic_fp16_coder.h new file mode 100644 index 00000000..e008a794 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp16/transpose_dynamic_fp16_coder.h @@ -0,0 +1,37 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_TRANSPOSE_DYNAMIC_FP16_CODER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_TRANSPOSE_DYNAMIC_FP16_CODER_H_ +#include +#include +#include "coder/opcoders/nnacl/fp32/transpose_dynamic_fp32_coder.h" + +namespace mindspore::lite::micro::nnacl { +class TransposeDynamicFp16Coder : public TransposeDynamicFp32Coder { + public: + TransposeDynamicFp16Coder(const std::vector &in_tensors, const std::vector &out_tensors, + const LiteGraph::Node *node, size_t node_index, Target target) + : TransposeDynamicFp32Coder(in_tensors, out_tensors, node, node_index, target) {} + + ~TransposeDynamicFp16Coder() override = default; + + int Prepare(CoderContext *const context) override; + + int DoCode(CoderContext *const context) override; +}; +} // namespace mindspore::lite::micro::nnacl +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP16_TRANSPOSE_DYNAMIC_FP16_CODER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/activation_dynamic_fp32_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/activation_dynamic_fp32_coder.cc new file mode 100644 index 00000000..1dd33bbd --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/activation_dynamic_fp32_coder.cc @@ -0,0 +1,112 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "coder/opcoders/nnacl/fp32/activation_dynamic_fp32_coder.h" +#include +#include "nnacl/fp32/activation_fp32.h" +#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h" +#include "coder/opcoders/file_collector.h" +#include "coder/opcoders/parallel.h" +#include "tools/common/string_util.h" +#include "coder/utils/coder_utils.h" + +using mindspore::schema::PrimitiveType_Activation; + +namespace mindspore::lite::micro::nnacl { +int ActivationDynamicFP32Coder::Preprocess() { + // attribute + auto in_shape = shape_info_container_->GetTemplateShape(input_tensor_); + int64_t const_part = 1; + std::string non_const_part; + for (const auto &item : in_shape) { + if (IsNumber(item)) { + const_part *= std::atoi(item.c_str()); + } else { + if (!non_const_part.empty()) { + non_const_part += " * "; + } + non_const_part += item; + } + } + count_ = std::to_string(const_part) + " * " + non_const_part; + input_data_ = dynamic_mem_manager_->GetVarTensorAddr(input_tensor_); + MS_CHECK_TRUE_MSG(!input_data_.empty(), RET_ERROR, "pointer is not allocated by the allocator"); + output_data_ = dynamic_mem_manager_->GetVarTensorAddr(output_tensor_); + MS_CHECK_TRUE_MSG(!output_data_.empty(), RET_ERROR, "pointer is not allocated by the allocator"); + return RET_OK; +} + +int ActivationDynamicFP32Coder::DoCode(CoderContext *const context) { + Collect(context, + { + "wrapper/fp32/activation_fp32_wrapper.h", + "nnacl/fp32/activation_fp32.h", + }, + { + "activation_fp32_wrapper.c", + "activation_fp32.c", + }); + NNaclFp32Serializer code; + auto *activation_parameter = reinterpret_cast(parameter_); + int ret = Preprocess(); + MS_CHECK_TRUE_MSG(ret == RET_OK, RET_ERROR, "Preprocess failed"); + + switch (activation_parameter->type_) { + case schema::ActivationType_RELU: + code.CodeFunction("Fp32Relu", input_data_, count_, output_data_); + break; + case schema::ActivationType_RELU6: + code.CodeFunction("Fp32Relu6", input_data_, count_, output_data_); + break; + case schema::ActivationType_LEAKY_RELU: + code.CodeFunction("LRelu", input_data_, count_, output_data_, activation_parameter->alpha_); + break; + case schema::ActivationType_SIGMOID: + if (!support_parallel_) { + code.CodeFunction("Sigmoid", input_data_, count_, output_data_); + } else { + code.CodeStruct("activation_param", *activation_parameter); + code.CodeBaseStruct("ActivationFp32Args", kRunArgs, input_data_, count_, output_data_, 0.0f, + "&activation_param"); + code.CodeFunction(kParallelLaunch, "DoSigmoid", kRunArgsAddr, "activation_param.op_parameter_.thread_num_"); + } + break; + case schema::ActivationType_TANH: + code.CodeFunction("Tanh", input_data_, count_, output_data_); + break; + case schema::ActivationType_HSWISH: + code.CodeFunction("HSwish", input_data_, count_, output_data_); + break; + case schema::ActivationType_SWISH: + code.CodeFunction("Swish", input_data_, count_, output_data_); + break; + case schema::ActivationType_HSIGMOID: + code.CodeFunction("HSigmoid", input_data_, count_, output_data_); + break; + case schema::ActivationType_ELU: + code.CodeFunction("Elu", input_data_, count_, output_data_, activation_parameter->alpha_); + break; + default: + MS_LOG(ERROR) << "Activation type error"; + return RET_ERROR; + } + MS_LOG(DEBUG) << "ActivationFP32Code has been called"; + context->AppendCode(code.str()); + return lite::RET_OK; +} + +REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_Activation, + CPUOpCoderCreator) +} // namespace mindspore::lite::micro::nnacl diff --git 
a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/activation_dynamic_fp32_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/activation_dynamic_fp32_coder.h new file mode 100644 index 00000000..1560afbb --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/activation_dynamic_fp32_coder.h @@ -0,0 +1,46 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP32_ACTIVATION_DYNAMIC_FP32_CODER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP32_ACTIVATION_DYNAMIC_FP32_CODER_H_ + +#include +#include +#include "tools/converter/micro/coder/opcoders/op_coder.h" +#include "tools/converter/micro/coder/shape_info_container.h" +#include "tools/converter/micro/coder/dynamic_mem_manager.h" + +namespace mindspore::lite::micro::nnacl { +class ActivationDynamicFP32Coder : public OperatorCoder { + public: + ActivationDynamicFP32Coder(const std::vector &in_tensors, const std::vector &out_tensors, + const LiteGraph::Node *node, size_t node_index, Target target) + : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {} + + ~ActivationDynamicFP32Coder() override = default; + + int Prepare(CoderContext *const context) override { return RET_OK; } + + int DoCode(CoderContext *const context) override; + + protected: + int Preprocess(); + std::string count_; + std::string input_data_; + std::string output_data_; +}; +} // namespace mindspore::lite::micro::nnacl +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP32_ACTIVATION_DYNAMIC_FP32_CODER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/convolution_winograd_fp32_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/convolution_winograd_fp32_coder.cc index c15d3101..1b827283 100644 --- a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/convolution_winograd_fp32_coder.cc +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/convolution_winograd_fp32_coder.cc @@ -266,7 +266,6 @@ void ConvolutionWinogradFP32Coder::CollectFilesForFunc(CoderContext *const conte } else if (target_ == kARM64) { Collect(context, {}, {}, { - "BigMatmulFp32Opt.S", "MatmulFp32.S", "MatmulFp32Opt.S", "PreSum4x16Int8Peroc.S", diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/gather_dynamic_fp32_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/gather_dynamic_fp32_coder.cc new file mode 100644 index 00000000..57d7a5dd --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/gather_dynamic_fp32_coder.cc @@ -0,0 +1,106 @@ +/** + * Copyright 2021-2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "coder/opcoders/nnacl/fp32/gather_dynamic_fp32_coder.h" +#include +#include "nnacl/gather_parameter.h" +#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h" +#include "coder/opcoders/file_collector.h" +#include "coder/utils/coder_utils.h" +#include "tools/common/string_util.h" + +using mindspore::schema::PrimitiveType_Gather; + +namespace mindspore::lite::micro::nnacl { +int GatherDynamicFP32Coder::Prepare(CoderContext *const context) { + MS_CHECK_TRUE_MSG(input_tensors_.size() == C3NUM, RET_ERROR, "Gather's input-num must be 3."); + MS_CHECK_TRUE_MSG(input_tensors_[FIRST_INPUT]->IsConst() && input_tensors_[THIRD_INPUT]->IsConst(), RET_NOT_SUPPORT, + "Currently, only support the second input of gather is non-const when shape is dynamical."); + MS_CHECK_TRUE_MSG(input_tensors_[THIRD_INPUT]->data_type() == kNumberTypeInt32 || + input_tensors_[THIRD_INPUT]->data_type() == kNumberTypeInt, + RET_ERROR, "The data-type of Gather's third input must be int."); + auto axis = input_tensors_[THIRD_INPUT]->data(); + MS_CHECK_TRUE_MSG(axis != nullptr, RET_NULL_PTR, "Gather has no axis."); + axis_ = *(static_cast(axis)); + auto in_shape0 = input_tensors_[FIRST_INPUT]->shape(); + axis_ = axis_ >= 0 ? axis_ : axis_ + static_cast(in_shape0.size()); + MS_CHECK_TRUE_MSG(axis_ >= 0 && axis_ < static_cast(in_shape0.size()), RET_INPUT_TENSOR_ERROR, + "Gather's axis is out of range."); + return RET_OK; +} + +int GatherDynamicFP32Coder::DoCode(CoderContext *const context) { + Collect(context, + { + "nnacl/base/gather_base.h", + }, + { + "gather_base.c", + }); + auto in_shape0 = input_tensors_[FIRST_INPUT]->shape(); + auto data_item_size = static_cast(lite::DataTypeSize(input_tensors_[FIRST_INPUT]->data_type())); + int64_t out_size = 1; + for (size_t i = 0; i < static_cast(axis_); ++i) { + out_size *= in_shape0[i]; + } + int64_t byte_inner_size = data_item_size; + for (size_t i = axis_ + 1; i < in_shape0.size(); ++i) { + byte_inner_size *= in_shape0[i]; + } + int64_t limit = in_shape0[axis_]; + auto in_shape1 = shape_info_container_->GetTemplateShape(input_tensors_[SECOND_INPUT]); + int64_t const_part = 1; + std::string non_const_part; + for (const auto &item : in_shape1) { + if (IsNumber(item)) { + const_part *= std::stoi(item); + } else { + if (!non_const_part.empty()) { + non_const_part += " * "; + } + non_const_part += item; + } + } + std::string byte_out_stride_str = std::to_string(const_part * byte_inner_size); + std::string index_num_str = std::to_string(const_part); + if (!non_const_part.empty()) { + byte_out_stride_str += " * " + non_const_part; + index_num_str += " * " + non_const_part; + } + std::string input0_data = MemoryAllocator::GetInstance()->GetRuntimeAddr(input_tensors_[FIRST_INPUT], true); + MS_CHECK_TRUE_MSG(!input0_data.empty(), RET_ERROR, "pointer is not allocated by the allocator"); + std::string input1_data = dynamic_mem_manager_->GetVarTensorAddr(input_tensors_[SECOND_INPUT]); + MS_CHECK_TRUE_MSG(!input1_data.empty(), RET_ERROR, "pointer is not allocated by the allocator"); + std::string output_data = 
dynamic_mem_manager_->GetVarTensorAddr(output_tensors_[FIRST_INPUT]); + MS_CHECK_TRUE_MSG(!output_data.empty(), RET_ERROR, "pointer is not allocated by the allocator"); + NNaclFp32Serializer code; + code << "\t\tconst int8_t *int8_in = (const int8_t *)(" << input0_data << ");\n"; + code << "\t\tconst int *index_data = (const int *)(" << input1_data << ");\n"; + code << "\t\tint8_t *int8_out = (int8_t *)(" << output_data << ");\n"; + // call the op function + code.CodeFunction("Gather", "int8_in", out_size, byte_inner_size, limit, "index_data", index_num_str, "int8_out", + byte_out_stride_str); + context->AppendCode(code.str()); + return RET_OK; +} + +REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_Gather, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeInt32, PrimitiveType_Gather, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat16, PrimitiveType_Gather, CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM32, kNumberTypeFloat16, PrimitiveType_Gather, CPUOpCoderCreator) +} // namespace mindspore::lite::micro::nnacl diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/gather_dynamic_fp32_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/gather_dynamic_fp32_coder.h new file mode 100644 index 00000000..9e58e1fa --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/gather_dynamic_fp32_coder.h @@ -0,0 +1,42 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP32_GATHER_DYNAMIC_FP32_CODER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP32_GATHER_DYNAMIC_FP32_CODER_H_ + +#include +#include +#include "coder/opcoders/op_coder.h" +#include "nnacl/base/tile_base.h" + +namespace mindspore::lite::micro::nnacl { +class GatherDynamicFP32Coder final : public OperatorCoder { + public: + GatherDynamicFP32Coder(const std::vector &in_tensors, const std::vector &out_tensors, + const LiteGraph::Node *node, size_t node_index, Target target) + : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {} + + ~GatherDynamicFP32Coder() override = default; + + int Prepare(CoderContext *const context) override; + + int DoCode(CoderContext *const context) override; + + private: + int axis_{0}; +}; +} // namespace mindspore::lite::micro::nnacl +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP32_GATHER_DYNAMIC_FP32_CODER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/split_dynamic_fp32_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/split_dynamic_fp32_coder.cc new file mode 100644 index 00000000..4ec7f317 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/split_dynamic_fp32_coder.cc @@ -0,0 +1,94 @@ +/** + * Copyright 2022 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "coder/opcoders/nnacl/fp32/split_dynamic_fp32_coder.h" +#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h" +#include "coder/opcoders/file_collector.h" +#include "coder/opcoders/parallel.h" +#include "coder/utils/coder_utils.h" +#include "nnacl/op_base.h" + +using mindspore::schema::PrimitiveType_Split; + +namespace mindspore::lite::micro::nnacl { +int SplitDynamicFP32Coder::Prepare(CoderContext *const context) { + auto input_shape = shape_info_container_->GetTemplateShape(input_tensor_); + int in_shape_size = static_cast(input_shape.size()); + CHECK_LESS_RETURN(in_shape_size, 1); + CHECK_LESS_RETURN(SPLIT_STRIDES_SIZE - 1, in_shape_size); + param_ = reinterpret_cast(parameter_); + CHECK_NULL_RETURN(param_); + + auto split_dim = param_->split_dim_; + param_->split_dim_ = split_dim >= 0 ? 
split_dim : in_shape_size + split_dim; + std::vector strides(in_shape_size); + strides[in_shape_size - 1] = "1"; + for (int i = static_cast(in_shape_size) - C2NUM; i >= 0; i--) { + strides[i] = strides[i + 1] + " * " + input_shape[i + 1]; + } + dynamic_param_.strides_ = "{"; + for (int i = 0; i < in_shape_size; ++i) { + dynamic_param_.strides_ += strides[i] + ", "; + } + dynamic_param_.strides_ += "}"; + CHECK_LESS_RETURN(in_shape_size, param_->split_dim_ + 1); + if (input_shape.at(param_->split_dim_) == "0") { + MS_LOG(ERROR) << "input_shape[" << param_->split_dim_ << "] must not be zero!"; + return RET_ERROR; + } + CHECK_LESS_RETURN(SPLIT_STRIDES_SIZE, param_->split_dim_ + 1); + if (strides[param_->split_dim_] == "0") { + MS_LOG(ERROR) << "strides[" << param_->split_dim_ << "] must not be zero!"; + return RET_ERROR; + } + dynamic_param_.split_count_ = strides[0] + " * " + input_shape[0] + " / (" + input_shape.at(param_->split_dim_) + + " * " + strides[param_->split_dim_] + ")"; + param_->n_dims_ = static_cast(input_shape.size()); + CHECK_LESS_RETURN(param_->num_split_, 1); + MS_CHECK_TRUE_MSG(param_->split_sizes_[0] != 0 && param_->split_sizes_[param_->num_split_ - 1] != -1, + lite::RET_PARAM_INVALID, "Currently, split not support split_size 0 or -1"); + return RET_OK; +} + +int SplitDynamicFP32Coder::DoCode(CoderContext *const context) { + Collect(context, {"nnacl/base/split_base.h"}, {"split_base.c"}); + NNaclFp32Serializer code; + code << " void *output_ptrs[" << output_tensors_.size() << "] = {"; + for (int i = 0; i < param_->num_split_; i++) { + code << GetTensorAddr(output_tensors_.at(i), output_tensors_.at(i)->IsConst(), dynamic_mem_manager_, allocator_) + << ", "; + } + code << "};\n"; + auto input_shape = shape_info_container_->GetTemplateShape(input_tensor_); + code << " int input_dim[" << input_shape.size() << "] = {"; + for (auto &dim : input_shape) { + code << dim << ", "; + } + code << "};\n"; + std::string input_data = GetTensorAddr(input_tensor_, input_tensor_->IsConst(), dynamic_mem_manager_, allocator_); + std::string num_unit = dynamic_param_.split_count_ + " * " + std::to_string(param_->num_split_); + code.CodeStruct("split_param", *param_, dynamic_param_); + code.CodeFunction("DoSplit", input_data, "output_ptrs", "input_dim", "0", num_unit, "&split_param", + lite::DataTypeSize(input_tensor_->data_type())); + context->AppendCode(code.str()); + return RET_OK; +} + +REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeFloat32, PrimitiveType_Split, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeInt32, PrimitiveType_Split, CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kAllTargets, kNumberTypeFloat16, PrimitiveType_Split, + CPUOpCoderCreator) +} // namespace mindspore::lite::micro::nnacl diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/split_dynamic_fp32_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/split_dynamic_fp32_coder.h new file mode 100644 index 00000000..e3e64cb3 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/split_dynamic_fp32_coder.h @@ -0,0 +1,42 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP32_SPLIT_DYNAMIC_FP32_CODER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP32_SPLIT_DYNAMIC_FP32_CODER_H_ + +#include +#include "coder/opcoders/op_coder.h" +#include "coder/opcoders/nnacl/dynamic_parameter/split_dynamic_parameter.h" +#include "nnacl/split_parameter.h" + +namespace mindspore::lite::micro::nnacl { +class SplitDynamicFP32Coder : public OperatorCoder { + public: + SplitDynamicFP32Coder(const std::vector &in_tensors, const std::vector &out_tensors, + const LiteGraph::Node *node, size_t node_index, Target target) + : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {} + ~SplitDynamicFP32Coder() override = default; + + int Prepare(CoderContext *const context) override; + + int DoCode(CoderContext *const context) override; + + protected: + SplitParameter *param_{nullptr}; + SplitDynamicParameter dynamic_param_; +}; +} // namespace mindspore::lite::micro::nnacl +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP32_SPLIT_DYNAMIC_FP32_CODER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/transpose_dynamic_fp32_coder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/transpose_dynamic_fp32_coder.cc new file mode 100644 index 00000000..7fb160d5 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/transpose_dynamic_fp32_coder.cc @@ -0,0 +1,171 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "coder/opcoders/nnacl/fp32/transpose_dynamic_fp32_coder.h" +#include +#include +#include "coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h" +#include "coder/opcoders/file_collector.h" +#include "coder/opcoders/parallel.h" +#include "coder/utils/coder_utils.h" + +using mindspore::schema::PrimitiveType_Transpose; +namespace mindspore::lite::micro::nnacl { +int TransposeDynamicFp32Coder::Prepare(CoderContext *const context) { + MS_CHECK_TRUE_MSG(input_tensor_->data_type() == kNumberTypeInt32 || input_tensor_->data_type() == kNumberTypeFloat32, + RET_INPUT_PARAM_INVALID, "Input tensor data type is invalid."); + MS_CHECK_TRUE_MSG(input_tensors_[SECOND_INPUT]->data_type() == kNumberTypeInt32, RET_INPUT_PARAM_INVALID, + "Perm tensor data type is invalid."); + MS_CHECK_TRUE_MSG( + output_tensor_->data_type() == kNumberTypeInt32 || output_tensor_->data_type() == kNumberTypeFloat32, + RET_INPUT_PARAM_INVALID, "Output tensor data type is invalid."); + MS_CHECK_TRUE_MSG(input_tensors_[SECOND_INPUT]->IsConst(), RET_NOT_SUPPORT, + "The second input of transpose is non-const."); + thread_num_ = 1; + MS_CHECK_RET_CODE(Init(), "init failed"); + return RET_OK; +} + +int TransposeDynamicFp32Coder::DoCode(CoderContext *const context) { + Collect(context, + { + "nnacl/transpose_parameter.h", + "nnacl/errorcode.h", + "nnacl/fp32/transpose_fp32.h", + }, + { + "transpose_fp32.c", + }); + + NNaclFp32Serializer code; + dims_ = static_cast(out_shapes_.size()); + code << "const int32_t output_shape[" << dims_ << "] = {"; + for (size_t i = 0; i < out_shapes_.size(); ++i) { + code << out_shapes_[i] << ", "; + } + code << "};\n"; + code.CodeStruct("trans_param", *param_, dynamic_param_); + auto input_str = dynamic_mem_manager_->GetVarTensorAddr(input_tensor_); + auto output_str = dynamic_mem_manager_->GetVarTensorAddr(output_tensor_); + if (param_->num_axes_ > DIMENSION_6D) { + code.CodeFunction("TransposeDimsFp32", input_str, output_str, "output_shape", "trans_param.perm_", + "trans_param.strides_", "trans_param.out_strides_", "trans_param.num_axes_", kDefaultTaskId, + kDefaultThreadNum); + } else { + code.CodeFunction("DoTransposeFp32", input_str, output_str, "output_shape", "trans_param.perm_", + "trans_param.strides_", "trans_param.out_strides_", "trans_param.data_num_", + "trans_param.num_axes_"); + } + context->AppendCode(code.str()); + return RET_OK; +} + +int TransposeDynamicFp32Coder::Init() { + param_ = reinterpret_cast(parameter_); + MS_CHECK_PTR(param_); + param_->num_axes_ = 0; + if (input_tensors_.size() == C2NUM) { + param_->num_axes_ = input_tensors_[SECOND_INPUT]->ElementsNum(); + } + if (input_tensor_->shape().size() != static_cast(param_->num_axes_)) { + return RET_OK; + } + // get perm data + auto ret = ResetStatus(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Do transpose reset failed."; + return ret; + } + + ret = ComputeOfflineInfo(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Do compute transpose offline info failed."; + return ret; + } + return RET_OK; +} + +int TransposeDynamicFp32Coder::ResetStatus() { + auto in_shape = shape_info_container_->GetTemplateShape(input_tensor_); + if (in_shape.size() > MAX_TRANSPOSE_DIM_SIZE) { + MS_LOG(ERROR) << "input shape out of range."; + return RET_ERROR; + } + int trans_nd[MAX_TRANSPOSE_DIM_SIZE] = {0, 2, 1}; + int *perm_data{nullptr}; + if (in_shape.size() != static_cast(param_->num_axes_)) { + perm_data = trans_nd; + if (in_shape.size() == C3NUM && param_->num_axes_ == C4NUM) { + param_->num_axes_ = C3NUM; + } + if 
(param_->num_axes_ == 0) { + for (int i = 0; i < static_cast(in_shape.size()); ++i) { + trans_nd[i] = static_cast(in_shape.size()) - 1 - i; + } + param_->num_axes_ = static_cast(in_shape.size()); + } + } else { + if (input_tensors_.size() != C2NUM) { + MS_LOG(ERROR) << "input tensors size is not equal to 2."; + return RET_ERROR; + } + auto perm_tensor = input_tensors_.at(SECOND_INPUT); + perm_data = reinterpret_cast(perm_tensor->data()); + MSLITE_CHECK_PTR(perm_data); + std::vector perm(perm_data, perm_data + input_tensors_[SECOND_INPUT]->ElementsNum()); + if (perm.size() != std::unordered_set(perm.cbegin(), perm.cend()).size()) { + MS_LOG(ERROR) << "Invalid perm, the same element exits in perm."; + return RET_ERROR; + } + } + MS_CHECK_TRUE_MSG(param_->num_axes_ <= MAX_TRANSPOSE_DIM_SIZE, RET_ERROR, "transpose perm is invalid."); + for (int i = 0; i < param_->num_axes_; ++i) { + param_->perm_[i] = perm_data[i]; + } + return RET_OK; +} + +int TransposeDynamicFp32Coder::ComputeOfflineInfo() { + in_shapes_ = shape_info_container_->GetTemplateShape(input_tensor_); + out_shapes_ = shape_info_container_->GetTemplateShape(output_tensor_); + const int ori_stride = 1; + dynamic_param_.strides_ = std::to_string(ori_stride) + ", "; + dynamic_param_.out_strides_ = std::to_string(ori_stride) + ", "; + dynamic_param_.data_num_ = AccumulateShape(in_shapes_, 0, in_shapes_.size()); + std::vector strides(param_->num_axes_); + std::vector out_strides(param_->num_axes_); + strides[param_->num_axes_ - 1] = "1"; + out_strides[param_->num_axes_ - 1] = "1"; + for (int i = param_->num_axes_ - C2NUM; i >= 0; --i) { + strides[i] = in_shapes_[i + 1] + " * " + strides[i + 1]; + out_strides[i] = out_shapes_[i + 1] + " * " + out_strides[i + 1]; + } + dynamic_param_.strides_ = "{"; + dynamic_param_.out_strides_ = "{"; + for (int i = 0; i < param_->num_axes_; ++i) { + dynamic_param_.strides_ += strides[i] + ", "; + dynamic_param_.out_strides_ += out_strides[i] + ", "; + } + dynamic_param_.strides_ += "}"; + dynamic_param_.out_strides_ += "}"; + return RET_OK; +} + +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat32, PrimitiveType_Transpose, + CPUOpCoderCreator) +REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeInt32, PrimitiveType_Transpose, + CPUOpCoderCreator) +} // namespace mindspore::lite::micro::nnacl diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/transpose_dynamic_fp32_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/transpose_dynamic_fp32_coder.h new file mode 100644 index 00000000..9230b8e3 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/nnacl/fp32/transpose_dynamic_fp32_coder.h @@ -0,0 +1,49 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
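The stride fields built in ComputeOfflineInfo() are plain strings that get pasted into the generated TransposeParameter. A minimal standalone sketch of the composition rule, assuming a hypothetical three-dimensional template shape whose batch dimension is the only dynamic one:

#include <iostream>
#include <string>
#include <vector>

int main() {
  // hypothetical template shape: only the batch dim is symbolic
  std::vector<std::string> shape = {"shape_var[0]", "128", "14"};
  std::vector<std::string> strides(shape.size());
  strides.back() = "1";
  for (int i = static_cast<int>(shape.size()) - 2; i >= 0; --i) {
    strides[i] = shape[i + 1] + " * " + strides[i + 1];  // same rule as ComputeOfflineInfo()
  }
  for (const auto &s : strides) {
    std::cout << s << std::endl;  // prints "128 * 14 * 1", then "14 * 1", then "1"
  }
  return 0;
}

The generated TransposeParameter then carries "{128 * 14 * 1, 14 * 1, 1, }" instead of literal numbers, so one generated source file serves every batch size; only data_num_ references the symbolic dimension.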
+ */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP32_TRANSPOSE_DYNAMIC_FP32_CODER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP32_TRANSPOSE_DYNAMIC_FP32_CODER_H_ +#include +#include +#include "coder/opcoders/op_coder.h" +#include "nnacl/transpose_parameter.h" +#include "coder/opcoders/nnacl/dynamic_parameter/transpose_dynamic_parameter.h" + +namespace mindspore::lite::micro::nnacl { +class TransposeDynamicFp32Coder : public OperatorCoder { + public: + TransposeDynamicFp32Coder(const std::vector &in_tensors, const std::vector &out_tensors, + const LiteGraph::Node *node, size_t node_index, Target target) + : OperatorCoder(in_tensors, out_tensors, node, node_index, target) {} + + ~TransposeDynamicFp32Coder() override = default; + + int Prepare(CoderContext *const context) override; + + int DoCode(CoderContext *const context) override; + + protected: + int Init(); + int ResetStatus(); + int ComputeOfflineInfo(); + TransposeParameter *param_{nullptr}; + TransposeDynamicParameter dynamic_param_; + int dims_{0}; + std::vector in_shapes_; + std::vector out_shapes_; +}; +} // namespace mindspore::lite::micro::nnacl +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_NNACL_FP32_TRANSPOSE_DYNAMIC_FP32_CODER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder.h index dffaf14b..fa59e483 100644 --- a/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder.h +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder.h @@ -28,6 +28,8 @@ #include "securec/include/securec.h" #include "tools/converter/micro/coder/opcoders/op_coder_register.h" #include "tools/converter/micro/coder/log.h" +#include "tools/converter/micro/coder/shape_info_container.h" +#include "tools/converter/micro/coder/dynamic_mem_manager.h" namespace mindspore::lite::micro { constexpr int kPrecision = 19; @@ -71,6 +73,8 @@ class OperatorCoder { void set_parameter(OpParameter *parameter); + OpParameter *get_parameter() const { return parameter_; } + const LiteGraph::Node *node() const { return this->node_; } void AddInitialParameters(Tensor *parameter) { initial_parameters_.push_back(parameter); } @@ -88,6 +92,12 @@ class OperatorCoder { void set_thread_num(int thread_num); + void set_shape_info_container(ShapeInfoContainer *shape_info_container) { + shape_info_container_ = shape_info_container; + } + + void set_dynamic_mem_manager(DynamicMemManager *dynamic_mem_manager) { dynamic_mem_manager_ = dynamic_mem_manager; } + protected: std::vector input_tensors_; std::vector output_tensors_; @@ -103,6 +113,8 @@ class OperatorCoder { bool support_parallel_{false}; int thread_num_{1}; int schema_version_ = lite::SCHEMA_VERSION::SCHEMA_CUR; + ShapeInfoContainer *shape_info_container_{nullptr}; + DynamicMemManager *dynamic_mem_manager_{nullptr}; private: size_t node_index_{0}; diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_builder.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_builder.cc index 45b2e37f..e2d70c12 100644 --- a/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_builder.cc +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_builder.cc @@ -35,7 +35,7 @@ std::unique_ptr OpCoderBuilder::build(int schema_version) { } coder_key = CoderKey(target_, data_type_, schema::PrimitiveType_Custom, custom_type->str()); } - CoderCreatorFunc creator_func = OpCoderFactory::GetInstance()->FindOpCoder(coder_key); + 
CoderCreatorFunc creator_func = OpCoderFactory::GetInstance()->FindOpCoder(coder_key, dynamic_); if (creator_func == nullptr) { MS_LOG(ERROR) << "caught unsupported layer: " << node_->name_; return nullptr; @@ -125,5 +125,10 @@ OpCoderBuilder &OpCoderBuilder::is_builtin_custom(bool builtin_custom) { return *this; } +OpCoderBuilder &OpCoderBuilder::is_dynamic(bool dynamic) { + dynamic_ = dynamic; + return *this; +} + void OpCoderBuilder::Reset() {} } // namespace mindspore::lite::micro diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_builder.h b/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_builder.h index d85f1c32..bdd815ef 100644 --- a/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_builder.h +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_builder.h @@ -48,6 +48,8 @@ class OpCoderBuilder { OpCoderBuilder &is_builtin_custom(bool builtin_custom); + OpCoderBuilder &is_dynamic(bool dynamic); + void Reset(); private: @@ -74,6 +76,8 @@ class OpCoderBuilder { bool support_parallel_{false}; bool builtin_custom_{false}; + + bool dynamic_{false}; }; } // namespace mindspore::lite::micro #endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_OP_CODER_BUILDER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_register.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_register.cc index cf26d51d..1dac9c73 100644 --- a/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_register.cc +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_register.cc @@ -37,33 +37,38 @@ OpCoderFactory *OpCoderFactory::GetInstance() { } int OpCoderFactory::RegistOpCoder(Target target, TypeId data_type, schema::PrimitiveType operator_type, - const std::string &builtin_custom_type, const CoderCreatorFunc &creator_func) { + const std::string &builtin_custom_type, const CoderCreatorFunc &creator_func, + bool dynamic) { + auto &op_sets = dynamic ? dynamic_opcoder_sets_ : static_opcoder_sets_; // check key CoderKey key(target, data_type, operator_type, builtin_custom_type); // insert pair to registry - if (this->opcoder_sets_.find(key) != this->opcoder_sets_.end()) { + if (op_sets.find(key) != op_sets.end()) { MS_LOG(ERROR) << "coder already exist: " << key.ToString(); return RET_ERROR; } - this->opcoder_sets_.insert(std::pair(key, creator_func)); + op_sets.insert(std::pair(key, creator_func)); return RET_OK; } -CoderCreatorFunc OpCoderFactory::FindOpCoder(const CoderKey &key) { - auto iterator = this->opcoder_sets_.find(key); - if (iterator != this->opcoder_sets_.end()) { +CoderCreatorFunc OpCoderFactory::FindOpCoder(const CoderKey &key, bool dynamic) { + const auto &op_sets = dynamic ? 
dynamic_opcoder_sets_ : static_opcoder_sets_; + auto iterator = op_sets.find(key); + if (iterator != op_sets.end()) { return iterator->second; } // matching kAllTargets - iterator = this->opcoder_sets_.find(key.AllKey()); - if (iterator != this->opcoder_sets_.end()) { + iterator = op_sets.find(key.AllKey()); + if (iterator != op_sets.end()) { return iterator->second; } return nullptr; } OpCoderRegister::OpCoderRegister(Target target, TypeId data_type, schema::PrimitiveType operator_type, - const std::string &builtin_custom_type, const CoderCreatorFunc &creatorFunc) { - OpCoderFactory::GetInstance()->RegistOpCoder(target, data_type, operator_type, builtin_custom_type, creatorFunc); + const std::string &builtin_custom_type, const CoderCreatorFunc &creatorFunc, + bool dynamic) { + OpCoderFactory::GetInstance()->RegistOpCoder(target, data_type, operator_type, builtin_custom_type, creatorFunc, + dynamic); } } // namespace mindspore::lite::micro diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_register.h b/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_register.h index 30c8a64d..b616e287 100644 --- a/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_register.h +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/op_coder_register.h @@ -65,15 +65,19 @@ class OpCoderFactory { static OpCoderFactory *GetInstance(); int RegistOpCoder(Target target, TypeId data_type, schema::PrimitiveType operator_type, - const std::string &builtin_custom_type, const CoderCreatorFunc &creator_func); + const std::string &builtin_custom_type, const CoderCreatorFunc &creator_func, bool dynamic); - CoderCreatorFunc FindOpCoder(const CoderKey &key); + CoderCreatorFunc FindOpCoder(const CoderKey &key, bool dynamic = false); - ~OpCoderFactory() { opcoder_sets_.clear(); } + ~OpCoderFactory() { + static_opcoder_sets_.clear(); + dynamic_opcoder_sets_.clear(); + } private: // target || data type || primitive type - std::map opcoder_sets_; + std::map static_opcoder_sets_; + std::map dynamic_opcoder_sets_; }; class OpCoderRegister { @@ -81,16 +85,20 @@ class OpCoderRegister { OpCoderRegister() = delete; OpCoderRegister(Target target, TypeId data_type, schema::PrimitiveType operator_type, - const std::string &builtin_custom_type, const CoderCreatorFunc &creator_func); + const std::string &builtin_custom_type, const CoderCreatorFunc &creator_func, bool dynamic = false); ~OpCoderRegister() = default; }; -#define REG_OPERATOR_CODER(target, data_type, operator_type, creator_func) \ - static OpCoderRegister g_##target##data_type##operator_type##Creator(target, data_type, operator_type, "", \ - creator_func); +#define REG_OPERATOR_CODER(target, data_type, operator_type, creator_func) \ + static OpCoderRegister g_##target##data_type##operator_type##StaticCreator(target, data_type, operator_type, "", \ + creator_func); #define REG_BUILIN_CUSTOM_CODER(target, data_type, custom_type, creator_func) \ static OpCoderRegister g_##target##data_type##operator_type##Creator( \ target, data_type, schema::PrimitiveType_Custom, custom_type, creator_func); + +#define REG_DYNAMIC_OPERATOR_CODER(target, data_type, operator_type, creator_func) \ + static OpCoderRegister g_##target##data_type##operator_type##DynamicCreator(target, data_type, operator_type, "", \ + creator_func, true); } // namespace mindspore::lite::micro #endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_OPCODERS_OP_CODER_REGISTER_H_ diff --git 
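With the registry now split into static and dynamic sets, a coder is visible to exactly one kind of lookup. The dynamic registrations earlier in this patch follow the usual CPUOpCoderCreator pattern (the template argument is presumably the coder class, as in the existing static registrations); a representative use and how it is resolved:

REG_DYNAMIC_OPERATOR_CODER(kARM64, kNumberTypeFloat32, PrimitiveType_Transpose,
                           CPUOpCoderCreator<TransposeDynamicFp32Coder>)
// The builder resolves it with FindOpCoder(coder_key, /*dynamic=*/true); if no
// target-specific entry exists, the kAllTargets entry of the same key is used instead.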
a/mindspore/lite/tools/converter/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.cc b/mindspore/lite/tools/converter/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.cc index a3743b48..920f2723 100644 --- a/mindspore/lite/tools/converter/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.cc +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.cc @@ -38,6 +38,15 @@ void NNaclFp32Serializer::CodeStruct(const std::string &name, const PoolingCompu pooling_compute.maxf); } +void NNaclFp32Serializer::CodeStruct(const std::string &name, const PoolingComputeParam &pooling_compute, + const PoolingDynamicParameter &dynamic_pooling_param) { + CodeBaseStruct("PoolingComputeParam", name, pooling_compute.input_w_, pooling_compute.input_h_, + dynamic_pooling_param.input_batch_, pooling_compute.input_channel_, pooling_compute.output_w_, + pooling_compute.output_h_, dynamic_pooling_param.output_batch_, pooling_compute.output_channel_, + pooling_compute.window_w_, pooling_compute.window_h_, pooling_compute.minf, + pooling_compute.maxf); +} + void NNaclFp32Serializer::CodeStruct(const std::string &name, const BatchNormParameter &batch_norm_parameter) { CodeBaseStruct("BatchNormParameter", name, batch_norm_parameter.op_parameter_, batch_norm_parameter.epsilon_, batch_norm_parameter.momentum_, batch_norm_parameter.unit_, batch_norm_parameter.units_, @@ -85,6 +94,29 @@ void NNaclFp32Serializer::CodeStruct(const std::string &name, const ConvParamete conv_parameter.output_padding_w_, conv_parameter.output_padding_h_); } +void NNaclFp32Serializer::CodeStruct(const std::string &name, const ConvParameter &conv_parameter, + const ConvDynamicParameter &dynamic_conv_param) { + CodeBaseStruct( + "ConvParameter", name, conv_parameter.op_parameter_, "{0}", conv_parameter.kernel_h_, conv_parameter.kernel_w_, + conv_parameter.stride_h_, conv_parameter.stride_w_, conv_parameter.dilation_h_, conv_parameter.dilation_w_, + conv_parameter.pad_u_, conv_parameter.pad_d_, conv_parameter.pad_l_, conv_parameter.pad_r_, conv_parameter.group_, + conv_parameter.tile_num_, dynamic_conv_param.input_batch_, conv_parameter.input_h_, conv_parameter.input_w_, + conv_parameter.input_channel_, dynamic_conv_param.output_batch_, conv_parameter.output_h_, conv_parameter.output_w_, + conv_parameter.output_channel_, conv_parameter.thread_num_, conv_parameter.input_unit_, conv_parameter.output_unit_, + conv_parameter.pad_mode_, conv_parameter.act_type_, conv_parameter.channel_multiplie_, + conv_parameter.output_padding_w_, conv_parameter.output_padding_h_); +} + +void NNaclFp32Serializer::CodeStruct(const std::string &name, const MatMulParameter &mat_mul_parameter) { + CodeBaseStruct( + "MatMulParameter", name, mat_mul_parameter.op_parameter_, mat_mul_parameter.has_bias_, mat_mul_parameter.use_axis_, + mat_mul_parameter.a_transpose_, mat_mul_parameter.b_transpose_, mat_mul_parameter.act_type_, mat_mul_parameter.row_, + mat_mul_parameter.col_, mat_mul_parameter.row_4_, mat_mul_parameter.row_16_, mat_mul_parameter.row_align_, + mat_mul_parameter.col_8_, mat_mul_parameter.col_align_, mat_mul_parameter.deep_, mat_mul_parameter.deep_4_, + mat_mul_parameter.deep_16_, mat_mul_parameter.deep_align_, mat_mul_parameter.batch, mat_mul_parameter.a_const_, + mat_mul_parameter.b_const_, mat_mul_parameter.axis_, mat_mul_parameter.matmul_type_); +} + void NNaclFp32Serializer::CodeStruct(const std::string &name, const MicroMatmulParameter 
µ_matmul_parameter) { CodeBaseStruct("MicroMatmulParameter", name, micro_matmul_parameter.act_type_, micro_matmul_parameter.thread_num_, micro_matmul_parameter.row_, micro_matmul_parameter.col_, @@ -102,18 +134,41 @@ void NNaclFp32Serializer::CodeStruct(const std::string &name, const ScaleStruct scale_struct.outer_size_, scale_struct.inner_size_); } +void NNaclFp32Serializer::CodeStruct(const std::string &name, const ScaleStruct &scale_struct, + const ScaleDynamicParameter &dynamic_scale_param) { + CodeBaseStruct("ScaleStruct", name, "{}", scale_struct.axis_, scale_struct.data_type_, + dynamic_scale_param.axis_size_, dynamic_scale_param.outer_size_, + dynamic_scale_param.inner_size_); +} + void NNaclFp32Serializer::CodeStruct(const std::string &name, const SliceParameter &slice_parameter) { CodeBaseStruct("SliceParameter", name, slice_parameter.op_parameter_, ToString(slice_parameter.shape_), ToString(slice_parameter.begin_), ToString(slice_parameter.end_), ToString(slice_parameter.size_), "{0}", slice_parameter.param_length_); } +void NNaclFp32Serializer::CodeStruct(const std::string &name, const SliceParameter &slice_parameter, + const SliceDynamicParameter &dynamic_slice_param) { + CodeBaseStruct("SliceParameter", name, slice_parameter.op_parameter_, dynamic_slice_param.shape_, + ToString(slice_parameter.begin_), dynamic_slice_param.end_, dynamic_slice_param.size_, "{0}", + slice_parameter.param_length_); +} + void NNaclFp32Serializer::CodeStruct(const std::string &name, const SplitParameter &split_parameter) { CodeBaseStruct("SplitParameter", name, split_parameter.op_parameter_, split_parameter.num_split_, "split_sizes", split_parameter.split_dim_, ToString(split_parameter.strides_), "{0}", split_parameter.n_dims_, split_parameter.split_count_); } +void NNaclFp32Serializer::CodeStruct(const std::string &name, const SplitParameter &split_parameter, + const SplitDynamicParameter &dynamic_split_param) { + CodeArray("split_sizes", split_parameter.split_sizes_, split_parameter.num_split_, false); + CodeBaseStruct("SplitParameter", name, split_parameter.op_parameter_, split_parameter.num_split_, nullptr, + split_parameter.split_dim_, dynamic_split_param.strides_, "{0}", split_parameter.n_dims_, + dynamic_split_param.split_count_); + code << " " << name << ".split_sizes_ = split_sizes;\n"; +} + void NNaclFp32Serializer::CodeStruct(const std::string &name, const TileParameter &tile_parameter) { CodeBaseStruct("TileParameter", name, tile_parameter.op_parameter_, ToString(tile_parameter.multiples_), ToString(tile_parameter.in_shape_), ToString(tile_parameter.out_shape_), @@ -127,12 +182,32 @@ void NNaclFp32Serializer::CodeStruct(const std::string &name, const TransposePar ToString(transpose_parameter.out_strides_), transpose_parameter.num_axes_, transpose_parameter.data_num_); } +void NNaclFp32Serializer::CodeStruct(const std::string &name, const TransposeParameter &transpose_param, + const TransposeDynamicParameter &dynamic_transpose_param) { + CodeBaseStruct("TransposeParameter", name, transpose_param.op_parameter_, ToString(transpose_param.perm_), + transpose_param.perm_size_, transpose_param.conjugate_, dynamic_transpose_param.strides_, + dynamic_transpose_param.out_strides_, transpose_param.num_axes_, + dynamic_transpose_param.data_num_); +} + void NNaclFp32Serializer::CodeStruct(const std::string &name, const LstmParameter &lstm_parameter) { CodeBaseStruct("LstmParameter", name, lstm_parameter.op_parameter_, lstm_parameter.input_size_, - lstm_parameter.hidden_size_, 
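To make the dynamic overloads concrete: for a hypothetical two-way split over a tensor whose outer dimension is dynamic, the SplitParameter overload above would emit text along these lines (field values are illustrative, the op_parameter_ member is elided, and the split_count_ expression is shown already simplified):

  int split_sizes[2] = {4, 4};
  SplitParameter split_param = {{0}, 2, NULL, 2, {1 * 8 * 4, 1 * 8, 1, }, {0}, 3, shape_var[0] * 4};
  split_param.split_sizes_ = split_sizes;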
lstm_parameter.project_size_, lstm_parameter.seq_len_, - lstm_parameter.batch_, lstm_parameter.output_step_, lstm_parameter.bidirectional_, - lstm_parameter.zoneout_cell_, lstm_parameter.zoneout_hidden_, lstm_parameter.input_row_align_, - lstm_parameter.input_col_align_, lstm_parameter.state_row_align_, lstm_parameter.state_col_align_); + lstm_parameter.hidden_size_, lstm_parameter.project_size_, lstm_parameter.output_size_, + lstm_parameter.seq_len_, lstm_parameter.batch_, lstm_parameter.output_step_, + lstm_parameter.bidirectional_, lstm_parameter.zoneout_cell_, lstm_parameter.zoneout_hidden_, + lstm_parameter.input_row_align_, lstm_parameter.input_col_align_, lstm_parameter.state_row_align_, + lstm_parameter.state_col_align_, lstm_parameter.proj_col_align_, lstm_parameter.has_bias_); +} + +void NNaclFp32Serializer::CodeStruct(const std::string &name, const LstmParameter &lstm_parameter, + const DynamicLstmParameter &dynamic_lstm_param) { + CodeBaseStruct("LstmParameter", name, lstm_parameter.op_parameter_, lstm_parameter.input_size_, + lstm_parameter.hidden_size_, lstm_parameter.project_size_, lstm_parameter.output_size_, + dynamic_lstm_param.seq_len_, dynamic_lstm_param.batch_, dynamic_lstm_param.output_step_, + lstm_parameter.bidirectional_, lstm_parameter.zoneout_cell_, lstm_parameter.zoneout_hidden_, + dynamic_lstm_param.input_row_align_, lstm_parameter.input_col_align_, + dynamic_lstm_param.state_row_align_, lstm_parameter.state_col_align_, lstm_parameter.proj_col_align_, + lstm_parameter.has_bias_); } void NNaclFp32Serializer::CodeStruct(const std::string &name, const DeQuantArg &de_quant_arg) { @@ -165,6 +240,17 @@ void NNaclFp32Serializer::CodeStruct(const std::string &name, const StridedSlice strided_slice_parameter.newAxisMask_, strided_slice_parameter.shrinkAxisMask_); } +void NNaclFp32Serializer::CodeStruct(const std::string &name, const StridedSliceParameter &strided_slice_parameter, + const StridedSliceDynamicParameter &dynamic_strided_slice_param) { + CodeBaseStruct("StridedSliceParameter", name, strided_slice_parameter.op_parameter_, + ToString(strided_slice_parameter.begins_), ToString(strided_slice_parameter.ends_), + ToString(strided_slice_parameter.strides_), strided_slice_parameter.isScale, + strided_slice_parameter.in_shape_length_, dynamic_strided_slice_param.in_shape_, + strided_slice_parameter.num_axes_, strided_slice_parameter.data_type, + strided_slice_parameter.begins_mask_, strided_slice_parameter.ellipsisMask_, + strided_slice_parameter.newAxisMask_, strided_slice_parameter.shrinkAxisMask_); +} + void NNaclFp32Serializer::CodeStruct(const std::string &name, const ArithmeticWrapperInfo &arithmetic_wrapper_info) { CodeBaseStruct("ArithmeticWrapperInfo", name, arithmetic_wrapper_info.offset0_, arithmetic_wrapper_info.stride0_, arithmetic_wrapper_info.offset1_, arithmetic_wrapper_info.stride1_, @@ -207,6 +293,12 @@ void NNaclFp32Serializer::CodeStruct(const std::string &name, const BroadcastSha ToString(param.output_shape_), param.output_shape_size_); } +void NNaclFp32Serializer::CodeStruct(const std::string &name, const BroadcastShapeInfo &op_param, + const BroadcastDynamicShapeInfo &dynamic_param) { + CodeBaseStruct("BroadcastShapeInfo", name, dynamic_param.input_shape_, op_param.input_shape_size_, + dynamic_param.output_shape_, op_param.output_shape_size_); +} + void NNaclFp32Serializer::CodeStruct(const std::string &name, const CustomGruParameter &op_param) { CodeBaseStruct("CustomGruParameter", name, op_param.op_parameter_, op_param.num_step, 
op_param.batch_size, op_param.input_size, op_param.hidden_size); diff --git a/mindspore/lite/tools/converter/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h b/mindspore/lite/tools/converter/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h index d1435dea..2b1536c6 100644 --- a/mindspore/lite/tools/converter/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h +++ b/mindspore/lite/tools/converter/micro/coder/opcoders/serializers/nnacl_serializer/nnacl_fp32_serializer.h @@ -53,6 +53,15 @@ #include "nnacl/kernel/pooling.h" #include "nnacl/kernel/layer_norm.h" #include "nnacl/kernel/fill.h" +#include "coder/opcoders/nnacl/dynamic_parameter/dynamic_lstm_parameter.h" +#include "coder/opcoders/nnacl/dynamic_parameter/transpose_dynamic_parameter.h" +#include "coder/opcoders/nnacl/dynamic_parameter/slice_dynamic_parameter.h" +#include "coder/opcoders/nnacl/dynamic_parameter/split_dynamic_parameter.h" +#include "coder/opcoders/nnacl/dynamic_parameter/strided_slice_dynamic_parameter.h" +#include "coder/opcoders/nnacl/dynamic_parameter/scale_dynamic_parameter.h" +#include "coder/opcoders/nnacl/dynamic_parameter/conv_dynamic_parameter.h" +#include "coder/opcoders/nnacl/dynamic_parameter/arithmetic_dynamic_parameter.h" +#include "coder/opcoders/nnacl/dynamic_parameter/pooling_dynamic_parameter.h" namespace mindspore::lite::micro::nnacl { class NNaclFp32Serializer : public Serializer { @@ -66,6 +75,7 @@ class NNaclFp32Serializer : public Serializer { void CodeStruct(const std::string &name, const InstanceNormParameter ¶m); void CodeStruct(const std::string &name, const ArithmeticParameter &arithmetic_parameter); void CodeStruct(const std::string &name, const ConvParameter &conv_parameter); + void CodeStruct(const std::string &name, const MatMulParameter &mat_mul_parameter); void CodeStruct(const std::string &name, const MicroMatmulParameter µ_matmul_parameter); void CodeStruct(const std::string &name, const LstmParameter &lstm_parameter); void CodeStruct(const std::string &name, const ScaleStruct &scale_struct); @@ -89,6 +99,24 @@ class NNaclFp32Serializer : public Serializer { void CodeStruct(const std::string &name, const SlidingWindowParam ¶m); void CodeStruct(const std::string &name, const UnstackParameter ¶m); void CodeStruct(const std::string &name, const FillStruct ¶m); + void CodeStruct(const std::string &name, const TransposeParameter &transpose_param, + const TransposeDynamicParameter &dynamic_transpose_param); + void CodeStruct(const std::string &name, const SplitParameter &split_parameter, + const SplitDynamicParameter &dynamic_split_param); + void CodeStruct(const std::string &name, const BroadcastShapeInfo ¶m, + const BroadcastDynamicShapeInfo &dynamic_param); + void CodeStruct(const std::string &name, const LstmParameter &lstm_param, + const DynamicLstmParameter &dynamic_lstm_param); + void CodeStruct(const std::string &name, const SliceParameter &slice_parameter, + const SliceDynamicParameter &dynamic_slice_param); + void CodeStruct(const std::string &name, const StridedSliceParameter &strided_slice_parameter, + const StridedSliceDynamicParameter &dynamic_strided_slice_param); + void CodeStruct(const std::string &name, const ScaleStruct &scale_struct, + const ScaleDynamicParameter &dynamic_scale_param); + void CodeStruct(const std::string &name, const ConvParameter &conv_parameter, + const ConvDynamicParameter &dynamic_conv_param); + void CodeStruct(const std::string &name, const PoolingComputeParam &pooling_compute, + 
const PoolingDynamicParameter &dynamic_pooling_param); void CodeStruct(const std::string &name, const int *list, int size); void CodeArrayStruct(const std::string &name, TensorC *tensorC, std::vector tensor); diff --git a/mindspore/lite/tools/converter/micro/coder/session.cc b/mindspore/lite/tools/converter/micro/coder/session.cc index 55df7a22..374f662d 100644 --- a/mindspore/lite/tools/converter/micro/coder/session.cc +++ b/mindspore/lite/tools/converter/micro/coder/session.cc @@ -75,7 +75,10 @@ int CoderSession::PassArgsToContext(const std::string &model_name) { context_->set_total_buffer_size(final_total_size); context_->set_graph_inputs(coder_graph_->input_tensors()); context_->set_graph_outputs(coder_graph_->output_tensors()); - if (Configurator::GetInstance()->debug_mode()) { + context_->set_shape_info_container(&shape_info_container_); + context_->set_dynamic_mem_manager(&dynamic_mem_manager_); + Configurator *config = Configurator::GetInstance(); + if (config->debug_mode()) { std::vector blocks; blocks = AddDumpDataInfo(context_->code_blocks(), op_coders_); if (blocks.size() == 0) { @@ -100,7 +103,16 @@ int CoderSession::Preprocess() { Configurator::GetInstance()->changeable_weights_name()); MS_CHECK_RET_CODE(ret, "assign memory failed"); - // prepare, init model parameters + if (dynamic_) { + auto config = Configurator::GetInstance(); + MS_CHECK_TRUE_MSG(config != nullptr, RET_NULL_PTR, "Config is a nullptr."); + ret = shape_info_container_.Init(op_coders_, graph_inputs_shape_infos_); + MS_CHECK_TRUE_MSG(ret == RET_OK, ret, "Init ShapeInfoContainer failed."); + auto outputs = coder_graph_->output_tensors(); + ret = dynamic_mem_manager_.AllocDynamicMem(op_coders_, inputs, outputs, &shape_info_container_); + MS_CHECK_TRUE_MSG(ret == RET_OK, ret, "DynamicMemManager AllocDynamicMem failed."); + } + // 2. 
prepare, init model parameters for (const auto &op_coder : op_coders_) { MS_CHECK_PTR(op_coder); MS_LOG(DEBUG) << "prepare: " << op_coder->name(); @@ -133,7 +145,7 @@ int CoderSession::Run(const std::string &model_name) { ret = PassArgsToContext(model_name); MS_CHECK_RET_CODE(ret, "PassArgsToContext failed"); MS_LOG(INFO) << "run opcoders success"; - return RET_OK; + return ret; } int CoderSession::GenerateCode() { @@ -161,6 +173,9 @@ int CoderSession::Init(const void *content, int size, const int model_index, boo context_ = std::make_unique(model_index); context_->set_end_flag(end_flag); enable_fp16_ = enable_fp16; + Configurator *config = Configurator::GetInstance(); + MS_CHECK_TRUE_MSG(config != nullptr, RET_NULL_PTR, "Config is a nullptr."); + dynamic_ = !config->graph_inputs_shape_infos().empty(); MS_LOG(INFO) << "CoderSession::Init done"; return RET_OK; } @@ -227,6 +242,7 @@ int CoderSession::InitTensorsRef() { } } tensor->set_ref_count(refcount); + tensor->set_init_ref_count(refcount); } return RET_OK; } @@ -325,6 +341,7 @@ int CoderSession::CreateOpCoders() { .input_indices(input_indices) .output_indices(output_indices) .is_builtin_custom(is_built_in_custom_op) + .is_dynamic(dynamic_) .build(schema_version_); if (op_coder == nullptr) { coder_graph_->DumpUnSupportLayer(code_target); @@ -348,6 +365,20 @@ int CoderSession::CompileGraph() { MS_CHECK_RET_CODE(InitCodeGraph(), "InitGraphInOutTensors failed"); MS_CHECK_RET_CODE(CreateOpCoders(), "CreateOpCoders failed!"); MS_CHECK_RET_CODE(InitTensorsRef(), "InitTensorsRefcount failed!"); + if (dynamic_) { + Configurator::GetInstance()->set_dynamic_shape(true); + std::vector inputs = coder_graph_->input_tensors(); + auto &graph_inputs_shape_infos = Configurator::GetInstance()->graph_inputs_shape_infos(); + MS_CHECK_TRUE_MSG(inputs.size() == graph_inputs_shape_infos.size(), RET_ERROR, + "Config graph_inputs_shape's num cannot match."); + for (size_t i = 0; i < inputs.size(); ++i) { + graph_inputs_shape_infos_[inputs[i]] = graph_inputs_shape_infos[i]; + } + } + for (auto &op_coder : op_coders_) { + op_coder->set_shape_info_container(&shape_info_container_); + op_coder->set_dynamic_mem_manager(&dynamic_mem_manager_); + } return RET_OK; } CoderSession::~CoderSession() { allocator_->Free(); } diff --git a/mindspore/lite/tools/converter/micro/coder/session.h b/mindspore/lite/tools/converter/micro/coder/session.h index 98a8d008..452e3245 100644 --- a/mindspore/lite/tools/converter/micro/coder/session.h +++ b/mindspore/lite/tools/converter/micro/coder/session.h @@ -65,6 +65,10 @@ class CoderSession { private: int schema_version_ = SCHEMA_VERSION::SCHEMA_CUR; bool enable_fp16_{false}; + bool dynamic_{false}; + DynamicMemManager dynamic_mem_manager_; + ShapeInfoContainer shape_info_container_; + std::map>> graph_inputs_shape_infos_; }; } // namespace mindspore::lite::micro #endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_SESSION_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/shape_info_container.cc b/mindspore/lite/tools/converter/micro/coder/shape_info_container.cc new file mode 100644 index 00000000..c914be6c --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/shape_info_container.cc @@ -0,0 +1,131 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "coder/shape_info_container.h" +#include "src/litert/infer_manager.h" +#include "coder/opcoders/op_coder.h" +#include "coder/utils/coder_utils.h" +#include "tools/common/string_util.h" + +namespace mindspore::lite::micro { +int ShapeInfoContainer::Init(const std::vector> &nodes_coder, + const std::map>> &graph_inputs) { + MS_CHECK_TRUE_MSG(!graph_inputs.empty(), RET_ERROR, "Cannot get graph_inputs's shape-info"); + auto scene_num = graph_inputs.begin()->second.size(); + for (const auto &item : graph_inputs) { + MS_CHECK_TRUE_MSG(item.first, RET_NULL_PTR, "Find a nullptr in graph_inputs"); + MS_CHECK_TRUE_MSG(item.second.size() == scene_num, RET_ERROR, "Graph inputs are invalid."); + } + var_tensor_shapes_.insert(graph_inputs.begin(), graph_inputs.end()); + for (size_t i = 0; i < scene_num; ++i) { + for (const auto &item : graph_inputs) { + item.first->set_shape(item.second[i]); + } + for (const auto &node_coder : nodes_coder) { + auto in_tensors = node_coder->input_tensors(); + auto out_tensors = node_coder->output_tensors(); + auto op_param = node_coder->get_parameter(); + MS_CHECK_TRUE_MSG(op_param, RET_NULL_PTR, "NodeCoder's op_param is a nullptr."); + auto node = node_coder->node(); + MS_CHECK_TRUE_MSG(node, RET_NULL_PTR, "NodeCoder's node is a nullptr."); + auto prim = node->primitive_; + auto ret = DoInferShape(in_tensors, out_tensors, op_param, prim); + MS_CHECK_TRUE_MSG(ret == RET_OK, ret, "ShapeInfoContainer Init failed."); + } + } + auto ret = DetermineShapeVarInfos(); + MS_CHECK_TRUE_MSG(ret == RET_OK, ret, "DetermineShapeVarInfos failed."); + return RET_OK; +} + +int ShapeInfoContainer::DoInferShape(const std::vector &in_tensors, std::vector &out_tensors, + OpParameter *op_param, const void *primitive) { + auto ret = KernelInferShape(in_tensors, out_tensors, primitive, {}, lite::SCHEMA_CUR); + if (ret == lite::RET_NOT_SUPPORT) { + ret = KernelInferShape(in_tensors, out_tensors, op_param); + } + if (ret != RET_OK) { + MS_LOG(ERROR) << "Infer shape failed."; + return ret; + } + for (const auto out_tensor : out_tensors) { + var_tensor_shapes_[out_tensor].push_back(out_tensor->shape()); + } + return RET_OK; +} + +int ShapeInfoContainer::DetermineShapeVarInfos() { + MS_CHECK_TRUE_MSG(kShapePrefixName, RET_NULL_PTR, "kShapePrefixName is a nullptr."); + int index = 0; + for (const auto &item : var_tensor_shapes_) { + auto &tensor = item.first; + auto &shapes = item.second; + MS_CHECK_TRUE_MSG(!shapes.empty(), RET_ERROR, "Cannot get some tensor's shape."); + auto shape = shapes.front(); + auto dims = shape.size(); + auto is_same_dim = + std::all_of(shapes.begin(), shapes.end(), [dims](const std::vector &item) { return item.size() == dims; }); + MS_CHECK_TRUE_MSG(is_same_dim, RET_ERROR, "Tensor's shape-dims-num are not same."); + std::vector shape_symbols; + for (size_t i = 0; i < dims; ++i) { + int dim = shape[i]; + std::vector real_nums; + auto is_same_pos = + std::all_of(shapes.begin(), shapes.end(), [dim, i](const std::vector &item) { return item[i] == dim; }); + if (is_same_pos) { + shape_symbols.push_back(std::to_string(dim)); + continue; + } + 
(void)std::transform(shapes.begin(), shapes.end(), std::back_inserter(real_nums), + [i](const std::vector &item) { return item[i]; }); + std::string shape_symbol; + for (const auto &shape_to_num : shape_to_nums_) { + if (shape_to_num.second == real_nums) { + shape_symbol = shape_to_num.first; + break; + } + } + if (shape_symbol.empty()) { + for (size_t scene_index = 0; scene_index < real_nums.size(); ++scene_index) { + shapes_whole_scenes_[scene_index].push_back(real_nums[scene_index]); + } + shape_symbol = std::string(kShapePrefixName) + "[" + std::to_string(index++) + "]"; + shape_to_nums_[shape_symbol] = real_nums; + } + shape_symbols.push_back(shape_symbol); + } + shape_templates_[tensor] = shape_symbols; + } + return RET_OK; +} + +std::vector ShapeInfoContainer::GetTemplateShape(const Tensor *tensor) const { + if (shape_templates_.find(tensor) == shape_templates_.end()) { + return {}; + } + return shape_templates_.at(tensor); +} + +std::vector ShapeInfoContainer::GetRealNums(const std::string &shape_var) const { + if (IsNumber(shape_var)) { + return {std::stoi(shape_var)}; + } + if (shape_to_nums_.find(shape_var) == shape_to_nums_.end()) { + return {}; + } + return shape_to_nums_.at(shape_var); +} +} // namespace mindspore::lite::micro diff --git a/mindspore/lite/tools/converter/micro/coder/shape_info_container.h b/mindspore/lite/tools/converter/micro/coder/shape_info_container.h new file mode 100644 index 00000000..9268b249 --- /dev/null +++ b/mindspore/lite/tools/converter/micro/coder/shape_info_container.h @@ -0,0 +1,59 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
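The symbol-assignment rule in DetermineShapeVarInfos() is easiest to see in isolation. A standalone sketch with two hypothetical input scenes and a made-up prefix "shape_var" standing in for kShapePrefixName: dimensions that agree across every scene stay literal, the rest get an indexed symbol.

#include <iostream>
#include <string>
#include <vector>

int main() {
  // two hypothetical inference scenes recorded for the same tensor
  std::vector<std::vector<int>> scenes = {{1, 128, 14}, {1, 256, 14}};
  std::vector<std::string> symbols;
  int index = 0;
  for (size_t d = 0; d < scenes[0].size(); ++d) {
    bool same = true;
    for (const auto &s : scenes) {
      same = same && (s[d] == scenes[0][d]);
    }
    symbols.push_back(same ? std::to_string(scenes[0][d])
                           : "shape_var[" + std::to_string(index++) + "]");
  }
  for (const auto &s : symbols) {
    std::cout << s << " ";  // prints: 1 shape_var[0] 14
  }
  std::cout << std::endl;
  return 0;
}

The resulting template {"1", "shape_var[0]", "14"} is what GetTemplateShape() later hands to the dynamic op coders.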
+ */ + +#ifndef MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_SHAPE_INFO_CONTAINER_H_ +#define MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_SHAPE_INFO_CONTAINER_H_ + +#include +#include +#include +#include "tools/converter/micro/coder/config.h" +#include "include/model.h" +#include "src/tensor.h" +#include "nnacl/op_base.h" + +namespace mindspore::lite::micro { +class OperatorCoder; +class ShapeInfoContainer { + public: + ShapeInfoContainer() = default; + ~ShapeInfoContainer() = default; + + int Init(const std::vector> &nodes_coder, + const std::map>> &graph_inputs); + + const std::map>> &GetVarTensorInfos() const { return var_tensor_shapes_; } + + std::vector GetTemplateShape(const Tensor *tensor) const; + + const std::map> &GetWholeTemplateShape() { return shape_templates_; } + + std::vector GetRealNums(const std::string &shape_var) const; + + const std::map> &GetShapesWholeScenes() const { return shapes_whole_scenes_; } + + private: + int DoInferShape(const std::vector &in_tensors, std::vector &out_tensors, OpParameter *op_param, + const void *primitive); + int DetermineShapeVarInfos(); + std::map>> var_tensor_shapes_; + std::map> shape_templates_; + std::map> shape_to_nums_; + std::map> shapes_whole_scenes_; + Model *model_{nullptr}; +}; +} // namespace mindspore::lite::micro +#endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_SHAPE_INFO_CONTAINER_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/utils/coder_utils.cc b/mindspore/lite/tools/converter/micro/coder/utils/coder_utils.cc index c86a967d..a4c15c83 100644 --- a/mindspore/lite/tools/converter/micro/coder/utils/coder_utils.cc +++ b/mindspore/lite/tools/converter/micro/coder/utils/coder_utils.cc @@ -1,5 +1,5 @@ /** - * Copyright 2021-2022 Huawei Technologies Co., Ltd + * Copyright 2021 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,6 +22,7 @@ #include "tools/converter/micro/coder/log.h" #include "tools/converter/micro/coder/utils/type_cast.h" #include "tools/converter/micro/coder/allocator/allocator.h" +#include "tools/common/string_util.h" namespace mindspore::lite::micro { bool CheckConstantTensor(const Tensor *const tensor) { @@ -145,4 +146,36 @@ std::vector SplitString(std::string str, const std::string &pattern } return results; } + +std::string AccumulateShape(const std::vector &shape_template, size_t start_index, size_t end_index) { + int64_t const_part = 1; + std::string non_const_part; + for (size_t i = start_index; i < end_index; ++i) { + auto item = shape_template[i]; + if (IsNumber(item)) { + const_part *= std::stoi(item); + } else { + if (!non_const_part.empty()) { + non_const_part += " * "; + } + non_const_part += item; + } + } + std::string accumulate_shape = std::to_string(const_part); + if (!non_const_part.empty()) { + accumulate_shape += " * " + non_const_part; + } + return accumulate_shape; +} + +std::string GetTensorAddr(lite::Tensor *tensor, bool is_const, DynamicMemManager *dynamic_mem_manager, + MemoryAllocator *allocator) { + if (is_const) { + return allocator->GetRuntimeAddr(tensor, true); + } + if (dynamic_mem_manager == nullptr) { + return allocator->GetRuntimeAddr(tensor); + } + return dynamic_mem_manager->GetVarTensorAddr(tensor); +} } // namespace mindspore::lite::micro diff --git a/mindspore/lite/tools/converter/micro/coder/utils/coder_utils.h b/mindspore/lite/tools/converter/micro/coder/utils/coder_utils.h index eabae70e..70a973cb 100644 --- a/mindspore/lite/tools/converter/micro/coder/utils/coder_utils.h +++ b/mindspore/lite/tools/converter/micro/coder/utils/coder_utils.h @@ -41,5 +41,10 @@ std::string ArrayToString(std::vector array) { std::for_each(array.begin(), array.end(), [&result](const T &t) { result += std::to_string(t) + ", "; }); return "{" + result + "}"; } + +std::string AccumulateShape(const std::vector &shape_template, size_t start_index, size_t end_index); + +std::string GetTensorAddr(lite::Tensor *tensor, bool is_const, DynamicMemManager *dynamic_mem_manager, + MemoryAllocator *allocator); } // namespace mindspore::lite::micro #endif // MINDSPORE_LITE_TOOLS_CONVERTER_MICRO_CODER_UTILS_CODER_UTILS_H_ diff --git a/mindspore/lite/tools/converter/micro/coder/utils/type_cast.cc b/mindspore/lite/tools/converter/micro/coder/utils/type_cast.cc index 61b22bae..1d3c02a0 100644 --- a/mindspore/lite/tools/converter/micro/coder/utils/type_cast.cc +++ b/mindspore/lite/tools/converter/micro/coder/utils/type_cast.cc @@ -54,32 +54,30 @@ std::string EnumNameDataType(TypeId type) { std::string EnumNameMSDataType(TypeId type) { switch (type) { case kNumberTypeInt: - return "kMSDataTypeNumberTypeInt32"; + case kNumberTypeInt32: + return "OH_AI_DATATYPE_NUMBERTYPE_INT32"; case kNumberTypeInt8: - return "kMSDataTypeNumberTypeInt8"; + return "OH_AI_DATATYPE_NUMBERTYPE_INT8"; case kNumberTypeInt16: - return "kMSDataTypeNumberTypeInt16"; - case kNumberTypeInt32: - return "kMSDataTypeNumberTypeInt32"; + return "OH_AI_DATATYPE_NUMBERTYPE_INT16"; case kNumberTypeInt64: - return "kMSDataTypeNumberTypeUInt64"; + return "OH_AI_DATATYPE_NUMBERTYPE_INT64"; case kNumberTypeUInt: - return "kMSDataTypeNumberTypeUInt32"; + case kNumberTypeUInt32: + return "OH_AI_DATATYPE_NUMBERTYPE_UINT32"; case kNumberTypeUInt8: - return "kMSDataTypeNumberTypeUInt8"; + return "OH_AI_DATATYPE_NUMBERTYPE_UINT8"; case kNumberTypeUInt16: - return "kMSDataTypeNumberTypeUInt16"; - case kNumberTypeUInt32: - return 
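A small usage sketch for the AccumulateShape() helper above (assuming it is compiled inside the converter tree and linked against coder_utils; the template shape is hypothetical): constant dimensions are folded into one integer factor and symbolic dimensions are appended verbatim.

#include <iostream>
#include <string>
#include <vector>
#include "coder/utils/coder_utils.h"

int main() {
  std::vector<std::string> shape = {"2", "shape_var[0]", "4"};
  // constant dims fold to 8, the symbolic dim is kept: prints "8 * shape_var[0]"
  std::cout << mindspore::lite::micro::AccumulateShape(shape, 0, shape.size()) << std::endl;
  return 0;
}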
"kMSDataTypeNumberTypeUInt32"; + return "OH_AI_DATATYPE_NUMBERTYPE_UINT16"; case kNumberTypeFloat: case kNumberTypeFloat32: - return "kMSDataTypeNumberTypeFloat32"; + return "OH_AI_DATATYPE_NUMBERTYPE_FLOAT32"; case kNumberTypeFloat16: - return "kMSDataTypeNumberTypeFloat16"; + return "OH_AI_DATATYPE_NUMBERTYPE_FLOAT16"; case kNumberTypeFloat64: - return "kMSDataTypeNumberTypeFloat64"; + return "OH_AI_DATATYPE_NUMBERTYPE_FLOAT64"; case kTypeUnknown: - return "kMSDataTypeUnknown"; + return "OH_AI_DATATYPE_UNKNOWN"; default: return "unsupported"; } diff --git a/mindspore/lite/tools/converter/parser/third_party/third_party_model_parser.cc b/mindspore/lite/tools/converter/parser/third_party/third_party_model_parser.cc index 652db4af..a82feb07 100644 --- a/mindspore/lite/tools/converter/parser/third_party/third_party_model_parser.cc +++ b/mindspore/lite/tools/converter/parser/third_party/third_party_model_parser.cc @@ -62,7 +62,7 @@ STATUS ThirdPartyModelParser::InitConfig(const std::string &config_file) { MS_LOG(ERROR) << "Missing config file in converting third party model"; return RET_ERROR; } - auto ret = config_parser.ParseConfigFile(config_file); + auto ret = config_parser.ParseConfigFile(config_file, nullptr); if (ret != RET_OK) { MS_LOG(ERROR) << "Get third party model section from config file failed"; return RET_ERROR; diff --git a/mindspore/lite/tools/optimizer/fusion/tile_matmul_fusion.cc b/mindspore/lite/tools/optimizer/fusion/tile_matmul_fusion.cc new file mode 100644 index 00000000..4caef237 --- /dev/null +++ b/mindspore/lite/tools/optimizer/fusion/tile_matmul_fusion.cc @@ -0,0 +1,120 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#define USE_DEPRECATED_API +#include "tools/optimizer/fusion/tile_matmul_fusion.h" +#include +#include "tools/optimizer/common/gllo_utils.h" +#include "nnacl/op_base.h" +#include "tools/lite_exporter/fetch_content.h" +#include "ops/op_utils.h" +#include "ops/lite_ops.h" +#include "ops/fusion/tile_fusion.h" +#include "ops/fusion/mat_mul_fusion.h" + +namespace mindspore { +namespace opt { +bool TileMatMulFusion::CheckCanFuse(const FuncGraphPtr &func_graph, const AnfNodePtr &node) const { + auto tile_cnode = node->cast(); + MS_CHECK_TRUE_RET(tile_cnode != nullptr, false); + auto tile_primc = ops::GetOperator(tile_cnode->input(0)); + MS_CHECK_TRUE_RET(tile_primc != nullptr, false); + auto tile_prim_c = tile_primc->GetPrim(); + MS_CHECK_TRUE_RET(tile_prim_c != nullptr, false); + if (IsQuantParameterNode(tile_prim_c)) { + MS_LOG(INFO) << tile_primc->name() << " is quant node"; + return false; + } + auto manager = func_graph->manager(); + MS_CHECK_TRUE_RET(manager != nullptr, false); + auto node_users = manager->node_users()[tile_cnode]; + for (auto &node_user : node_users) { + auto post_node = node_user.first; + auto post_node_index = node_user.second; + if (!utils::isa(post_node) || !CheckPrimitiveType(post_node, prim::kPrimMatMulFusion) || + post_node_index != C2NUM) { + MS_LOG(INFO) << "The post node of tile must be matmul's matirxB."; + return false; + } + auto matmul_primc = ops::GetOperator(GetInputs(post_node).at(0)); + MS_CHECK_TRUE_RET(matmul_primc != nullptr, false); + auto matmul_prim_c = matmul_primc->GetPrim(); + MS_CHECK_TRUE_RET(matmul_prim_c != nullptr, false); + if (IsQuantParameterNode(matmul_prim_c)) { + MS_LOG(INFO) << matmul_prim_c->name() << " is quant node"; + return false; + } + } + + lite::DataInfo data_info; + auto status = lite::FetchConstData(tile_cnode, C2NUM, converter::kFmkTypeMs, &data_info, false); + MS_CHECK_TRUE_MSG(status == RET_OK, false, "Fetch tile_cnode third input's const data failed."); + if ((data_info.data_type_ != kNumberTypeInt32 && data_info.data_type_ != kNumberTypeInt) || + data_info.data_.size() / sizeof(int) < DIMENSION_2D) { + MS_LOG(INFO) << "Tile index data is invalid."; + return false; + } + auto data = reinterpret_cast(data_info.data_.data()); + int dim = static_cast(data_info.data_.size() / sizeof(int)); + for (int i = dim - C1NUM; i > dim - C3NUM; --i) { + if (data[i] != C1NUM) { + return false; + } + } + lite::DataInfo weights_info; + auto left_pre_node = tile_cnode->input(C1NUM); + if (left_pre_node->isa() || left_pre_node->isa()) { + status = lite::FetchConstData(tile_cnode, C1NUM, converter::kFmkTypeMs, &weights_info, false); + } else { + status = lite::FetchDataFromCNode(tile_cnode, C1NUM, &weights_info); + } + MS_CHECK_TRUE_RET(status == RET_OK, false); + MS_CHECK_TRUE_MSG(weights_info.shape_.size() == static_cast(dim), false, + "Tile_cnode second input's shape size is invalid."); + for (int i = 0; i < dim - C2NUM; i++) { + if (data[i] != C1NUM && weights_info.shape_[i] != C1NUM) { + return false; + } + } + return true; +} + +bool TileMatMulFusion::Run(const FuncGraphPtr &func_graph) { + MS_CHECK_TRUE_RET(func_graph != nullptr, false); + auto node_list = TopoSort(func_graph->get_return()); + for (auto &node : node_list) { + MS_CHECK_TRUE_RET(node != nullptr, false); + if (!utils::isa(node)) { + continue; + } + if (!CheckPrimitiveType(node, prim::kPrimTileFusion)) { + continue; + } + if (!CheckCanFuse(func_graph, node)) { + continue; + } + auto tile_cnode = node->cast(); + MS_CHECK_TRUE_RET(tile_cnode != nullptr, false); + 
auto left_pre_node = tile_cnode->input(SECOND_INPUT); + auto manage = func_graph->manager(); + MS_CHECK_TRUE_RET(manage != nullptr, false); + auto success = manage->Replace(tile_cnode, left_pre_node); + MS_CHECK_TRUE_MSG(success, false, "Replace old node failed."); + } + return true; +} +} // namespace opt +} // namespace mindspore diff --git a/mindspore/lite/tools/optimizer/fusion/tile_matmul_fusion.h b/mindspore/lite/tools/optimizer/fusion/tile_matmul_fusion.h new file mode 100644 index 00000000..280dc265 --- /dev/null +++ b/mindspore/lite/tools/optimizer/fusion/tile_matmul_fusion.h @@ -0,0 +1,37 @@ +/** + * Copyright 2023 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_TOOLS_OPTIMIZER_FUSION_TILE_MATMUL_FUSION_H_ +#define MINDSPORE_LITE_TOOLS_OPTIMIZER_FUSION_TILE_MATMUL_FUSION_H_ + +#include +#include "tools/optimizer/common/multiple_pattern_process_pass.h" +#include "utils/check_convert_utils.h" + +namespace mindspore { +namespace opt { +class TileMatMulFusion : public Pass { + public: + TileMatMulFusion() : Pass("TileMatMulFusion") {} + ~TileMatMulFusion() override = default; + bool Run(const FuncGraphPtr &func_graph) override; + + private: + bool CheckCanFuse(const FuncGraphPtr &func_graph, const AnfNodePtr &node) const; +}; +} // namespace opt +} // namespace mindspore +#endif // MINDSPORE_LITE_TOOLS_OPTIMIZER_FUSION_TILE_MATMUL_FUSION_H_ diff --git a/mindspore/python/mindspore/ops/operations/_grad_ops.py b/mindspore/python/mindspore/ops/operations/_grad_ops.py index 59c9c883..5714b832 100644 --- a/mindspore/python/mindspore/ops/operations/_grad_ops.py +++ b/mindspore/python/mindspore/ops/operations/_grad_ops.py @@ -1521,7 +1521,7 @@ class LSTMGrad(Primitive): """Computes the data and weight gradients of LSTM.""" @prim_attr_register - def __init__(self, input_size, hidden_size, num_layers, has_bias, bidirectional, dropout): + def __init__(self, input_size, hidden_size, num_layers, has_bias, bidirectional, dropout, proj_size=0): self.input_size = validator.check_positive_int(input_size, 'input_size', self.name) self.hidden_size = validator.check_positive_int(hidden_size, 'hidden_size', self.name) self.num_layers = validator.check_positive_int(num_layers, 'num_layers', self.name) @@ -1529,12 +1529,53 @@ class LSTMGrad(Primitive): self.bidirectional = validator.check_value_type('bidirectional', bidirectional, (bool,), self.name) self.dropout = validator.check_value_type("dropout", dropout, [float], self.name) self.dropout = validator.check_float_range(dropout, 0, 1, validator.INC_BOTH, 'dropout', self.name) + self.proj_size = validator.check_int_range(proj_size, 0, hidden_size, Rel.INC_LEFT, + 'proj_size', self.name) + if bidirectional: self.num_directions = 2 else: self.num_directions = 1 + def infer_shape(self, x_shape, hx_shape, cx_shape, w_shape, y_shape, hy_shape, cy_shape, dy_shape, dhy_shape, + dcy_shape, reserve_shape): + # dhy and dcy should be same shape + validator.check_equal_int(len(dhy_shape), 3, 
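In graph terms, the pass above deletes a Tile that only replicates MatMulFusion's matrixB along leading axes, presumably relying on MatMul's own batch broadcasting to reproduce the effect. A schematic of the supported pattern (shapes hypothetical):

// before:  weight(1, K, N) --Tile(multiples = {B, 1, 1})--> (B, K, N) --> MatMulFusion input 2
// after:   weight(1, K, N) ------------------------------------------> MatMulFusion input 2
// accepted because the last two multiples are 1 and every other tiled axis of the weight has size 1.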
"h_shape", self.name) + validator.check_equal_int(len(dhy_shape), len(dcy_shape), "h_shape", self.name) + if self.proj_size == 0: + validator.check_equal_int(dhy_shape[0], dcy_shape[0], "h_shape[0]", self.name) + validator.check_equal_int(dhy_shape[1], dcy_shape[1], "h_shape[1]", self.name) + validator.check_equal_int(dhy_shape[2], dcy_shape[2], "h_shape[2]", self.name) + + real_hidden_size = self.proj_size if self.proj_size > 0 else self.hidden_size + validator.check_int(dhy_shape[0], self.num_layers * self.num_directions, Rel.EQ, "h_shape[0]", self.name) + validator.check_equal_int(dhy_shape[2], real_hidden_size, "h_shape[2]", self.name) + + validator.check_equal_int(len(dy_shape), 3, "dy_shape", self.name) + validator.check_equal_int(dy_shape[1], dhy_shape[1], "dy[1]", self.name) + validator.check_int(dy_shape[2], real_hidden_size * self.num_directions, Rel.EQ, "dy[2]", self.name) + + dx_shape = (y_shape[0], y_shape[1], self.input_size) + dhx_shape = dhy_shape + dcx_shape = dcy_shape + weight_size = 0 + gate_size = 4 * self.hidden_size + for layer in range(self.num_layers): + for _ in range(self.num_directions): + input_layer_size = self.input_size if layer == 0 else self.hidden_size * self.num_directions + weight_size += gate_size * input_layer_size + weight_size += gate_size * real_hidden_size + if self.proj_size > 0: + weight_size += self.proj_size * self.hidden_size + if self.has_bias: + weight_size += gate_size + + return (dx_shape, dhx_shape, dcx_shape, (weight_size, 1, 1)) + + def infer_dtype(self, x_dtype, hx_dtype, cx_dtype, w_dtype, y_dtype, hy_dtype, cy_dtype, dy_dtype, dhy_dtype, + dcy_dtype, reserve_dtype): + return (dy_dtype, dy_dtype, dy_dtype, hx_dtype) class DynamicRNNGrad(Primitive): """Computes the input gradients of DynamicRNN.""" diff --git a/mindspore/python/mindspore/ops/operations/nn_ops.py b/mindspore/python/mindspore/ops/operations/nn_ops.py index 3a0eb3d6..8ae747be 100644 --- a/mindspore/python/mindspore/ops/operations/nn_ops.py +++ b/mindspore/python/mindspore/ops/operations/nn_ops.py @@ -4356,7 +4356,7 @@ class LSTM(Primitive): """ @prim_attr_register - def __init__(self, input_size, hidden_size, num_layers, has_bias, bidirectional, dropout): + def __init__(self, input_size, hidden_size, num_layers, has_bias, bidirectional, dropout, proj_size=0): """Initialize LSTM.""" self.input_size = validator.check_positive_int(input_size, "input_size", self.name) self.hidden_size = validator.check_positive_int(hidden_size, "hidden_size", self.name) @@ -4365,12 +4365,40 @@ class LSTM(Primitive): self.bidirectional = validator.check_value_type("bidirectional", bidirectional, (bool,), self.name) self.dropout = validator.check_value_type("dropout", dropout, [float], self.name) self.dropout = validator.check_float_range(dropout, 0, 1, validator.INC_BOTH, 'dropout', self.name) + self.proj_size = validator.check_int_range(proj_size, 0, hidden_size, validator.INC_LEFT, + 'proj_size', self.name) if bidirectional: self.num_directions = 2 else: self.num_directions = 1 + def infer_shape(self, x_shape, h_shape, c_shape, w_shape): + validator.check_equal_int(len(x_shape), 3, "x rank", self.name) + validator.check_equal_int(x_shape[2], self.input_size, "x[2]", self.name) + + # h and c should be same shape + validator.check_equal_int(len(h_shape), 3, "h rank", self.name) + if self.proj_size == 0: + validator.check("h_shape", h_shape, "c_shape", c_shape, Rel.EQ, self.name) + + real_hidden_size = self.proj_size if self.proj_size > 0 else self.hidden_size + validator.check_int(h_shape[0], 
self.num_layers * self.num_directions, Rel.EQ, "h[0]", self.name) + validator.check_equal_int(h_shape[1], x_shape[1], "h[1]", self.name) + validator.check_int(h_shape[2], real_hidden_size, Rel.EQ, "h[2]", self.name) + + y_shape = (x_shape[0], x_shape[1], real_hidden_size * self.num_directions) + + # set arbitrary shape for reserved space + reserved_shape = (1, 1) + state_shape = (1, 1) + return y_shape, h_shape, c_shape, reserved_shape, state_shape + + def infer_dtype(self, x_dtype, h_dtype, c_dtype, w_dtype): + args = {'x': x_dtype, 'h': h_dtype, 'c': c_dtype, 'w': w_dtype} + validator.check_tensors_dtypes_same_and_valid(args, (mstype.float32, mstype.float16), self.name) + return x_dtype, x_dtype, x_dtype, x_dtype, x_dtype + class SigmoidCrossEntropyWithLogits(Primitive): r"""